use crate::cache::Fingerprinter;
use crate::faer_ndarray::FaerEigh;
use crate::faer_ndarray::{FaerCholesky, fast_atb, fast_av};
use crate::matrix::{
DesignMatrix, EmbeddedColumnBlock, LinearOperator, SignedWeightsView, SymmetricMatrix,
dense_rowwise_kronecker,
};
use crate::pirls::{LinearInequalityConstraints, solve_newton_directionwith_lower_bounds};
use crate::resource::{DerivativeStorageMode, ResourcePolicy};
use crate::solver::active_set::{
project_stationarity_residual_on_constraint_cone, solve_quadratic_with_linear_constraints,
};
use crate::solver::estimate::reml::penalty_logdet::PenaltyPseudologdet;
use crate::solver::estimate::reml::unified::{
BlockCoupledOperator, ContractedPsiSecondOrder, ContractedPsiSecondOrderFn,
DenseSpectralOperator, DispersionHandling, DriftDerivResult, FixedDriftDerivFn,
HessianDerivativeProvider, HessianOperator, HyperCoord, HyperCoordDrift, HyperCoordPair,
HyperOperator, MatrixFreeSpdOperator, PenaltySubspaceTrace, ProjectedKktResidual,
StochasticTraceState, compute_block_penalty_logdet_derivs, exact_pseudo_logdet,
positive_eigenvalue_threshold, spectral_epsilon, spectral_regularize,
};
use crate::solver::estimate::{
EstimationError, FitGeometry, ensure_finite_scalar_estimation, validate_all_finite_estimation,
};
use crate::solver::persistent_warm_start::{
PersistentBlockInnerSummary, PersistentBlockWarmStartRecord, load_block_record,
store_block_record,
};
use crate::types::{RidgeDeterminantMode, RidgePolicy};
use faer::Side;
use ndarray::{Array1, Array2, ArrayView1, ArrayViewMut1, s};
use std::any::{Any, type_name};
use std::cell::RefCell;
use std::collections::{BTreeMap, BTreeSet, HashMap};
use std::ops::Range;
use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::{Arc, Mutex, OnceLock, Weak};
use thiserror::Error;
pub use crate::solver::estimate::reml::unified::{EvalMode, PseudoLogdetMode};
/// A penalty matrix that may be stored in Kronecker-factored form.
///
/// For tensor-product terms (e.g. time-varying survival covariates), the penalty
/// has the structure `S = left ⊗ right` (Kronecker product). Keeping this
/// factored avoids materializing (p_left × p_right)² dense entries and enables
/// exact log-determinant computation via `log|A ⊗ B| = n_B log|A| + n_A log|B|`.
///
/// Dense penalties are stored as-is. Callers that need a raw `Array2<f64>` can
/// call `to_dense()` (always returns an owned matrix) or `as_dense_cow()`
/// (borrows when the stored matrix is already dense, otherwise materializes).
#[derive(Clone, Debug)]
pub enum PenaltyMatrix {
/// Fully materialized square penalty.
Dense(Array2<f64>),
/// Kronecker-factored penalty `left ⊗ right`; its dimension is
/// `left.nrows() * right.nrows()`.
KroneckerFactored {
left: Array2<f64>,
right: Array2<f64>,
},
/// Block-local penalty: `local` is `block_dim × block_dim`, embedded at
/// `col_range` in the full parameter space of dimension `total_dim`.
/// Avoids materializing the full `total_dim × total_dim` matrix.
Blockwise {
local: Array2<f64>,
col_range: std::ops::Range<usize>,
total_dim: usize,
},
/// Wrapper assigning this penalty component to a user-visible precision
/// label. Components with the same label share one smoothing parameter.
Labeled {
label: String,
inner: Box<PenaltyMatrix>,
},
/// Wrapper fixing this penalty component at a physical log-precision.
/// Fixed components remain in the block-local physical penalty layout but
/// are removed from the REML outer coordinate vector.
Fixed {
log_lambda: f64,
inner: Box<PenaltyMatrix>,
},
}
impl PenaltyMatrix {
    /// Number of rows (= number of columns, since penalties are square).
    ///
    /// Wrapper variants report the dimension of the penalty they wrap.
    pub fn dim(&self) -> usize {
        match self {
            Self::Dense(m) => m.nrows(),
            Self::KroneckerFactored { left, right } => left.nrows() * right.nrows(),
            Self::Blockwise { total_dim, .. } => *total_dim,
            Self::Labeled { inner, .. } | Self::Fixed { inner, .. } => inner.dim(),
        }
    }

    /// Returns `(nrows, ncols)` like `Array2::dim()`.
    pub fn shape(&self) -> (usize, usize) {
        let d = self.dim();
        (d, d)
    }

    /// Materialize the full dense matrix as an owned `Array2<f64>`.
    ///
    /// `Blockwise` penalties are embedded into a zero matrix of size
    /// `total_dim × total_dim` at `col_range × col_range`.
    pub fn to_dense(&self) -> Array2<f64> {
        match self {
            Self::Dense(m) => m.clone(),
            Self::KroneckerFactored { left, right } => {
                crate::terms::construction::kronecker_product(left, right)
            }
            Self::Blockwise {
                local,
                col_range,
                total_dim,
            } => {
                let mut g = Array2::zeros((*total_dim, *total_dim));
                g.slice_mut(ndarray::s![
                    col_range.start..col_range.end,
                    col_range.start..col_range.end
                ])
                .assign(local);
                g
            }
            Self::Labeled { inner, .. } | Self::Fixed { inner, .. } => inner.to_dense(),
        }
    }

    /// Borrow the dense matrix when one is stored, otherwise materialize.
    ///
    /// `Labeled` / `Fixed` wrappers forward to the wrapped penalty, so a
    /// wrapped `Dense` is borrowed instead of being cloned.
    pub fn as_dense_cow(&self) -> std::borrow::Cow<'_, Array2<f64>> {
        match self {
            Self::Dense(m) => std::borrow::Cow::Borrowed(m),
            Self::KroneckerFactored { .. } | Self::Blockwise { .. } => {
                std::borrow::Cow::Owned(self.to_dense())
            }
            Self::Labeled { inner, .. } | Self::Fixed { inner, .. } => inner.as_dense_cow(),
        }
    }

    /// Returns a reference to the inner matrix if this is a `Dense` variant
    /// (looking through a `Fixed` wrapper); `None` for every other variant,
    /// including `Labeled`.
    pub fn as_dense_ref(&self) -> Option<&Array2<f64>> {
        match self {
            Self::Dense(m) => Some(m),
            Self::Fixed { inner, .. } => inner.as_dense_ref(),
            Self::KroneckerFactored { .. } | Self::Blockwise { .. } | Self::Labeled { .. } => None,
        }
    }

    /// Wrap this penalty in a `Labeled` node carrying `label`.
    pub fn with_precision_label(self, label: impl Into<String>) -> Self {
        Self::Labeled {
            label: label.into(),
            inner: Box::new(self),
        }
    }

    /// Label of the outermost `Labeled` wrapper, if this node is one.
    ///
    /// Only the outermost node is inspected: a `Fixed` wrapper around a
    /// `Labeled` penalty reports `None`.
    pub fn precision_label(&self) -> Option<&str> {
        match self {
            Self::Labeled { label, .. } => Some(label.as_str()),
            Self::Dense(_)
            | Self::KroneckerFactored { .. }
            | Self::Blockwise { .. }
            | Self::Fixed { .. } => None,
        }
    }

    /// Wrap this penalty in a `Fixed` node pinned at `log_lambda`.
    pub fn with_fixed_log_lambda(self, log_lambda: f64) -> Self {
        Self::Fixed {
            log_lambda,
            inner: Box::new(self),
        }
    }

    /// Fixed log-precision of this penalty, looking through `Labeled`
    /// wrappers; `None` when the penalty is not fixed.
    pub fn fixed_log_lambda(&self) -> Option<f64> {
        match self {
            Self::Fixed { log_lambda, .. } => Some(*log_lambda),
            Self::Labeled { inner, .. } => inner.fixed_log_lambda(),
            Self::Dense(_) | Self::KroneckerFactored { .. } | Self::Blockwise { .. } => None,
        }
    }

    /// Compute `S · v`.
    ///
    /// For the Kronecker-factored variant this uses the row-major vec trick
    ///
    /// ```text
    /// (A ⊗ B) vec(V) = vec(A V Bᵀ),   V = reshape(v, (p_left, p_right))
    /// ```
    ///
    /// where `vec` flattens row-major, i.e. `v[i_left * p_right + i_right]`
    /// is `V[i_left, i_right]`. This is the same index convention that
    /// `add_scaled_to` and `add_scaled_diag_to` use for `left ⊗ right`.
    ///
    /// # Panics
    ///
    /// Panics if `v.len()` is incompatible with the penalty dimension.
    pub fn dot(&self, v: &Array1<f64>) -> Array1<f64> {
        match self {
            Self::Dense(m) => m.dot(v),
            Self::KroneckerFactored { left, right } => {
                let p_left = left.nrows();
                let p_right = right.nrows();
                assert_eq!(
                    v.len(),
                    p_left * p_right,
                    "PenaltyMatrix::dot: vector length must equal p_left * p_right"
                );
                // Borrow the data when it is contiguous; otherwise copy it in
                // logical order so a strided `v` does not panic.
                let owned: Vec<f64>;
                let flat: &[f64] = match v.as_slice() {
                    Some(contiguous) => contiguous,
                    None => {
                        owned = v.to_vec();
                        owned.as_slice()
                    }
                };
                // Row-major reshape: row index is the `left` factor index.
                let v_mat = ndarray::ArrayView2::from_shape((p_left, p_right), flat)
                    .expect("length was checked against p_left * p_right");
                // result = A V Bᵀ, flattened row-major.
                let av = left.dot(&v_mat);
                let avb = av.dot(&right.t());
                Array1::from_iter(avb.iter().copied())
            }
            Self::Blockwise {
                local,
                col_range,
                total_dim,
            } => {
                let mut out = Array1::zeros(*total_dim);
                let v_block = v.slice(ndarray::s![col_range.clone()]);
                let result_block = local.dot(&v_block);
                out.slice_mut(ndarray::s![col_range.clone()])
                    .assign(&result_block);
                out
            }
            Self::Labeled { inner, .. } | Self::Fixed { inner, .. } => inner.dot(v),
        }
    }

    /// Add `λ * self` to a mutable dense accumulator.
    ///
    /// The Kronecker-factored path writes entry
    /// `(i1 * p_right + i2, j1 * p_right + j2) += λ · left[i1, j1] · right[i2, j2]`
    /// and skips exact-zero entries of `left`.
    pub fn add_scaled_to(&self, lambda: f64, target: &mut Array2<f64>) {
        match self {
            Self::Dense(m) => {
                target.scaled_add(lambda, m);
            }
            Self::KroneckerFactored { left, right } => {
                let p_left = left.nrows();
                let p_right = right.nrows();
                for i1 in 0..p_left {
                    for j1 in 0..p_left {
                        let a_ij = left[[i1, j1]];
                        if a_ij == 0.0 {
                            continue;
                        }
                        let scaled_a = lambda * a_ij;
                        for i2 in 0..p_right {
                            let row = i1 * p_right + i2;
                            for j2 in 0..p_right {
                                let col = j1 * p_right + j2;
                                target[[row, col]] += scaled_a * right[[i2, j2]];
                            }
                        }
                    }
                }
            }
            Self::Blockwise {
                local, col_range, ..
            } => {
                target
                    .slice_mut(ndarray::s![col_range.clone(), col_range.clone()])
                    .scaled_add(lambda, local);
            }
            Self::Labeled { inner, .. } | Self::Fixed { inner, .. } => {
                inner.add_scaled_to(lambda, target)
            }
        }
    }

    /// Add `λ * diag(self)` to a mutable diagonal accumulator.
    ///
    /// The `Dense` path only touches the first `min(nrows, target.len())`
    /// entries; the Kronecker path asserts that `target` has exactly
    /// `p_left * p_right` entries.
    pub fn add_scaled_diag_to(&self, lambda: f64, target: &mut Array1<f64>) {
        match self {
            Self::Dense(m) => {
                let p = m.nrows().min(target.len());
                for j in 0..p {
                    target[j] += lambda * m[[j, j]];
                }
            }
            Self::KroneckerFactored { left, right } => {
                let p_left = left.nrows();
                let p_right = right.nrows();
                assert_eq!(target.len(), p_left * p_right);
                for i_left in 0..p_left {
                    let left_diag = left[[i_left, i_left]];
                    if left_diag == 0.0 {
                        continue;
                    }
                    let scaled_left = lambda * left_diag;
                    for i_right in 0..p_right {
                        target[i_left * p_right + i_right] +=
                            scaled_left * right[[i_right, i_right]];
                    }
                }
            }
            Self::Blockwise {
                local, col_range, ..
            } => {
                let width = local.nrows().min(col_range.len());
                for local_idx in 0..width {
                    target[col_range.start + local_idx] += lambda * local[[local_idx, local_idx]];
                }
            }
            Self::Labeled { inner, .. } | Self::Fixed { inner, .. } => {
                inner.add_scaled_diag_to(lambda, target)
            }
        }
    }

    /// Compute the quadratic form `β' S β`.
    ///
    /// `Blockwise` penalties only read the `col_range` slice of `beta`.
    pub fn quadratic_form(&self, beta: &Array1<f64>) -> f64 {
        match self {
            Self::Dense(m) => beta.dot(&m.dot(beta)),
            Self::KroneckerFactored { .. } => {
                let sv = self.dot(beta);
                beta.dot(&sv)
            }
            Self::Blockwise {
                local, col_range, ..
            } => {
                let beta_block = beta.slice(ndarray::s![col_range.clone()]);
                let sv = local.dot(&beta_block);
                beta_block.dot(&sv)
            }
            Self::Labeled { inner, .. } | Self::Fixed { inner, .. } => inner.quadratic_form(beta),
        }
    }

    /// Row count, mirroring `Array2::nrows`.
    pub fn nrows(&self) -> usize {
        self.dim()
    }

    /// Column count, mirroring `Array2::ncols` (equal to `nrows`).
    pub fn ncols(&self) -> usize {
        self.dim()
    }
}
impl From<Array2<f64>> for PenaltyMatrix {
fn from(m: Array2<f64>) -> Self {
Self::Dense(m)
}
}
/// Per-subject channel Hessian provider for multi-output families.
///
/// Multi-output families decompose the Fisher information as
///
/// ```text
/// I(β) = Σ_i J_iᵀ W_i J_i
/// ```
///
/// with `J_i` the channel-stacked Jacobian of subject `i` (shape
/// `n_outputs × p`) and `W_i` the `n_outputs × n_outputs` per-subject channel
/// Hessian of the row negative log-likelihood (the second-derivative block of
/// `−log L_i(u_i)` at a pilot β, PSD-clamped).
///
/// For single-output families `W_i` is the scalar IRLS weight. For
/// multi-output families (survival marginal-slope: `n_outputs = 4`;
/// location-scale: `n_outputs = 2`) it carries the full cross-channel
/// curvature.
///
/// The identifiability canonicalisation step uses the `n_outputs`-channel
/// weighted joint design `W_joint = Σ_i sqrt(W_i) ⊗ J_i` to detect
/// block-against-block aliasing. When this trait is present on
/// `ParameterBlockSpec::channel_hessian`, `canonicalize_for_identifiability`
/// routes through `audit_identifiability_channel_aware`; when absent it falls
/// back to the scalar-weight flat audit.
///
/// # W-metric rank theorem
///
/// The canonicalisation computes `rank(J^T W J)` where `W_blkdiag` is the
/// block-diagonal matrix of the per-subject `W_i`. That rank equals
///
/// ```text
/// rank(J) − dim(range(J) ∩ ker(W_blkdiag))
/// ```
///
/// so columns of `J` lying in the kernel of `W_blkdiag` (flat directions in
/// the curvature landscape at the pilot β) are identified as
/// curvature-redundant and may be dropped.
pub trait FamilyChannelHessian: Send + Sync {
    /// Number of output channels `n_outputs` (= K in the row Jacobian).
    fn n_outputs(&self) -> usize;

    /// Number of subjects (rows).
    fn n_subjects(&self) -> usize;

    /// Write subject `i`'s `n_outputs × n_outputs` channel Hessian `W_i` into
    /// `out`, row-major, where `out.len() == n_outputs * n_outputs`.
    /// Negative eigenvalues must be clamped to zero (PSD projection) before
    /// or inside this call.
    fn fill_subject(&self, i: usize, out: &mut [f64]);

    /// Materialise the full `(n_subjects × n_outputs × n_outputs)` tensor.
    ///
    /// The default implementation calls `fill_subject` once per subject with a
    /// reused scratch buffer and copies it into the tensor.
    fn evaluate_full(&self) -> ndarray::Array3<f64> {
        let n_subjects = self.n_subjects();
        let n_channels = self.n_outputs();
        let mut tensor = ndarray::Array3::<f64>::zeros((n_subjects, n_channels, n_channels));
        let mut scratch = vec![0.0_f64; n_channels * n_channels];
        for subject in 0..n_subjects {
            self.fill_subject(subject, &mut scratch);
            // `scratch` is row-major: flat index = row * n_channels + col.
            // It is empty when `n_channels == 0`, so the division is never
            // reached in that case.
            for (flat, &value) in scratch.iter().enumerate() {
                tensor[[subject, flat / n_channels, flat % n_channels]] = value;
            }
        }
        tensor
    }

    /// Return a refreshed W evaluated at `beta`, using `family_scalars` when
    /// those scalars carry the per-row primary state at the current β.
    ///
    /// # Fisher information identity
    ///
    /// `I(β) = J(β)^T W(β) J(β)`. T8 originally froze W at β=0; T34 refreshes
    /// both J and W at the current β so the audit's rank verdict reflects the
    /// actual local identifiability.
    ///
    /// # Default implementation (β-independent W)
    ///
    /// Families whose W does not depend on β (e.g. Gaussian-identity where
    /// `W = prior_w`) get a snapshot of their frozen W via `evaluate_full()`,
    /// wrapped in a [`TensorChannelHessian`]. Nothing is recomputed from
    /// `beta` or `family_scalars`; both are only sanity-checked.
    ///
    /// # Override (β-dependent W)
    ///
    /// Families with β-dependent W (e.g. survival marginal-slope where
    /// `W_i(β)` depends on `(q0_i, q1_i, qd1_i, g_i)`) must override this
    /// method and recompute W from the current primary state.
    ///
    /// When `beta` is non-zero in a way that affects W (i.e. `g_i != 0`),
    /// `family_scalars` MUST be `Some(..)`. Return `Err` if scalars are
    /// missing in that case (same error-message style as T26's contract).
    ///
    /// # Errors
    ///
    /// The default implementation fails when `beta` contains a NaN, or when
    /// `family_scalars` is supplied together with an empty `beta`.
    fn channel_hessian_at(
        &self,
        beta: &[f64],
        family_scalars: Option<&std::sync::Arc<dyn std::any::Any + Send + Sync>>,
    ) -> Result<Arc<dyn FamilyChannelHessian>, String> {
        // Even though the default W ignores β, garbage caller state should
        // surface as an error rather than yield a silently stale W.
        let beta_has_nan = beta.iter().any(|value| value.is_nan());
        if beta_has_nan {
            return Err("channel_hessian_at: beta contains NaN".to_string());
        }
        // Scalars without any coefficients indicate inconsistent caller
        // state; the default impl never requires scalars otherwise.
        if beta.is_empty() && family_scalars.is_some() {
            return Err(
                "channel_hessian_at: family_scalars supplied but beta is empty".to_string(),
            );
        }
        Ok(Arc::new(TensorChannelHessian {
            h: self.evaluate_full(),
        }))
    }
}
/// A [`FamilyChannelHessian`] that simply serves a pre-computed
/// `(n × K × K)` tensor. The default `channel_hessian_at` implementation
/// returns one of these, and tests use it too.
///
/// This is the β-independent path: `fill_subject` copies out of the frozen
/// tensor and never recomputes anything.
pub struct TensorChannelHessian {
    /// Frozen per-subject channel Hessians, indexed `[subject, row, col]`.
    pub h: ndarray::Array3<f64>,
}

impl FamilyChannelHessian for TensorChannelHessian {
    /// Channel count, read from the tensor's second axis.
    fn n_outputs(&self) -> usize {
        let (_, channels, _) = self.h.dim();
        channels
    }

    /// Subject count, read from the tensor's first axis.
    fn n_subjects(&self) -> usize {
        let (subjects, _, _) = self.h.dim();
        subjects
    }

    /// Copy subject `i`'s `K × K` slice into `out` in row-major order.
    /// Panics if `out.len() != K * K`.
    fn fill_subject(&self, i: usize, out: &mut [f64]) {
        let (_, channels, _) = self.h.dim();
        assert_eq!(out.len(), channels * channels);
        // `out` is empty when `channels == 0`, so no division by zero occurs.
        for (flat, slot) in out.iter_mut().enumerate() {
            *slot = self.h[[i, flat / channels, flat % channels]];
        }
    }

    /// The tensor is already materialised; hand back a copy.
    fn evaluate_full(&self) -> ndarray::Array3<f64> {
        self.h.clone()
    }
}
/// β-linearization state passed to [`BlockEffectiveJacobian::effective_jacobian_at`].
///
/// At pre-fit initialization, pass `beta = &[]` / zeros and `family_scalars = None`.
/// Families that need β-dependent scalars (e.g. survival marginal-slope's q0, q1,
/// g, c, z) store them in `family_scalars` as a concrete type behind
/// `Arc<dyn Any + Send + Sync>` and downcast inside their impl.
pub struct FamilyLinearizationState<'a> {
/// Coefficient values at the linearization point. May be empty or all
/// zeros at pre-fit initialization (see the type-level docs).
pub beta: &'a [f64],
/// Optional family-shared scalars at this β linearization.
/// Downcast via `state.family_scalars.as_ref().and_then(|a| a.downcast_ref::<T>())`.
pub family_scalars: Option<Arc<dyn std::any::Any + Send + Sync>>,
/// Optional per-subject channel Hessian for multi-output families.
/// When `Some`, the identifiability canonicalisation step and the Gram
/// builder use the channel-stacked Fisher information instead of the
/// scalar-weight approximation. Single-output families leave this `None`.
pub channel_hessian: Option<Arc<dyn FamilyChannelHessian>>,
/// Probit frailty scale factor `s_f = 1/√(1+σ²)`.
///
/// For survival marginal-slope families the logslope η contribution is
/// `s_f · g · z`, so any Jacobian callback that depends on g or z must
/// read `s_f` from here rather than from a captured-at-construction value.
/// When σ = 0 (no frailty) or for non-frailty families, set this to 1.0.
///
/// Since σ is always **fixed** (not jointly optimised with β) in the
/// survival family, `s_f` is a static scalar for the entire inner fit;
/// `∂s_f/∂σ` never appears in the β-Jacobian. The field is nonetheless
/// carried through state so that Jacobian callbacks are not required to
/// capture `s_f` at spec-construction time — they can read it at
/// evaluation time and thus stay correct across outer-loop σ updates.
pub probit_frailty_scale: f64,
}
/// β-dependent Jacobian callback for a parameter block.
///
/// Principled long-term contract for expressing how a block contributes to
/// the stacked linear predictor at a given β:
///
/// ```text
/// J(β) ∈ ℝ^{n_rows · n_outputs × p_block}
/// ```
///
/// - Single-output linear block: returns `design.clone()`.
/// - Row-scaled block (`RowScaledJacobian`): returns `diag(eta_scaling) · design` (still linear in β).
/// - Multi-output block (e.g. survival marginal-slope with η0, η1, ad1):
/// stacks `∂eta_r/∂β_k` for `r ∈ 0..n_outputs`, row-major ordering.
///
/// The default impl on [`ParameterBlockSpec::effective_jacobian_at`] is:
/// - `jacobian_callback = None` → `design.clone()`.
/// - `jacobian_callback = Some(cb)` → delegates to `cb.effective_jacobian_at`.
pub trait BlockEffectiveJacobian: Send + Sync {
/// Stacked multi-output Jacobian at the current β.
///
/// Shape: `(n_rows * n_outputs, p_block)`, **channel-major**: rows
/// `r * n_rows .. (r + 1) * n_rows` carry output channel `r`'s row
/// Jacobian, so `stacked[r * n_rows + i, j]` is observation `i`'s row at
/// output `r` and coefficient column `j`. Every consumer that destacks
/// this matrix (audit, canonicaliser, fit) relies on this layout — see
/// `BlockJacobianAsRowOp::from_callback` for the destacking transpose.
/// For `n_outputs = 1` this is identical to the `(n_rows, p_block)` effective
/// design used by the flat identifiability audit.
///
/// # Errors
///
/// Returns a `String` describing why the Jacobian could not be produced.
/// The implementations in this file reject NaN entries in `state.beta`.
fn effective_jacobian_at(
&self,
state: &FamilyLinearizationState<'_>,
) -> Result<Array2<f64>, String>;
/// Number of stacked output channels. 1 for most blocks.
///
/// The default returns 1; implementations that stack several channels
/// (e.g. `AdditiveBlockJacobian`) override it.
fn n_outputs(&self) -> usize {
1
}
/// Returns the per-row scaling vector when this callback is a simple
/// diagonal-scaling block (`RowScaledJacobian`). Used by the
/// identifiability audit's skewness-aware bias correction (T25).
///
/// Returns `None` for all blocks except `RowScaledJacobian`.
fn eta_row_scaling_for_skewness(&self) -> Option<Arc<[f64]>> {
None
}
}
/// A [`BlockEffectiveJacobian`] for any block that contributes linearly to
/// exactly one output of a multi-output family.
///
/// The returned Jacobian has shape `(n_family_outputs * n, p_block)`:
/// rows `own_output * n .. (own_output + 1) * n` contain `design`,
/// all other rows are zero.
pub struct AdditiveBlockJacobian {
    /// The block's effective design matrix (`n × p_block`).
    pub design: Array2<f64>,
    /// Zero-based index of the output this block drives.
    pub own_output: usize,
    /// Total number of family outputs (e.g. 2 for location-scale).
    pub n_family_outputs: usize,
}

impl BlockEffectiveJacobian for AdditiveBlockJacobian {
    /// Build the channel-major stacked Jacobian: `design` placed in the
    /// `own_output` row band, zeros elsewhere.
    ///
    /// # Errors
    ///
    /// Returns `Err` when `state.beta` contains a NaN, or when `own_output`
    /// does not address a row band inside the stacked matrix (i.e.
    /// `own_output >= n_family_outputs` with a non-empty design).
    fn effective_jacobian_at(
        &self,
        state: &FamilyLinearizationState<'_>,
    ) -> Result<Array2<f64>, String> {
        let n = self.design.nrows();
        let p = self.design.ncols();
        // Additive (linear) block: the Jacobian does not depend on
        // `state.beta`; the NaN check is purely a sanity check on caller
        // state. `any` is already `false` for an empty slice.
        if state.beta.iter().any(|v| v.is_nan()) {
            return Err(
                "AdditiveBlockJacobian::effective_jacobian_at: beta contains NaN".to_string(),
            );
        }
        // An out-of-range channel would make the slice below panic; report
        // it through the `Result` instead. With `n == 0` the band is empty
        // and the slice is always valid, so that case is left untouched.
        if n > 0 && self.own_output >= self.n_family_outputs {
            return Err(format!(
                "AdditiveBlockJacobian::effective_jacobian_at: own_output {} is out of range for {} family outputs",
                self.own_output, self.n_family_outputs,
            ));
        }
        let total_rows = self.n_family_outputs * n;
        let mut jac = Array2::<f64>::zeros((total_rows, p));
        let row_start = self.own_output * n;
        jac.slice_mut(ndarray::s![row_start..row_start + n, ..])
            .assign(&self.design);
        Ok(jac)
    }

    /// Number of stacked channels, equal to `n_family_outputs`.
    fn n_outputs(&self) -> usize {
        self.n_family_outputs
    }
}
/// A [`BlockEffectiveJacobian`] for a single-output block that enters the
/// linear predictor as `diag(eta_scaling) · design`, i.e. each design row is
/// multiplied by its own scalar.
///
/// This is the canonical replacement for the former `eta_row_scaling` field on
/// [`ParameterBlockSpec`]. The identifiability audit's skewness-aware bias
/// correction can recover the scaling vector via
/// [`BlockEffectiveJacobian::eta_row_scaling_for_skewness`].
pub struct RowScaledJacobian {
    /// Unscaled design matrix, shared behind an `Arc`.
    pub design: Arc<Array2<f64>>,
    /// One scale factor per design row.
    pub eta_scaling: Arc<[f64]>,
}

impl BlockEffectiveJacobian for RowScaledJacobian {
    /// Return a copy of `design` with row `i` multiplied by `eta_scaling[i]`.
    ///
    /// Fails when the scaling vector and the design disagree on the row
    /// count, or when `state.beta` contains a NaN.
    fn effective_jacobian_at(
        &self,
        state: &FamilyLinearizationState<'_>,
    ) -> Result<Array2<f64>, String> {
        let n_rows = self.design.nrows();
        let n_scales = self.eta_scaling.len();
        if n_scales != n_rows {
            return Err(format!(
                "RowScaledJacobian: eta_scaling length {} != design nrows {}",
                n_scales, n_rows,
            ));
        }
        // The block is β-linear, so β is not used for the result; the check
        // only guards against a corrupted linearization point. An empty β
        // trivially passes.
        let beta_has_nan = state.beta.iter().any(|value| value.is_nan());
        if beta_has_nan {
            return Err(
                "RowScaledJacobian::effective_jacobian_at: state.beta contains NaN".to_string(),
            );
        }
        let mut scaled = (*self.design).clone();
        for (mut row, &scale) in scaled.outer_iter_mut().zip(self.eta_scaling.iter()) {
            row.mapv_inplace(|entry| entry * scale);
        }
        Ok(scaled)
    }

    /// Expose the row scaling (a cheap `Arc` clone) for the skewness-aware
    /// audit correction.
    fn eta_row_scaling_for_skewness(&self) -> Option<Arc<[f64]>> {
        Some(Arc::clone(&self.eta_scaling))
    }
}
/// Static specification for one parameter block in a custom family.
///
/// `design` and `stacked_design` are two structurally distinct operators:
///
/// * `design` is the **canonical, single-channel, n-observation operator**.
/// `design.nrows()` ALWAYS equals `n_obs` (one row per training
/// observation). This is the matrix the identifiability audit, the
/// shape policy, and every "what shape is this block?" reader inspect.
/// For most blocks `design` is also the eta-producing operator used by
/// the solver — see [`Self::solver_design`].
/// * `stacked_design`, when `Some`, is the **multi-channel eta-producing
/// operator** used by the solver. Survival time-varying blocks stack
/// `[exit; entry; deriv]` into a `(3·n × p)` operator here so the
/// solver can produce a `3·n`-long `eta` in one mat-vec; the audit
/// never sees this matrix. When `None`, the solver uses `design` (the
/// single-channel default).
///
/// The single contract that downstream code can rely on:
/// `design.nrows() == n_obs`. No more dual semantics on `design`.
///
/// Read access:
/// * Audit / canonicalize / "n_obs is the row count" code → `&spec.design`.
/// * Eta-producing solver code → [`Self::solver_design`].
#[derive(Clone)]
pub struct ParameterBlockSpec {
/// Block name. It appears in error messages and is what
/// `CoefficientBlockSelector::Name` is matched against.
pub name: String,
/// Canonical single-channel operator with one row per observation
/// (see the type-level docs).
pub design: DesignMatrix,
/// Offset paired with `design`. [`Self::solver_offset`] returns it
/// whenever `stacked_offset` is `None`.
pub offset: Array1<f64>,
/// Block-local penalty matrices (all p_block x p_block).
pub penalties: Vec<PenaltyMatrix>,
/// Structural nullspace dimension of each penalty matrix (same length as `penalties`).
/// Used by the penalty pseudo-logdet to determine rank without numerical thresholds.
/// If empty, falls back to eigenvalue-based rank detection.
pub nullspace_dims: Vec<usize>,
/// Initial log-smoothing parameters for this block (same length as `penalties`).
pub initial_log_lambdas: Array1<f64>,
/// Optional initial coefficients (defaults to zeros if omitted).
pub initial_beta: Option<Array1<f64>>,
/// Gauge ownership priority. Higher = more likely to retain a
/// redundant direction during canonical-gauge reparameterisation.
/// Defaults to 100. Set higher for blocks that should "own" shared
/// affine/null-space directions (e.g. baseline time in survival).
pub gauge_priority: u8,
/// Full β-dependent Jacobian callback. When `Some`, this is the
/// authoritative source for `effective_jacobian_at`. For simple
/// single-output row-scaled blocks use [`RowScaledJacobian`].
pub jacobian_callback: Option<Arc<dyn BlockEffectiveJacobian>>,
/// Optional multi-channel eta-producing operator used by the solver.
///
/// When `Some`, the solver consumes this matrix (typically
/// `(k·n × p)` for `k` stacked channels — e.g. survival
/// `[exit; entry; deriv]` with `k = 3`) to evaluate `eta = stacked · β + stacked_offset`.
/// The audit and shape policy NEVER read this field; they only ever
/// inspect `design` (which always has `n_obs` rows).
///
/// When `None`, the solver falls back to `design` — the correct
/// behavior for every single-channel block (i.e. all non-survival
/// time-varying blocks).
///
/// Read this field via [`Self::solver_design`], never directly.
///
/// Invariant: when `stacked_design = Some(_)`, `stacked_offset` MUST
/// also be `Some(_)` and its length MUST equal `stacked_design.nrows()`.
pub stacked_design: Option<DesignMatrix>,
/// Optional offset paired with [`Self::stacked_design`]. Same Option
/// state as `stacked_design` (both `Some` or both `None`).
/// Read via [`Self::solver_offset`].
pub stacked_offset: Option<Array1<f64>>,
}
/// Hand-written `Debug` because `jacobian_callback` holds a trait object
/// that is rendered as a fixed placeholder string instead of its contents.
/// Every other field, including the stacked solver operator and its offset,
/// is printed with its own `Debug` impl.
impl std::fmt::Debug for ParameterBlockSpec {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("ParameterBlockSpec")
            .field("name", &self.name)
            .field("design", &self.design)
            .field("offset", &self.offset)
            .field("penalties", &self.penalties)
            .field("nullspace_dims", &self.nullspace_dims)
            .field("initial_log_lambdas", &self.initial_log_lambdas)
            .field("initial_beta", &self.initial_beta)
            .field("gauge_priority", &self.gauge_priority)
            .field(
                "jacobian_callback",
                &self
                    .jacobian_callback
                    .as_ref()
                    .map(|_| "<BlockEffectiveJacobian>"),
            )
            // Previously omitted: without these two a debug dump could not
            // show whether the solver uses `design` or a stacked operator.
            .field("stacked_design", &self.stacked_design)
            .field("stacked_offset", &self.stacked_offset)
            .finish()
    }
}
impl ParameterBlockSpec {
    /// Baseline spec with every optional field at its neutral value: empty
    /// name, a `0 × 0` dense design, empty offset/penalties/log-lambdas, no
    /// initial β, `gauge_priority = 100`, no callback and no stacked operator.
    ///
    /// Intended for struct-update syntax (`..ParameterBlockSpec::defaults()`)
    /// so literals keep compiling when new fields are added.
    pub fn defaults() -> Self {
        let empty_design = DesignMatrix::Dense(crate::linalg::matrix::DenseDesignMatrix::from(
            ndarray::Array2::<f64>::zeros((0, 0)),
        ));
        let empty_offset = ndarray::Array1::<f64>::zeros(0);
        let empty_log_lambdas = ndarray::Array1::<f64>::zeros(0);
        Self {
            name: String::new(),
            design: empty_design,
            offset: empty_offset,
            penalties: Vec::new(),
            nullspace_dims: Vec::new(),
            initial_log_lambdas: empty_log_lambdas,
            initial_beta: None,
            gauge_priority: 100,
            jacobian_callback: None,
            stacked_design: None,
            stacked_offset: None,
        }
    }

    /// The operator the solver multiplies β by to obtain `eta`.
    ///
    /// Resolution order:
    /// 1. `stacked_design = Some(d)` → `d` (multi-channel operator, e.g.
    ///    `(3n × p)` for survival time-varying blocks).
    /// 2. otherwise → `&self.design` (the single-channel default).
    ///
    /// Solver code that needs `eta = D · β` MUST go through this accessor:
    /// `self.design.nrows()` always equals `n_obs`, never `3·n_obs`, so
    /// reading `design` directly silently breaks multi-channel (survival LS
    /// time-varying) blocks.
    pub fn solver_design(&self) -> &DesignMatrix {
        match &self.stacked_design {
            Some(stacked) => stacked,
            None => &self.design,
        }
    }

    /// The offset that goes with [`Self::solver_design`]: `stacked_offset`
    /// when present, `offset` otherwise.
    pub fn solver_offset(&self) -> &Array1<f64> {
        match &self.stacked_offset {
            Some(stacked) => stacked,
            None => &self.offset,
        }
    }

    /// Effective design `D_eff` at β = 0 with no family scalars, no channel
    /// Hessian and a frailty scale of 1 — a convenience wrapper around
    /// [`Self::effective_jacobian_at`] for the single-output case.
    ///
    /// Callers that need multi-output Jacobians or β-dependent scalars should
    /// call `effective_jacobian_at` directly with the appropriate state.
    ///
    /// Returns `Err` if the design cannot be densified (or the callback
    /// fails).
    pub fn effective_design(&self, caller: &str) -> Result<ndarray::Array2<f64>, String> {
        let zero_beta = vec![0.0f64; self.design.ncols()];
        let pilot_state = FamilyLinearizationState {
            beta: zero_beta.as_slice(),
            family_scalars: None,
            channel_hessian: None,
            probit_frailty_scale: 1.0,
        };
        self.effective_jacobian_at(caller, &pilot_state)
    }

    /// β-dependent stacked Jacobian `J(β)` of this block.
    ///
    /// Shape: `(n_rows * n_outputs, p_block)`. For most blocks `n_outputs = 1`
    /// and the result is the familiar `(n_rows, p_block)` effective design.
    ///
    /// Dispatch:
    /// 1. `jacobian_callback = Some(cb)` → `cb.effective_jacobian_at(state)`.
    /// 2. `jacobian_callback = None` → a dense copy of `design` (`state` is
    ///    ignored).
    ///
    /// `caller` is only used to label the densification error message.
    ///
    /// Returns `Err` if the design cannot be densified.
    pub fn effective_jacobian_at(
        &self,
        caller: &str,
        state: &FamilyLinearizationState<'_>,
    ) -> Result<ndarray::Array2<f64>, String> {
        match self.jacobian_callback.as_ref() {
            Some(callback) => callback.effective_jacobian_at(state),
            None => {
                let context = format!(
                    "{caller}::effective_jacobian_at block '{}'",
                    self.name
                );
                self.design
                    .try_to_dense_arc(&context)
                    .map(|arc| arc.as_ref().clone())
            }
        }
    }
}
/// Identifies a parameter block either by its name or by its position in
/// the block list.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum CoefficientBlockSelector {
    Name(String),
    Index(usize),
}

/// Addresses a single coefficient: a block selector plus a column index.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CoefficientLabel {
    pub block: CoefficientBlockSelector,
    pub column: usize,
}

impl CoefficientLabel {
    /// Label for column `column` of the block selected by name.
    pub fn by_block_name(block: impl Into<String>, column: usize) -> Self {
        let block = CoefficientBlockSelector::Name(block.into());
        Self { block, column }
    }
}

/// Free-function shorthand producing the same value as
/// [`CoefficientLabel::by_block_name`].
pub fn coefficient_label(block: impl Into<String>, column: usize) -> CoefficientLabel {
    CoefficientLabel {
        block: CoefficientBlockSelector::Name(block.into()),
        column,
    }
}
/// Prior placed on the precision of a coefficient group. Each variant maps
/// one-to-one onto a `crate::types::RhoPrior` variant via
/// [`CoefficientGroupPrior::to_rho_prior`].
#[derive(Debug, Clone, PartialEq)]
pub enum CoefficientGroupPrior {
    Flat,
    NormalLogPrecision {
        mean: f64,
        sd: f64,
    },
    GammaPrecision {
        shape: f64,
        rate: f64,
    },
    /// Penalized-complexity prior calibrated by `P(exp(-ρ/2) > upper) =
    /// tail_prob`; see [`crate::types::RhoPrior::PenalizedComplexity`].
    PenalizedComplexity {
        upper: f64,
        tail_prob: f64,
    },
}

impl CoefficientGroupPrior {
    /// Translate into the equivalent `RhoPrior`, copying the hyperparameters.
    pub fn to_rho_prior(&self) -> crate::types::RhoPrior {
        match self {
            Self::Flat => crate::types::RhoPrior::Flat,
            Self::NormalLogPrecision { mean, sd } => crate::types::RhoPrior::Normal {
                mean: *mean,
                sd: *sd,
            },
            Self::GammaPrecision { shape, rate } => crate::types::RhoPrior::GammaPrecision {
                shape: *shape,
                rate: *rate,
            },
            Self::PenalizedComplexity { upper, tail_prob } => {
                crate::types::RhoPrior::PenalizedComplexity {
                    upper: *upper,
                    tail_prob: *tail_prob,
                }
            }
        }
    }

    /// Check the hyperparameters, prefixing any error message with `context`.
    ///
    /// Rules: Normal needs a finite mean and a finite `sd > 0`; Gamma needs a
    /// finite `shape > 0` and a finite `rate >= 0`; penalized-complexity is
    /// delegated to `validate_penalized_complexity_prior`; `Flat` always
    /// passes.
    fn validate(&self, context: &str) -> Result<(), String> {
        match *self {
            Self::Flat => Ok(()),
            Self::PenalizedComplexity { upper, tail_prob } => {
                validate_penalized_complexity_prior(context, upper, tail_prob)
            }
            Self::NormalLogPrecision { mean, sd } => {
                if !mean.is_finite() {
                    Err(format!(
                        "{context} Normal log-precision prior requires finite mean, got {mean}"
                    ))
                } else if sd.is_finite() && sd > 0.0 {
                    Ok(())
                } else {
                    Err(format!(
                        "{context} Normal log-precision prior requires sd > 0, got {sd}"
                    ))
                }
            }
            Self::GammaPrecision { shape, rate } => {
                let shape_ok = shape.is_finite() && shape > 0.0;
                if !shape_ok {
                    // NOTE(review): this is the only arm routed through
                    // `CustomFamilyError::DimensionMismatch`; the sibling
                    // checks return plain strings. Confirm the variant is
                    // intended for a range violation.
                    return Err(CustomFamilyError::DimensionMismatch {
                        reason: format!(
                            "{context} Gamma precision prior requires shape > 0, got {shape}"
                        ),
                    }
                    .into());
                }
                let rate_ok = rate.is_finite() && rate >= 0.0;
                if !rate_ok {
                    return Err(format!(
                        "{context} Gamma precision prior requires rate >= 0, got {rate}"
                    ));
                }
                Ok(())
            }
        }
    }
}
/// User-facing declaration of a coefficient group: a label, its member
/// coefficients, and optional parent / prior / initial log-precision.
#[derive(Debug, Clone, PartialEq)]
pub struct CoefficientGroupSpec {
    pub label: String,
    pub coefficients: Vec<CoefficientLabel>,
    pub parent: Option<String>,
    pub prior: Option<CoefficientGroupPrior>,
    pub initial_log_precision: Option<f64>,
}

impl CoefficientGroupSpec {
    /// Create a group with the given label and members; `parent`, `prior`
    /// and `initial_log_precision` start out as `None`.
    pub fn new(label: impl Into<String>, coefficients: Vec<CoefficientLabel>) -> Self {
        let label = label.into();
        Self {
            label,
            coefficients,
            parent: None,
            prior: None,
            initial_log_precision: None,
        }
    }

    /// Builder-style setter for the parent label.
    pub fn with_parent(self, parent: impl Into<String>) -> Self {
        Self {
            parent: Some(parent.into()),
            ..self
        }
    }

    /// Builder-style setter for the prior.
    pub fn with_prior(self, prior: CoefficientGroupPrior) -> Self {
        Self {
            prior: Some(prior),
            ..self
        }
    }
}
/// A coefficient group in resolved form; mirrors [`CoefficientGroupSpec`]
/// with numeric coefficient addresses and a concrete initial log-precision.
///
/// NOTE(review): the code that builds this type is not in view here, so the
/// field notes below that go beyond the declared types are assumptions.
#[derive(Debug, Clone, PartialEq)]
pub struct RealizedCoefficientGroup {
/// Group label.
pub label: String,
/// Optional parent group label.
pub parent: Option<String>,
/// Member coefficients as index pairs — presumably `(block index, column)`;
/// TODO confirm against the realization code.
pub coefficients: Vec<(usize, usize)>,
/// Optional prior on the group's precision.
pub prior: Option<CoefficientGroupPrior>,
/// Initial log-precision. Unlike `CoefficientGroupSpec::initial_log_precision`
/// this is not optional.
pub initial_log_precision: f64,
}
/// Bundle of parameter-block specs together with the coefficient groups,
/// penalty labels and priors that go with them.
///
/// NOTE(review): the producer of this struct is not in view here; the notes
/// on `specs` and `groups` restate the declared types only.
#[derive(Debug, Clone)]
pub struct RealizedCoefficientGroupSpecs {
/// Parameter block specifications.
pub specs: Vec<ParameterBlockSpec>,
/// Realized coefficient groups.
pub groups: Vec<RealizedCoefficientGroup>,
/// One entry per realized penalty in flattened block order. Built-in
/// penalties receive unique internal labels; user groups carry their
/// declared labels. Consumers that optimize one coordinate per label can
/// use this to tie cross-block penalty pieces to a shared precision.
pub penalty_labels: Vec<String>,
/// Per-coordinate priors in `outer_labels` order.
pub rho_prior: crate::types::RhoPrior,
/// Labels defining the coordinate order that `rho_prior` follows.
pub outer_labels: Vec<String>,
}
/// Resolve a coefficient-group block selector to a position in `specs`.
///
/// An index selector must be in range. A name selector resolves to the first
/// block whose `name` equals the requested name.
///
/// # Errors
///
/// Returns a message when the index is out of range or no block carries the
/// requested name.
fn coefficient_group_block_index(
    specs: &[ParameterBlockSpec],
    selector: &CoefficientBlockSelector,
) -> Result<usize, String> {
    match selector {
        CoefficientBlockSelector::Index(index) if *index < specs.len() => Ok(*index),
        CoefficientBlockSelector::Index(index) => Err(format!(
            "coefficient group references block index {index}, but only {} blocks exist",
            specs.len()
        )),
        CoefficientBlockSelector::Name(name) => {
            match specs.iter().position(|spec| spec.name == *name) {
                Some(found) => Ok(found),
                None => Err(format!(
                    "coefficient group references unknown block '{name}'"
                )),
            }
        }
    }
}
/// Validate a single scalar rho-prior coordinate.
///
/// `Flat` is always accepted. `Normal` needs a finite mean and `sd > 0`;
/// `GammaPrecision` needs `shape > 0` and `rate >= 0`; `PenalizedComplexity`
/// is delegated to `validate_penalized_complexity_prior`. A nested
/// `Independent` prior is rejected because a coordinate must be scalar.
///
/// `context` is prefixed to every error message.
fn validate_group_rho_prior_coordinate(
    prior: &crate::types::RhoPrior,
    context: &str,
) -> Result<(), String> {
    match prior {
        crate::types::RhoPrior::Flat => Ok(()),
        crate::types::RhoPrior::Independent(_) => {
            Err(CustomFamilyError::ConstraintViolation {
                reason: format!(
                    "{context} must be a scalar rho prior, not a nested Independent prior"
                ),
            }
            .into())
        }
        crate::types::RhoPrior::PenalizedComplexity { upper, tail_prob } => {
            validate_penalized_complexity_prior(context, *upper, *tail_prob)
        }
        crate::types::RhoPrior::Normal { mean, sd } => {
            if !mean.is_finite() {
                return Err(format!(
                    "{context} Normal log-precision prior requires finite mean, got {mean}"
                ));
            }
            let sd_is_valid = sd.is_finite() && *sd > 0.0;
            if sd_is_valid {
                Ok(())
            } else {
                Err(format!(
                    "{context} Normal log-precision prior requires sd > 0, got {sd}"
                ))
            }
        }
        crate::types::RhoPrior::GammaPrecision { shape, rate } => {
            let shape_is_valid = shape.is_finite() && *shape > 0.0;
            if !shape_is_valid {
                return Err(CustomFamilyError::DimensionMismatch {
                    reason: format!(
                        "{context} Gamma precision prior requires shape > 0, got {shape}"
                    ),
                }
                .into());
            }
            let rate_is_valid = rate.is_finite() && *rate >= 0.0;
            if !rate_is_valid {
                return Err(format!(
                    "{context} Gamma precision prior requires rate >= 0, got {rate}"
                ));
            }
            Ok(())
        }
    }
}
/// Check the hyperparameters of a penalized-complexity prior.
///
/// `upper` must be finite and strictly positive; `tail_prob` must be a finite
/// probability strictly inside `(0, 1)`. `upper` is checked first, so when
/// both are invalid the error reports `upper`. `context` is prefixed to the
/// error message.
fn validate_penalized_complexity_prior(
    context: &str,
    upper: f64,
    tail_prob: f64,
) -> Result<(), String> {
    let upper_is_valid = upper.is_finite() && upper > 0.0;
    if !upper_is_valid {
        return Err(format!(
            "{context} penalized-complexity prior requires upper > 0, got {upper}"
        ));
    }
    let tail_is_valid = tail_prob.is_finite() && tail_prob > 0.0 && tail_prob < 1.0;
    if tail_is_valid {
        Ok(())
    } else {
        Err(format!(
            "{context} penalized-complexity prior requires tail probability in (0, 1), got {tail_prob}"
        ))
    }
}
/// Expand the base rho prior into one validated prior per built-in penalty.
///
/// A scalar prior is validated once and replicated `base_count` times. An
/// `Independent` prior must already contain exactly `base_count` entries, each
/// of which is validated as a scalar coordinate.
fn expand_custom_group_base_prior(
    base_prior: &crate::types::RhoPrior,
    base_count: usize,
    context: &str,
) -> Result<Vec<crate::types::RhoPrior>, String> {
    let crate::types::RhoPrior::Independent(priors) = base_prior else {
        // Scalar prior: shared by every built-in penalty coordinate.
        validate_group_rho_prior_coordinate(base_prior, context)?;
        return Ok(vec![base_prior.clone(); base_count]);
    };
    if priors.len() != base_count {
        return Err(CustomFamilyError::DimensionMismatch {
            reason: format!(
                "{context} base Independent rho prior length mismatch: got {}, expected {base_count}",
                priors.len()
            ),
        }
        .into());
    }
    priors.iter().enumerate().try_for_each(|(idx, prior)| {
        validate_group_rho_prior_coordinate(prior, &format!("{context} base prior {idx}"))
    })?;
    Ok(priors.clone())
}
/// Realize user-declared coefficient groups as extra penalties on the
/// parameter blocks of a custom family.
///
/// Steps:
/// 1. validate the block specs and each group's prior / initial log precision;
/// 2. resolve every group's coefficient labels to `(block_idx, column)`
///    coordinates and build the shared group hierarchy;
/// 3. label every built-in penalty `__block_{b}_penalty_{k}` and expand
///    `base_prior` to one prior per built-in penalty;
/// 4. for each group, append one diagonal indicator penalty per
///    (penalty component, block) to the realized specs, all tied to the
///    group's precision label.
///
/// `penalty_labels` in the result is aligned with the realized penalties in
/// flattened block order (block 0's penalties, then block 1's, ...), so entry
/// `k` labels the `k`-th penalty of the returned `specs`. `outer_labels` and
/// `rho_prior` list the built-in penalty coordinates first, followed by one
/// coordinate per group.
///
/// # Errors
///
/// Returns a message when block specs are invalid, a prior or initial log
/// precision is invalid, a coefficient label references an unknown block or
/// column, the hierarchy cannot be built, or a group omits its prior while
/// `base_prior` is `Independent`.
pub fn realize_coefficient_groups_for_custom_family(
    specs: &[ParameterBlockSpec],
    groups: &[CoefficientGroupSpec],
    base_prior: crate::types::RhoPrior,
) -> Result<RealizedCoefficientGroupSpecs, String> {
    use crate::terms::coefficient_group_resolver::{ResolvedGroup, ResolvedGroupHierarchy};
    validate_blockspecs(specs)?;
    // Carrier-specific validation. The prior and the custom-only
    // `initial_log_precision` field are validated here because they have no
    // analogue on the standard-term carrier; label, duplicate, empty-set, and
    // hierarchy checks are delegated to the shared resolver below.
    for group in groups {
        if let Some(prior) = group.prior.as_ref() {
            prior.validate(&format!("coefficient group '{}'", group.label))?;
        }
        if let Some(initial) = group.initial_log_precision
            && !initial.is_finite()
        {
            return Err(CustomFamilyError::DimensionMismatch {
                reason: format!(
                    "coefficient group '{}' initial log precision must be finite, got {initial}",
                    group.label
                ),
            }
            .into());
        }
    }
    // Carrier = `(block_idx, column)` coordinates of the parameter blocks.
    // Resolve every declared label into its coordinate set, then hand the
    // carrier-agnostic policy (labels, hierarchy, subsets, child unions) to the
    // shared resolver.
    let resolved_groups = groups
        .iter()
        .map(|group| {
            let mut coordinates = BTreeSet::<(usize, usize)>::new();
            for label in &group.coefficients {
                let block_idx = coefficient_group_block_index(specs, &label.block)?;
                let p = specs[block_idx].design.ncols();
                if label.column >= p {
                    return Err(format!(
                        "coefficient group '{}' references column {} in block '{}' (index {block_idx}), but the block has {p} columns",
                        group.label, label.column, specs[block_idx].name
                    ));
                }
                coordinates.insert((block_idx, label.column));
            }
            Ok(ResolvedGroup {
                label: group.label.clone(),
                parent: group.parent.clone(),
                coordinates,
            })
        })
        .collect::<Result<Vec<_>, String>>()?;
    let hierarchy = ResolvedGroupHierarchy::build(resolved_groups)?;
    let realized_groups = groups
        .iter()
        .zip(hierarchy.groups())
        .map(|(group, resolved)| RealizedCoefficientGroup {
            label: group.label.clone(),
            parent: group.parent.clone(),
            coefficients: resolved.coordinates.iter().copied().collect(),
            prior: group.prior.clone(),
            initial_log_precision: group.initial_log_precision.unwrap_or(0.0),
        })
        .collect::<Vec<_>>();
    let mut realized_specs = specs.to_vec();
    // Penalty labels are collected per block and flattened at the end so that
    // `penalty_labels[k]` always names the k-th realized penalty in flattened
    // block order, even when group penalties are appended to several blocks.
    let mut labels_by_block: Vec<Vec<String>> = specs
        .iter()
        .enumerate()
        .map(|(block_idx, spec)| {
            (0..spec.penalties.len())
                .map(|penalty_idx| format!("__block_{block_idx}_penalty_{penalty_idx}"))
                .collect()
        })
        .collect();
    // Every built-in penalty is its own outer coordinate.
    let mut outer_labels: Vec<String> = labels_by_block.iter().flatten().cloned().collect();
    let base_count = outer_labels.len();
    let mut priors = expand_custom_group_base_prior(&base_prior, base_count, "coefficient groups")?;
    for group in &realized_groups {
        outer_labels.push(group.label.clone());
        let group_prior = match group.prior.as_ref() {
            Some(prior) => prior.to_rho_prior(),
            None => match &base_prior {
                crate::types::RhoPrior::Independent(_) => {
                    return Err(CustomFamilyError::ConstraintViolation {
                        reason: format!(
                            "coefficient group '{}' must declare a prior when base_prior is Independent",
                            group.label
                        ),
                    }
                    .into());
                }
                prior => prior.clone(),
            },
        };
        priors.push(group_prior);
        // Hierarchical Gamma precision update.
        //
        // For one Gaussian coefficient group with fixed beta and precision
        // lambda,
        //
        //   p(beta_g | lambda) p(lambda)
        //     ∝ lambda^{|g|/2} exp[-lambda q_g/2]
        //       lambda^{a_g-1} exp[-b_g lambda],
        //   q_g = (beta_g - mu_g)' S_g (beta_g - mu_g).
        //
        // Maximizing the log posterior in lambda gives
        //
        //   lambda* = (a_g + |g|/2 - 1) / (b_g + q_g/2).
        //
        // If a node has children, beta_g is the concatenation of the child
        // coefficient vectors. The parent density is therefore the product
        // of those child Gaussian factors under one lambda_g: replace |g| and
        // q_g by sums over the child components, expanding recursively when a
        // child is itself an interior node. We preserve that identity by
        // emitting one physical penalty piece per concatenated child component
        // and tying those pieces with the parent's precision label. This is
        // not a block-sum shortcut: overlapping children remain separate
        // factors, so their log normalizers and quadratic contributions both
        // add.
        let penalty_components = hierarchy.concatenated_penalty_components(&group.label);
        for component in penalty_components {
            let mut by_block = BTreeMap::<usize, Vec<usize>>::new();
            for &(block_idx, column) in &component {
                by_block.entry(block_idx).or_default().push(column);
            }
            for (block_idx, columns) in by_block {
                let p = realized_specs[block_idx].design.ncols();
                // Diagonal indicator penalty selecting the group's columns.
                let mut matrix = Array2::<f64>::zeros((p, p));
                for column in &columns {
                    matrix[[*column, *column]] = 1.0;
                }
                realized_specs[block_idx]
                    .penalties
                    .push(PenaltyMatrix::Dense(matrix).with_precision_label(group.label.clone()));
                realized_specs[block_idx]
                    .nullspace_dims
                    .push(p.saturating_sub(columns.len()));
                // Grow the block's initial log-lambda vector by one entry
                // holding the group's initial log precision.
                let mut rho =
                    Array1::<f64>::zeros(realized_specs[block_idx].initial_log_lambdas.len() + 1);
                if !realized_specs[block_idx].initial_log_lambdas.is_empty() {
                    let old_len = realized_specs[block_idx].initial_log_lambdas.len();
                    rho.slice_mut(s![..old_len])
                        .assign(&realized_specs[block_idx].initial_log_lambdas);
                }
                let last = rho.len() - 1;
                rho[last] = group.initial_log_precision;
                realized_specs[block_idx].initial_log_lambdas = rho;
                // Record the label next to the penalty it belongs to.
                labels_by_block[block_idx].push(group.label.clone());
            }
        }
    }
    let penalty_labels: Vec<String> = labels_by_block.into_iter().flatten().collect();
    Ok(RealizedCoefficientGroupSpecs {
        specs: realized_specs,
        groups: realized_groups,
        penalty_labels,
        rho_prior: crate::types::RhoPrior::Independent(priors),
        outer_labels,
    })
}
/// Map a custom-family block to a `BlockRole` from its name and position.
///
/// A single-block model is always `Mean`. Otherwise the trimmed,
/// ASCII-lowercased name is matched against a fixed vocabulary (including the
/// `time_cause_` prefix). Unrecognized names fall back to `Location` for
/// block 0 and `Scale` for every other block.
fn custom_family_block_role(
    name: &str,
    index: usize,
    n_blocks: usize,
) -> crate::solver::estimate::BlockRole {
    use crate::solver::estimate::BlockRole;
    if n_blocks == 1 {
        return BlockRole::Mean;
    }
    let normalized = name.trim().to_ascii_lowercase();
    let recognized = match normalized.as_str() {
        "eta" | "mean" | "beta" => Some(BlockRole::Mean),
        "mu" | "location" | "marginal_surface" => Some(BlockRole::Location),
        "threshold" => Some(BlockRole::Threshold),
        "log_sigma" | "scale" | "logslope_surface" => Some(BlockRole::Scale),
        "time" | "time_transform" | "time_surface" => Some(BlockRole::Time),
        "wiggle" | "linkwiggle" => Some(BlockRole::LinkWiggle),
        other if other.starts_with("time_cause_") => Some(BlockRole::Time),
        _ => None,
    };
    match recognized {
        Some(role) => role,
        // Positional fallback for names outside the vocabulary.
        None if index == 0 => BlockRole::Location,
        None => BlockRole::Scale,
    }
}
/// Current state for a parameter block.
///
/// `assert_states_match_specs` requires `beta.len()` to equal the block's
/// `design.ncols()` and `eta.len()` to equal `solver_design().nrows()`.
#[derive(Clone, Debug)]
pub struct ParameterBlockState {
/// Block coefficient vector.
pub beta: Array1<f64>,
/// Block linear predictor, one entry per row of the block's solver design.
pub eta: Array1<f64>,
}
/// Directional derivative of one block's geometry (design and offset) along a
/// coefficient-space direction.
#[derive(Clone)]
pub struct BlockGeometryDirectionalDerivative {
/// Directional derivative of the block design matrix along a coefficient-space direction.
///
/// NOTE(review): `None` presumably means the design does not change along
/// the direction — confirm against the producers of this struct.
pub d_design: Option<Array2<f64>>,
/// Directional derivative of the block offset along the same direction.
pub d_offset: Array1<f64>,
}
/// Working quantities supplied by a custom family for one block.
///
/// # Observed vs expected information (see response.md Section 3)
///
/// For the outer REML/LAML criterion, the Hessian used in log|H| and trace terms
/// must be the **observed** (actual) Hessian at the mode, not the expected Fisher.
///
/// - `ExactNewton`: provides -nabla^2 log L directly, which is the observed Hessian
/// by construction. This is always correct.
///
/// - `Diagonal`: provides IRLS working weights W such that the per-block Hessian
/// is X'WX. For canonical links (logit-Binomial, log-Poisson), W_obs = W_Fisher.
/// For supported non-canonical diagonal links, W must be the observed weight
/// W_obs = W_Fisher - (y-mu)*B so the outer REML uses the exact Laplace
/// Hessian. The matching [`CustomFamily::diagonalworking_weights_directional_derivative`]
/// callback must differentiate the same observed W surface; silently using Fisher
/// weights or zero `dW` would change the criterion into a PQL-type surrogate.
#[derive(Clone, Debug)]
pub enum BlockWorkingSet {
/// Standard IRLS/GLM-style diagonal working set for eta-space updates.
Diagonal {
/// IRLS pseudo-response for this block's linear predictor.
working_response: Array1<f64>,
/// IRLS working weights for this block (non-negative, length n).
///
/// For the inner solver, Fisher or observed weights both find the same mode.
/// For the outer REML/LAML log|H| term, observed weights are the correct
/// Laplace choice (see response.md Section 3). Canonical-link families need
/// no correction since observed = Fisher.
working_weights: Array1<f64>,
},
/// Exact Newton block update in coefficient space.
///
/// `gradient` is nabla log L wrt block coefficients.
/// `hessian` is -nabla^2 log L wrt block coefficients (positive semidefinite near optimum).
///
/// This is the observed Hessian by construction (actual second derivative of the
/// log-likelihood), which is the correct quantity for the outer REML Laplace
/// approximation.
ExactNewton {
/// Gradient of the log-likelihood with respect to the block coefficients.
gradient: Array1<f64>,
/// Negative Hessian of the log-likelihood with respect to the block
/// coefficients.
hessian: SymmetricMatrix,
},
}
impl BlockWorkingSet {
    /// Build a `Diagonal` working set, rejecting mismatched lengths.
    ///
    /// Enforces `working_response.len() == working_weights.len()` at the type
    /// boundary. New code that produces a diagonal IRLS block should go
    /// through this constructor; the struct-literal form is retained for
    /// existing call sites until they are migrated.
    ///
    /// # Errors
    ///
    /// Returns a dimension-mismatch message when the two lengths differ.
    #[inline]
    pub fn diagonal_checked(
        working_response: Array1<f64>,
        working_weights: Array1<f64>,
    ) -> Result<Self, String> {
        let n_response = working_response.len();
        let n_weights = working_weights.len();
        if n_response == n_weights {
            return Ok(Self::Diagonal {
                working_response,
                working_weights,
            });
        }
        Err(CustomFamilyError::DimensionMismatch {
            reason: format!(
                "BlockWorkingSet::Diagonal length mismatch: working_response={}, working_weights={}",
                n_response, n_weights,
            ),
        }
        .into())
    }
}
/// Outer objective flavour declared by an exact-Newton family.
///
/// `exact_newton_outer_geometry_supports_second_order_solver` reports second-order
/// solver support only for [`ExactNewtonOuterObjective::StrictPseudoLaplace`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ExactNewtonOuterObjective {
/// NOTE(review): presumably a REML objective on a ridge-stabilized quadratic
/// approximation — confirm against the evaluator that consumes this variant.
RidgedQuadraticReml,
/// NOTE(review): presumably the strict pseudo-determinant Laplace objective —
/// confirm against the evaluator that consumes this variant.
StrictPseudoLaplace,
}
/// Highest order of exact outer derivatives a family is willing to expose at
/// the current realized problem scale.
///
/// Variants are declared in increasing order, so the derived `Ord` ranks
/// `Zeroth < First < Second`.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub enum ExactOuterDerivativeOrder {
    /// Objective value only.
    Zeroth,
    /// Objective value and gradient.
    First,
    /// Objective value, gradient, and Hessian.
    Second,
}
impl ExactOuterDerivativeOrder {
    /// True for every order except `Zeroth`.
    pub const fn has_gradient(self) -> bool {
        match self {
            Self::Zeroth => false,
            Self::First | Self::Second => true,
        }
    }
    /// True only for `Second`.
    pub const fn has_hessian(self) -> bool {
        match self {
            Self::Second => true,
            Self::Zeroth | Self::First => false,
        }
    }
}
// Design note for `exact_outer_order_from_capability` (defined further down):
// exact outer derivative order for families that expose second-order
// coefficient geometry.
//
// This used to be a cost gate that demoted large-scale problems to
// first-order BFGS. That was a policy leak into the math layer: if the family
// supplies analytic dense Hessian blocks or an analytic profiled-Hessian HVP,
// the outer optimizer should see the exact second-order objective. Runtime
// representation choices (dense vs operator) belong below this declaration,
// not in a first-order downgrade.

/// Precondition check for the family capability / operator hooks (e.g.
/// `batched_outer_hessian_terms`, `outer_hyper_hessian_operator`).
///
/// These hooks work on whatever block geometry the caller assembled, so only
/// the *consistency* of the specs is checked here. The fit-level "at least one
/// block" precondition belongs to the fit entry points (`validate_blockspecs`).
/// An empty but self-consistent argument set is a valid no-op probe of the
/// operator path (the operator may ignore the specs entirely) and must not
/// panic.
///
/// # Panics
///
/// Panics with `"{context}: inconsistent parameter block specs"` when
/// `validate_blockspec_consistency` rejects `specs`.
fn assert_valid_blockspecs(specs: &[ParameterBlockSpec], context: &str) {
    let specs_are_consistent = validate_blockspec_consistency(specs).is_ok();
    assert!(
        specs_are_consistent,
        "{context}: inconsistent parameter block specs"
    );
}
/// Precondition check on the numeric fields of `BlockwiseFitOptions`.
///
/// `inner_tol`, `outer_tol`, `minweight`, and `ridge_floor` must each be finite
/// and non-negative; `early_exit_threshold`, when set, must be finite.
///
/// # Panics
///
/// Panics on the first violated field, with a message prefixed by `context`.
fn assert_valid_options(options: &BlockwiseFitOptions, context: &str) {
    // Checked in this order so the first offending field is the one reported.
    let non_negative_fields = [
        (options.inner_tol, "inner_tol"),
        (options.outer_tol, "outer_tol"),
        (options.minweight, "minweight"),
        (options.ridge_floor, "ridge_floor"),
    ];
    for (value, field) in non_negative_fields {
        assert!(
            value.is_finite() && value >= 0.0,
            "{context}: {field} must be finite and non-negative"
        );
    }
    match options.early_exit_threshold {
        Some(threshold) => assert!(
            threshold.is_finite(),
            "{context}: early_exit_threshold must be finite"
        ),
        None => {}
    }
}
/// Precondition check that block states are shaped like their specs.
///
/// # Panics
///
/// Panics when the number of states differs from the number of specs, or when
/// any block's `beta` / `eta` length disagrees with its spec.
fn assert_states_match_specs(
    states: &[ParameterBlockState],
    specs: &[ParameterBlockSpec],
    context: &str,
) {
    assert_eq!(
        states.len(),
        specs.len(),
        "{context}: state/spec block count mismatch"
    );
    // Lengths are equal from here on, so indexing `specs` by block is safe.
    for (block, state) in states.iter().enumerate() {
        let spec = &specs[block];
        let n_coefficients = state.beta.len();
        let n_columns = spec.design.ncols();
        assert_eq!(
            n_coefficients, n_columns,
            "{context}: beta length mismatch in block {block}"
        );
        // `state.eta` is produced from `solver_design()` (see
        // `refresh_all_block_etas`), which is `stacked_design` when set
        // (3·n_obs rows for survival LS time-varying blocks) and `design`
        // (n_obs rows) otherwise. Use the same accessor here.
        let n_eta = state.eta.len();
        let n_solver_rows = spec.solver_design().nrows();
        assert_eq!(
            n_eta, n_solver_rows,
            "{context}: eta length mismatch in block {block}"
        );
    }
}
/// Precondition check that there is one list of psi derivatives per block
/// spec.
///
/// # Panics
///
/// Panics when `derivative_blocks.len() != specs.len()`.
fn assert_derivative_blocks_match_specs(
    derivative_blocks: &[Vec<CustomFamilyBlockPsiDerivative>],
    specs: &[ParameterBlockSpec],
    context: &str,
) {
    let n_derivative_blocks = derivative_blocks.len();
    let n_spec_blocks = specs.len();
    assert_eq!(
        n_derivative_blocks, n_spec_blocks,
        "{context}: derivative/spec block count mismatch"
    );
}
/// Precondition check that `rho` has one entry per penalty across all blocks.
///
/// # Panics
///
/// Panics when `rho.len()` differs from the total penalty count of `specs`.
fn assert_rho_matches_specs(rho: &Array1<f64>, specs: &[ParameterBlockSpec], context: &str) {
    let mut expected = 0usize;
    for spec in specs {
        expected += spec.penalties.len();
    }
    let actual = rho.len();
    assert_eq!(
        actual, expected,
        "{context}: rho length does not match penalty count"
    );
}
/// Warm the outer caches of the joint-Hessian workspace, if one is present.
///
/// With no workspace this is a no-op returning `Ok(())`.
///
/// # Errors
///
/// Returns the workspace's warm-up failure, prefixed with `context`.
fn validate_hessian_workspace_ready(
    hessian_workspace: &Option<Arc<dyn ExactNewtonJointHessianWorkspace>>,
    context: &str,
) -> Result<(), String> {
    let Some(workspace) = hessian_workspace.as_ref() else {
        return Ok(());
    };
    match workspace.warm_up_outer_caches() {
        Ok(_) => Ok(()),
        Err(err) => Err(format!(
            "{context}: failed to warm Hessian workspace caches: {err}"
        )),
    }
}
/// Exact outer derivative order for families that expose second-order
/// coefficient geometry.
///
/// Always returns [`ExactOuterDerivativeOrder::Second`]. `coefficient_cost` is
/// accepted for signature compatibility but does not influence the result.
///
/// # Panics
///
/// Panics when `specs` are internally inconsistent.
pub fn exact_outer_order_from_capability(
    specs: &[ParameterBlockSpec],
    coefficient_cost: u64,
) -> ExactOuterDerivativeOrder {
    assert_valid_blockspecs(specs, "exact outer derivative order");
    // The cost no longer gates the declared order.
    let _ = coefficient_cost;
    ExactOuterDerivativeOrder::Second
}
/// Capability-aware variant of [`exact_outer_order_from_capability`].
///
/// Retained as the public declaration helper for existing family impls. It no
/// longer gates by cost: once dense or HVP analytic second-order support is
/// established, the derivative order is `Second`. The HVP flag only selects
/// which context string labels the block-spec precondition check.
///
/// # Panics
///
/// Panics when `specs` are internally inconsistent.
pub fn exact_outer_order_with_outer_hvp(
    specs: &[ParameterBlockSpec],
    coefficient_cost: u64,
    outer_hyper_hessian_hvp_available: bool,
) -> ExactOuterDerivativeOrder {
    if !outer_hyper_hessian_hvp_available {
        return exact_outer_order_from_capability(specs, coefficient_cost);
    }
    assert_valid_blockspecs(specs, "exact outer derivative order with HVP");
    ExactOuterDerivativeOrder::Second
}
/// Realized outer-derivative policy at the current problem size.
///
/// Capability (the family can produce exact second-order calculus) controls
/// whether the Hessian is declared. Runtime cost controls only representation
/// and staging choices below this layer. Large problems must stay on the exact
/// analytic Hessian path and use an operator representation when dense assembly
/// is too expensive; they are not demoted to first-order BFGS here.
///
/// `OuterDerivativePolicy` records the family's *capability*, the *predicted
/// per-eval cost* for both gradient-only and Hessian paths, and exposes the
/// three policy queries the outer optimizer actually needs:
///
/// * [`order_for_evaluation`](Self::order_for_evaluation) — clamp a requested
/// evaluation order against the policy gate.
/// * [`declared_hessian_form`](Self::declared_hessian_form) — what shape the
/// outer-strategy planner should declare to its plan ladder.
/// * [`should_use_staged_kappa`](Self::should_use_staged_kappa) — auto-route
/// the κ optimizer through the pilot/polish schedule at large `n`.
///
/// All thresholds are *const* — no env vars, no CLI flags. The cost model is
/// the family's own `coefficient_gradient_cost` / `coefficient_hessian_cost`
/// scaled by the joint outer-coordinate dimension, with `saturating_mul` so
/// overflow rounds up to the budget ceiling rather than wrapping silently.
#[derive(Clone, Copy, Debug)]
pub struct OuterDerivativePolicy {
/// What exact calculus the family advertises in principle.
pub capability: ExactOuterDerivativeOrder,
/// Predicted per-eval work for one `ValueGradientHessian` evaluation.
/// Rounded conservatively *up* via `saturating_mul`. Informational for
/// representation and diagnostics; it does not disable Hessian capability.
pub predicted_hessian_work: u128,
/// Predicted per-eval work for one `ValueAndGradient` evaluation.
/// Rounded conservatively *up* via `saturating_mul`.
pub predicted_gradient_work: u128,
/// True when the family's outer-only paths consume
/// [`BlockwiseFitOptions::outer_score_subsample`] and produce
/// Horvitz-Thompson-weighted partial sums (i.e. the family overrides
/// `log_likelihood_only_with_options`,
/// `exact_newton_joint_psi_workspace_with_options`, and any other
/// outer-only hooks reached by `evaluate_custom_family_joint_hyper`).
///
/// Determines whether the κ optimizer's pilot/polish staging schedule
/// engages: when this is `false`, [`Self::should_use_staged_kappa`]
/// returns `false` regardless of `n`. Engaging the schedule on a
/// family that ignores the subsample is strictly worse than not
/// engaging it — the schedule builds a `RowSet::Subsample` and the
/// boundary plumbing installs an `OuterScoreSubsample` on options,
/// but the family's default outer-only paths fall back to full-data
/// sums, so the pilot evaluation costs the same as the polish but
/// adds a Vec allocation per eval.
///
/// Families that do **not** consume the subsample (default for new
/// implementations, including the GAMLSS location-scale families
/// today) leave this `false`. Families that do consume (today:
/// `BernoulliMarginalSlopeFamily`) override `outer_derivative_policy`
/// to set this `true`.
pub subsample_capable: bool,
}
impl OuterDerivativePolicy {
    /// Per-eval gradient work ceiling above which the κ schedule switches
    /// to the staged pilot/polish path. At large scale (n ≳ 100 k) even
    /// the gradient sweep takes minutes per outer iter; subsampling the
    /// pilot stage cuts that to seconds and leaves the final polish on
    /// full data to recover the MLE.
    pub const OUTER_GRADIENT_WORK_BUDGET: u128 = 50_000_000_000;
    /// Pilot subsample auto-engages when full-data `n` exceeds this. Below
    /// this the κ schedule collapses to a single full-data stage —
    /// behaviour identical to the pre-P7 path.
    pub const STAGED_KAPPA_TRIGGER_N: usize = 30_000;
    /// Clamp a requested evaluation order against the policy gate.
    ///
    /// * A `ValueGradientHessian` request is honoured only when
    ///   [`declared_hessian_form`](Self::declared_hessian_form) is not
    ///   `Unavailable`; otherwise it is demoted to `ValueAndGradient`.
    /// * A `ValueAndGradient` request is always returned unchanged.
    pub fn order_for_evaluation(
        &self,
        requested: crate::solver::outer_strategy::OuterEvalOrder,
    ) -> crate::solver::outer_strategy::OuterEvalOrder {
        use crate::solver::outer_strategy::{DeclaredHessianForm, OuterEvalOrder};
        // Evaluated lazily: only consulted for Hessian requests.
        let hessian_declared =
            || !matches!(self.declared_hessian_form(), DeclaredHessianForm::Unavailable);
        match requested {
            OuterEvalOrder::ValueGradientHessian if hessian_declared() => {
                OuterEvalOrder::ValueGradientHessian
            }
            OuterEvalOrder::ValueGradientHessian | OuterEvalOrder::ValueAndGradient => {
                OuterEvalOrder::ValueAndGradient
            }
        }
    }
    /// Outer Hessian declaration for the outer-strategy planner.
    ///
    /// Returns `Either` exactly when the capability includes a Hessian, and
    /// `Unavailable` otherwise. Work estimates pick dense vs operator assembly
    /// later and must not erase analytic second-order capability here.
    pub fn declared_hessian_form(&self) -> crate::solver::outer_strategy::DeclaredHessianForm {
        use crate::solver::outer_strategy::DeclaredHessianForm;
        if self.capability.has_hessian() {
            DeclaredHessianForm::Either
        } else {
            DeclaredHessianForm::Unavailable
        }
    }
    /// True when the κ optimizer should auto-route through the staged
    /// pilot/polish schedule.
    ///
    /// Requires `subsample_capable`; without it the pilot subsample would only
    /// add per-eval bookkeeping while the family still sums every row. Given
    /// that, the schedule triggers when **either** the data is big
    /// (`n ≥ STAGED_KAPPA_TRIGGER_N`) **or** the predicted per-eval gradient
    /// work exceeds `OUTER_GRADIENT_WORK_BUDGET`. The second clause catches
    /// moderate-`n` problems with a very wide design (large `p_total` or
    /// `psi_dim`) where one full-data gradient sweep still dominates.
    pub fn should_use_staged_kappa(&self, n: usize) -> bool {
        let data_is_large = n >= Self::STAGED_KAPPA_TRIGGER_N;
        let gradient_is_expensive =
            self.predicted_gradient_work > Self::OUTER_GRADIENT_WORK_BUDGET;
        self.subsample_capable && (data_is_large || gradient_is_expensive)
    }
}
/// Number of outer coordinates assumed by the default policy work model:
/// `rho_dim + psi_dim`, where `rho_dim` is the total penalty count over all
/// blocks. Each outer evaluation propagates one directional derivative per
/// outer coordinate through the inner solve. All additions saturate.
#[inline]
fn outer_coord_dim_for_policy(specs: &[ParameterBlockSpec], psi_dim: usize) -> u128 {
    let mut rho_dim = 0u128;
    for spec in specs {
        rho_dim = rho_dim.saturating_add(spec.penalties.len() as u128);
    }
    rho_dim.saturating_add(psi_dim as u128)
}
/// Default predicted-cost model for [`OuterDerivativePolicy`], returned as
/// `(gradient_work, hessian_work)`:
///
/// * gradient work ≈ `coefficient_gradient_cost · (rho_dim + psi_dim)`
/// * Hessian work ≈ `coefficient_hessian_cost · (rho_dim + psi_dim)`
///
/// Each outer coordinate triggers one analytic directional derivative through
/// the inner solve; the extra `O(p_total)` factor of dense Hessian assembly is
/// already part of `coefficient_hessian_cost`. The coordinate count is floored
/// at one so a problem without outer coordinates still reports the base cost.
///
/// Every multiplication saturates, so an overflow rounds *up* to the ceiling
/// instead of wrapping.
pub fn default_outer_derivative_policy_costs(
    specs: &[ParameterBlockSpec],
    psi_dim: usize,
    grad_cost: u64,
    hess_cost: u64,
) -> (u128, u128) {
    let multiplier = outer_coord_dim_for_policy(specs, psi_dim).max(1);
    let scaled = |cost: u64| u128::from(cost).saturating_mul(multiplier);
    (scaled(grad_cost), scaled(hess_cost))
}
/// Default coefficient-space Hessian cost: `Σ_b n_b · p_b²` over all blocks,
/// where `n_b` and `p_b` are the row and column counts of the block design.
/// Models the work to assemble or apply the dense block-diagonal inner Hessian
/// once. All arithmetic saturates at `u64::MAX`.
pub fn default_coefficient_hessian_cost(specs: &[ParameterBlockSpec]) -> u64 {
    let mut total = 0u64;
    for spec in specs {
        let rows = spec.design.nrows() as u64;
        let cols = spec.design.ncols() as u64;
        let block_cost = rows.saturating_mul(cols.saturating_mul(cols));
        total = total.saturating_add(block_cost);
    }
    total
}
/// Joint-coupled coefficient-space Hessian cost: `n · (Σ_b p_b)²`.
///
/// This is the honest per-evaluation work for a family whose row likelihood
/// couples every block: each observation contributes a rank-`m` outer-product
/// update to the full joint Hessian over `Σ p_b` coefficients. Contrast with
/// the block-diagonal `default_coefficient_hessian_cost`, which assumes each
/// `X_b' W_b X_b` is assembled independently.
///
/// Used by all GAMLSS, marginal-slope, and joint-latent families. CTN does
/// not delegate here — it uses its Khatri–Rao factor dimensions internally.
/// All arithmetic saturates at `u64::MAX`.
pub fn joint_coupled_coefficient_hessian_cost(n: u64, specs: &[ParameterBlockSpec]) -> u64 {
    let mut total_columns = 0u64;
    for spec in specs {
        total_columns = total_columns.saturating_add(spec.design.ncols() as u64);
    }
    let columns_squared = total_columns.saturating_mul(total_columns);
    n.saturating_mul(columns_squared)
}
/// Default coefficient-space gradient cost: half of
/// [`default_coefficient_hessian_cost`] (integer division).
///
/// The first-order analytic gradient in the unified evaluator runs the same
/// inner Newton solve as the second-order path but skips the `K`-fold pairwise
/// Hessian assembly (`B_{j,k}` blocks) and the `K`-fold inner derivative
/// solves, leaving the inner solve plus one gradient-only sweep through the
/// data. Empirically that is roughly half the per-evaluation arithmetic of
/// forming the dense Hessian. Families whose gradient assembly differs
/// structurally (e.g. matrix-free Hv operators with no dense Hessian assembly
/// to halve) should override [`CustomFamily::coefficient_gradient_cost`].
pub fn default_coefficient_gradient_cost(specs: &[ParameterBlockSpec]) -> u64 {
    let hessian_cost = default_coefficient_hessian_cost(specs);
    hessian_cost / 2
}
/// Compute β-block column ranges from a slice of `ParameterBlockSpec`s.
///
/// Produces one `Range<usize>` per spec, locating that spec's columns inside
/// the concatenated β vector (`offset .. offset + p_block` with `p_block =
/// spec.design.ncols()`). Ranges are sorted, non-overlapping, and together
/// cover `0..Σ p_block`.
///
/// This is the canonical source of `block_offsets` for every
/// [`crate::solver::arrow_schur::ArrowSchurSystem`] built for a custom family
/// (survival, GAMLSS, transformation-normal, latent-survival, marginal-slope,
/// …). Pass the result to
/// [`crate::solver::arrow_schur::ArrowSchurSystem::set_block_offsets`] before
/// calling `solve` or `solve_with_options` whenever the system will use
/// [`crate::solver::arrow_schur::ArrowSolverMode::InexactPCG`].
///
/// A spec with zero columns yields a zero-width range; callers that want to
/// skip trivial blocks can filter on `r.start < r.end`.
pub fn block_offsets_from_specs(specs: &[ParameterBlockSpec]) -> Arc<[Range<usize>]> {
    let ranges: Vec<Range<usize>> = specs
        .iter()
        .scan(0usize, |cursor, spec| {
            let start = *cursor;
            let end = start + spec.design.ncols();
            *cursor = end;
            Some(start..end)
        })
        .collect();
    Arc::from(ranges)
}
/// Bound first-order outer iterations when each analytic-gradient evaluation is
/// already large-scale work. This is only applied after the planner has
/// selected a gradient-only route; second-order/ARC plans keep their requested
/// iteration budget.
///
/// Returns `requested` unchanged when an outer Hessian is available, when
/// `requested <= 1`, or when `coefficient_gradient_cost` is zero. Otherwise the
/// result is `requested` capped at the number of gradient evaluations that fit
/// in the fixed work budget, with that cap never dropping below four.
pub fn cost_gated_first_order_max_iter(
    requested: usize,
    coefficient_gradient_cost: u64,
    has_outer_hessian: bool,
) -> usize {
    const FIRST_ORDER_OUTER_WORK_BUDGET: u64 = 80_000_000_000;
    const MIN_FIRST_ORDER_ITERS: usize = 4;
    if has_outer_hessian || requested <= 1 || coefficient_gradient_cost == 0 {
        return requested;
    }
    // Checked conversion: on targets where `usize` is narrower than 64 bits a
    // plain `as` cast would truncate a large quotient to an arbitrary small
    // value and throttle cheap problems. Saturate instead.
    let affordable = usize::try_from(FIRST_ORDER_OUTER_WORK_BUDGET / coefficient_gradient_cost)
        .unwrap_or(usize::MAX);
    requested.min(affordable.max(MIN_FIRST_ORDER_ITERS))
}
/// Local trust budget for first-order outer BFGS on log-smoothing parameters.
///
/// Returns `None` (no cap) when an outer Hessian is available and `Some(5.0)`
/// otherwise.
///
/// One unit in `rho = log(lambda)` is an `e`-fold smoothing-parameter change.
/// The cap used to be `1.0`, which throttled BFGS to ~1/5 of its quasi-Newton
/// step on flat REML surfaces (the natural BFGS direction has `|d|_inf` of ~5
/// in log-λ for large-scale survival fits). Probes whose `step_inf > cap` are
/// rejected for free in `OuterFirstOrderBridge::eval_cost` (returning
/// `BFGS_LINE_SEARCH_REJECT_COST` without running an inner solve), so a larger
/// cap costs nothing on rejection — it only lets Strong-Wolfe accept bigger
/// steps that the inner-PIRLS divergence guard can already validate. `5.0`
/// allows up to `e^5 ≈ 148`-fold smoothing-parameter change per accepted outer
/// iter, which matches the typical quasi-Newton direction magnitude while
/// still bounding pathological probes.
pub const fn first_order_bfgs_loglambda_step_cap(has_outer_hessian: bool) -> Option<f64> {
    const FIRST_ORDER_STEP_CAP: f64 = 5.0;
    match has_outer_hessian {
        true => None,
        false => Some(FIRST_ORDER_STEP_CAP),
    }
}
/// True when the family's exact-Newton outer objective is
/// `StrictPseudoLaplace`, the only objective for which the outer geometry is
/// reported as supporting a second-order solver.
pub(crate) fn exact_newton_outer_geometry_supports_second_order_solver<F: CustomFamily + ?Sized>(
    family: &F,
) -> bool {
    matches!(
        family.exact_newton_outerobjective(),
        ExactNewtonOuterObjective::StrictPseudoLaplace
    )
}
/// Family evaluation over all parameter blocks.
#[derive(Clone, Debug)]
pub struct FamilyEvaluation {
/// Log-likelihood reported by the family for this evaluation.
pub log_likelihood: f64,
/// Working sets supplied by the family.
///
/// NOTE(review): presumably one entry per parameter block, in block order —
/// confirm against the solver that consumes this vector.
pub blockworking_sets: Vec<BlockWorkingSet>,
}
/// Log-likelihood together with a joint gradient vector.
///
/// NOTE(review): presumably the gradient is taken with respect to the
/// concatenated coefficients of all blocks — confirm against the trait hook
/// that returns this type.
pub struct ExactNewtonJointGradientEvaluation {
/// Log-likelihood value accompanying the gradient.
pub log_likelihood: f64,
/// Joint gradient vector.
pub gradient: Array1<f64>,
}
/// Exact profiled outer Hessian produced by a family in one amortized
/// evaluation.
///
/// NOTE(review): presumably the payload of the `batched_outer_hessian_terms`
/// family hook — confirm against the trait definition.
pub struct BatchedOuterHessianTerms {
/// Exact profiled outer Hessian over θ = (ρ, ψ), assembled or exposed in
/// operator form by the family in one amortized evaluation.
pub outer_hessian: crate::solver::outer_strategy::HessianResult,
}
/// Batched per-θ_j contributions to the analytic outer gradient.
///
/// Used by [`CustomFamily::batched_outer_gradient_terms`] to amortize the
/// joint-Hessian factorization across all K hyperparameters: instead of
/// computing each `tr(H⁻¹ · Ḣ_j)` independently (K independent solves), the
/// family factors `H` once, computes per-row leverages `L_i = Z_i H⁻¹ Z_iᵀ`,
/// and accumulates all K traces in a single streaming pass.
///
/// All three vectors have length equal to the total number of outer
/// hyperparameters (K = `rho.len() + Σ derivative_blocks[b].len()`), in the
/// same coordinate order as the unified evaluator's gradient: ρ-coords first,
/// ψ-coords appended.
///
/// # Assembly formula
///
/// The caller assembles the outer gradient as
///
/// ```text
/// grad[j] = objective_theta[j]
/// + 0.5 * trace_h_inv_hdot[j]
/// - 0.5 * trace_s_pinv_sdot[j]
/// ```
///
/// matching the three-term convention in [`outer_gradient_entry`] (penalty +
/// trace − det).
pub struct BatchedOuterGradientTerms {
/// Explicit ∂J/∂θ_j contributions evaluated at the converged β̂ holding
/// β fixed (i.e. the part that does NOT flow through H or S):
///
/// * For ρ-coords: `½ β̂ᵀ A_k β̂` (penalty quadratic).
/// * For ψ-coords: `V_i^explicit + g_i^explicit · β̂` style contributions.
pub objective_theta: Array1<f64>,
/// `tr(H⁻¹ · ∂H/∂θ_j)` for each j, with H = -∇²log L + S the full
/// penalized Hessian at the mode.
pub trace_h_inv_hdot: Array1<f64>,
/// `tr(S⁺ · ∂S/∂θ_j)` for each j (penalty pseudo-logdet first derivative).
pub trace_s_pinv_sdot: Array1<f64>,
}
/// User-defined family contract for multi-block generalized models.
pub trait CustomFamily {
/// Fingerprint, owned by the family, that keys persistent coefficient
/// warm-starts.
///
/// The generic block specs carry design matrices, offsets, penalties, and
/// dimensions, but by design they know nothing about the response vector or
/// other likelihood-side data held on `Self`. Because reusing β across
/// different responses is mathematically unsafe, persistent block-level
/// warm-starts are enabled only for families that return a fingerprint of
/// the data defining their likelihood. The outer ρ cache is independent of
/// this hook and stays available through
/// `BlockwiseFitOptions::cache_session`.
///
/// The default validates `specs` and `options` and then returns `None`.
fn persistent_warm_start_fingerprint(
    &self,
    specs: &[ParameterBlockSpec],
    options: &BlockwiseFitOptions,
) -> Option<String> {
    let context = "persistent warm-start fingerprint";
    assert_valid_blockspecs(specs, context);
    assert_valid_options(options, context);
    None
}
/// Evaluate log-likelihood and per-block working quantities at current block predictors.
///
/// This method has no default implementation. The default
/// [`Self::log_likelihood_only`] forwards to it and keeps only the
/// log-likelihood.
///
/// # Errors
///
/// Failures are reported as a `String` message.
fn evaluate(&self, block_states: &[ParameterBlockState]) -> Result<FamilyEvaluation, String>;
/// Log-likelihood value alone, with no working sets handed back.
///
/// Backtracking line searches only need the objective value, so this entry
/// point lets a family avoid the O(n × blocks) cost of assembling IRLS
/// working weights and responses that would be thrown away immediately.
///
/// The default simply runs [`Self::evaluate`] and drops everything except
/// the log-likelihood, so it saves nothing by itself. Families whose
/// working-set assembly is expensive should override it for a significant
/// speedup.
fn log_likelihood_only(&self, block_states: &[ParameterBlockState]) -> Result<f64, String> {
    let evaluation = self.evaluate(block_states)?;
    Ok(evaluation.log_likelihood)
}
/// Options-aware log-likelihood evaluation for line search.
///
/// Default validates `options` and then forwards to
/// [`Self::log_likelihood_only`]; the options do not otherwise influence
/// the returned value.
/// Families that consult `options.outer_score_subsample` (or other
/// per-call options that affect the LL value) must override this so the
/// joint-Newton line search and the post-accept gradient reload agree
/// on which row subset is being evaluated. Large-scale outer-only
/// callers (including the joint-Newton line-search screening path) can
/// override this to evaluate a deterministic paired Horvitz-Thompson
/// estimate without constructing a full exact-Newton workspace.
fn log_likelihood_only_with_options(
&self,
block_states: &[ParameterBlockState],
options: &BlockwiseFitOptions,
) -> Result<f64, String> {
assert_valid_options(options, "log_likelihood_only_with_options");
self.log_likelihood_only(block_states)
}
/// Whether [`Self::log_likelihood_only_with_options`] can use
/// `BlockwiseFitOptions::early_exit_threshold` to reject line-search trials
/// without computing the full log-likelihood.
///
/// Default: `false`.
fn supports_log_likelihood_early_exit(&self) -> bool {
false
}
/// Selects the outer objective semantics for exact-Newton families.
///
/// `RidgedQuadraticReml` is the explicit ridged surrogate REML surface:
///
/// ```text
/// -loglik + penalty + 0.5 (log|H| - log|S|_+)
/// ```
///
/// The determinant terms in this mode are evaluated on the stabilized
/// curvature surface declared by `ridge_policy`, so this objective is an
/// explicitly modified surrogate rather than an exact Laplace expansion
/// at an indefinite Hessian.
///
/// `StrictPseudoLaplace` is the exact-mode pseudo-Laplace surface used by the
/// Charbonnier spatial family:
///
/// ```text
/// -loglik + penalty + 0.5 log|H|
/// ```
///
/// The latter deliberately omits the quadratic-only `-0.5 log|S|_+`
/// normalization term because there is no tractable exact analogue for the
/// nonquadratic prior without introducing the intractable prior normalizer.
///
/// Default: [`ExactNewtonOuterObjective::RidgedQuadraticReml`].
fn exact_newton_outerobjective(&self) -> ExactNewtonOuterObjective {
ExactNewtonOuterObjective::RidgedQuadraticReml
}
/// Reports whether the joint likelihood Hessian H_L varies with β.
///
/// A `true` answer makes the unified evaluator add the moving-design drift
/// correction M_j[u] = D_β B_j[u] for ψ coordinates and set
/// `HyperCoord::b_depends_on_beta = true`.
///
/// The default is derived from [`Self::exact_newton_outerobjective`]: it is
/// `false` for `RidgedQuadraticReml` and `true` otherwise (i.e. for
/// `StrictPseudoLaplace`). Gaussian location-scale must override this to
/// `true`, because its joint Hessian depends on β even though its outer
/// objective is `RidgedQuadraticReml`.
fn exact_newton_joint_hessian_beta_dependent(&self) -> bool {
    let objective = self.exact_newton_outerobjective();
    objective != ExactNewtonOuterObjective::RidgedQuadraticReml
}
/// Whether the outer REML/LAML logdet term `½ log|H + Sλ|` and its analytic
/// trace gradient `½ tr((H+Sλ)⁺ ∂Sλ)` are evaluated over the FULL
/// identifiable subspace `range(H + Sλ)` (mgcv's generalized determinant,
/// gam#752) rather than the penalty-range subspace `range(Sλ)`.
///
/// # Why this is separate from β-dependence
///
/// This is a value/gradient SUBSPACE-CONSISTENCY concern, orthogonal to
/// whether the Hessian depends on β
/// ([`Self::exact_newton_joint_hessian_beta_dependent`], which gates the
/// *drift* corrections). The previous code conflated the two
/// by gating the projected logdet on β-dependence, so `RidgedQuadraticReml`
/// families (survival/bernoulli marginal-slope) silently used the
/// `range(Sλ)`-only determinant: on a near-collinear penalty-null trend (the
/// clustered-PC matern marginal-slope geometry) that DROPS the penalty-null
/// likelihood determinant `log|U_kᵀ H U_k|` from the value while
/// `½ log|Sλ|₊` is correctly over `range(Sλ)`, making the ρ-derivative of the
/// REML criterion inconsistent. The outer optimizer then drives that block's
/// λ → ∞ and the envelope gradient (valid only at a stationary β̂) freezes —
/// the constant-‖g‖ outer stall in gam#808/#787.
///
/// # Default
///
/// The generalized determinant is the correct objective in ALL cases: when
/// `H + Sλ` is full rank it equals the ordinary logdet (the projection is a
/// no-op, so the correction is ≈0), and when it is rank-deficient it drops
/// only the truly unidentified `ker(H) ∩ ker(Sλ)` directions — exactly the
/// directions `½ log|Sλ|₊` also omits, keeping value and gradient over one
/// subspace. Always enabled by default (the default returns `true`).
fn use_projected_penalty_logdet(&self) -> bool {
true
}
/// Per-evaluation arithmetic cost of forming or applying the inner
/// coefficient-space Hessian once, in flop-equivalent units. This is used
/// for diagnostics, seed-budget policy, and first-order iteration caps
/// when a family genuinely lacks analytic second-order support. It is not
/// allowed to hide an analytic Hessian from the outer optimizer.
///
/// Within this trait the value also feeds the defaults of
/// [`Self::coefficient_gradient_cost`], [`Self::exact_outer_derivative_order`]
/// and [`Self::outer_derivative_policy`].
///
/// The default returns `Σ_b n_b · p_b²` via [`default_coefficient_hessian_cost`],
/// which is the honest assembly cost only when the joint Hessian is
/// **block-diagonal** — i.e. the inner solver assembles each block's
/// `X_b' W_b X_b` independently, with no cross-block coupling per row.
/// Families whose row likelihood couples all blocks (every row contributes
/// a rank-`m` outer-product update to the full joint Hessian over
/// `Σ p_b` coefficients) **must** override and delegate to
/// [`joint_coupled_coefficient_hessian_cost`] (or the equivalent factored
/// form for tensor designs), otherwise the default undercounts the
/// cross-block outer-product terms `2·Σ_{a<b} n·p_a·p_b`.
///
/// Concretely:
///
/// * **Block-diagonal** (default OK): `LatentBinaryFamily` collects
/// separate `hess_time` and `hess_mean` per row, never forming an
/// off-diagonal contribution.
/// * **Joint-coupled** (override via [`joint_coupled_coefficient_hessian_cost`]):
/// GAMLSS location-scale, GAMLSS wiggle variants, marginal-slope families
/// (Bernoulli, Survival), `LatentSurvivalFamily`,
/// `SurvivalLocationScaleFamily` — every row contributes to the full
/// `(Σ p_b)²` joint Hessian via Jacobian pullback of a multi-dimensional
/// primary kernel.
/// * **Single-block** (default OK): tensor designs whose `design.ncols()`
/// already equals `p_total` (e.g. CTN's Khatri–Rao `n × (p_resp·p_cov)`);
/// `n · p²` reduces correctly to `n · p_resp² · p_cov²`.
/// * **Matrix-free Hessian operator**: families that expose
/// [`Self::exact_newton_joint_hessian_workspace`] with operator-form
/// directional derivatives (CTN at large scale) may instead return
/// the per-`Hv` matvec cost (e.g. `n·(p_resp + p_cov)` for Khatri–Rao)
/// so the gate reflects the operator path rather than the dense
/// build that the unified evaluator skips.
fn coefficient_hessian_cost(&self, specs: &[ParameterBlockSpec]) -> u64 {
default_coefficient_hessian_cost(specs)
}
/// Flop-equivalent arithmetic cost of a single analytic-gradient outer
/// evaluation. Consulted only when the family genuinely has no analytic
/// outer Hessian, so the planner has to fall back to a first-order
/// optimizer.
///
/// By default this is half of [`Self::coefficient_hessian_cost`] (integer
/// division; see also [`default_coefficient_gradient_cost`]). Families whose
/// gradient assembly is structurally different should override it. Note that
/// joint-coupled families which override `coefficient_hessian_cost` with
/// `joint_coupled_coefficient_hessian_cost(n, specs)` pick up the matching
/// gradient cost through this default automatically, so the GAMLSS /
/// marginal-slope / joint-latent path needs no per-family override.
fn coefficient_gradient_cost(&self, specs: &[ParameterBlockSpec]) -> u64 {
    let hessian_cost = self.coefficient_hessian_cost(specs);
    hessian_cost / 2
}
/// Highest order of exact outer calculus the family exposes for the current
/// realized problem size.
///
/// By default exact second-order calculus is exposed whenever the family
/// advertises dense outer Hessian blocks
/// ([`Self::outer_hyper_hessian_dense_available`]) or profiled outer-Hessian
/// HVPs ([`Self::outer_hyper_hessian_hvp_available`]); when neither is
/// advertised the result is [`ExactOuterDerivativeOrder::First`]. Large
/// problems must stay exact and select an operator representation; they are
/// not demoted to first-order optimizers.
///
/// **Capability vs representation.** The value reported here is the highest
/// analytic order this family implements. The realized policy carries work
/// estimates for dense/operator routing and staged κ schedules, but those
/// estimates do not downgrade a second-order family to a first-order
/// optimizer.
fn exact_outer_derivative_order(
    &self,
    specs: &[ParameterBlockSpec],
    options: &BlockwiseFitOptions,
) -> ExactOuterDerivativeOrder {
    assert!(std::mem::size_of_val(options) > 0);
    // Work estimate: the larger of the Hessian and gradient costs.
    let hessian_cost = self.coefficient_hessian_cost(specs);
    let gradient_cost = self.coefficient_gradient_cost(specs);
    let coefficient_work = hessian_cost.max(gradient_cost);
    let second_order_advertised = self.outer_hyper_hessian_dense_available(specs)
        || self.outer_hyper_hessian_hvp_available(specs);
    if second_order_advertised {
        let has_outer_hvp = self.outer_hyper_hessian_hvp_available(specs);
        exact_outer_order_with_outer_hvp(specs, coefficient_work, has_outer_hvp)
    } else {
        ExactOuterDerivativeOrder::First
    }
}
/// Outer-derivative policy realized at the current problem size.
///
/// The policy pairs the capability reported by
/// [`Self::exact_outer_derivative_order`] with predicted per-evaluation
/// costs built from [`Self::coefficient_gradient_cost`],
/// [`Self::coefficient_hessian_cost`] and the joint outer-coordinate
/// dimension `rho_dim + psi_dim`. Capability alone decides the derivative
/// order; the predicted costs only inform dense/operator routing and staged
/// κ schedules.
///
/// Families with non-generic cost models (Khatri–Rao CTN, matrix-free HVP
/// families, marginal-slope row-third workloads) should override this
/// method directly and fill the `predicted_*_work` fields from their own
/// cost model. The default uses the generic
/// `n × (rho_dim + psi_dim) × p_total` shape via
/// [`default_outer_derivative_policy_costs`], and takes `subsample_capable`
/// from [`Self::outer_derivative_subsample_capable`].
fn outer_derivative_policy(
    &self,
    specs: &[ParameterBlockSpec],
    psi_dim: usize,
    options: &BlockwiseFitOptions,
) -> OuterDerivativePolicy {
    let capability = self.exact_outer_derivative_order(specs, options);
    let gradient_cost = self.coefficient_gradient_cost(specs);
    let hessian_cost = self.coefficient_hessian_cost(specs);
    let predicted =
        default_outer_derivative_policy_costs(specs, psi_dim, gradient_cost, hessian_cost);
    let (predicted_gradient_work, predicted_hessian_work) = predicted;
    let subsample_capable = self.outer_derivative_subsample_capable();
    OuterDerivativePolicy {
        capability,
        predicted_gradient_work,
        predicted_hessian_work,
        subsample_capable,
    }
}
/// Whether this family's outer-only paths honour HT-weighted partial sums
/// over `options.outer_score_subsample`.
///
/// Default `false`: the trait's default outer-only paths
/// ([`Self::log_likelihood_only_with_options`],
/// `exact_newton_joint_psi_workspace_with_options`, ...) forward to the
/// no-options variants and ignore `outer_score_subsample`. Families that
/// override those hooks to honour HT-weighted partial sums should override
/// this hook to return `true`; the default [`Self::outer_derivative_policy`]
/// then threads the flag into the emitted [`OuterDerivativePolicy`]
/// (as its `subsample_capable` field).
fn outer_derivative_subsample_capable(&self) -> bool {
false
}
/// Outer seeding policy chosen by the family.
///
/// The default keeps the generic custom-family behavior: with no outer
/// parameters the stock `SeedConfig::default()` is returned untouched;
/// otherwise the default config is adjusted to 6 seeds for up to four
/// parameters (4 seeds beyond that), a seed budget of 1, and at most 2
/// inner iterations while screening. Families that have a strong warm start
/// can override this so seed screening does not dominate the fit.
fn outer_seed_config(&self, n_params: usize) -> crate::seeding::SeedConfig {
    let mut config = crate::seeding::SeedConfig::default();
    if n_params > 0 {
        config.max_seeds = match n_params {
            1..=4 => 6,
            _ => 4,
        };
        config.seed_budget = 1;
        config.screen_max_inner_iterations = 2;
    }
    config
}
/// Whether outer hyper-derivative evaluation must use a joint exact path.
///
/// Default `false` allows the generic blockwise diagonal fallback when a
/// family does not provide joint exact curvature.
///
/// Families with coupled multi-block likelihoods can override this to
/// prevent the outer code from silently evaluating a mathematically
/// invalid block-local surrogate. The failure mode is:
///
/// 1. the outer derivative still has block-local forcing
/// `g_k = A_k beta`
/// because `rho_k` enters only through the penalty;
/// 2. but the fitted mode response is not block-local,
/// `H u_k = -g_k`,
/// because the likelihood Hessian has off-diagonal block coupling;
/// 3. therefore a blockwise solve
/// `H_b u_{k,b} = -(A_k beta)_b`
/// is not the derivative of the profiled objective the code claims to
/// be optimizing.
///
/// When this flag is `true`, the family is asserting that any outer
/// hyper-derivative path must first obtain the full joint exact curvature
/// before it can return a mathematically valid result.
fn requires_joint_outer_hyper_path(&self) -> bool {
false
}
/// Per-block output-channel assignment for the identifiability audit.
///
/// Multi-parameter families (Dirichlet, beta, Gaussian/binomial
/// location-scale, multinomial, …) drive several *independent* linear
/// predictors `η_r = X_r β_r`, one per distributional parameter / class.
/// Each [`ParameterBlockSpec`] feeds exactly one of those output channels.
/// When two blocks share the same covariate basis (e.g. every Dirichlet
/// component uses the same `[1 | B]`), their columns are *not* gauge
/// aliases — they are block-diagonal entries of the true joint Jacobian
/// `blkdiag(X_0, …, X_{m-1})`, full rank `Σ p_b`.
///
/// The pre-fit identifiability audit can only see this block-diagonal
/// structure through the **channel-aware** route, which requires each
/// block to carry a multi-output `jacobian_callback` (n_outputs > 1).
/// Families built via the canonical helpers (`build_location_scale_block`,
/// `MultinomialFamily::build_block_specs`) wire that callback themselves;
/// families fit through the low-level `fit_custom_family` API with
/// hand-built specs do not, and the flat audit then mistakes the repeated
/// shared basis for cross-block aliases and refuses a well-posed fit
/// (issues #319 / #363 / #558).
///
/// Returning `Some(channels)` — a vector of length `specs.len()` giving the
/// zero-based output channel each block drives — lets `fit_custom_family`
/// install the appropriate [`AdditiveBlockJacobian`] on any block that
/// lacks an explicit callback, so the audit routes channel-aware
/// automatically. The total channel count is the largest entry of
/// `channels` plus one.
///
/// Default: `Some` vector of `specs.len()` zeros, i.e. every block drives
/// output channel 0. `wire_output_channels`
/// recognizes this as the single-output flat route and leaves specs unchanged.
///
/// When `Some`, the returned vector MUST have length equal to the number
/// of blocks; `fit_custom_family` surfaces a structured error otherwise.
// NOTE(review): how callers treat a `None` return is not visible in this part of the file — confirm.
fn output_channel_assignment(&self, specs: &[ParameterBlockSpec]) -> Option<Vec<usize>> {
Some(vec![0; specs.len()])
}
/// Hook for blocks whose effective design matrix and offset depend on the
/// current values of other blocks.
///
/// Returns the `(design, offset)` pair to use for `spec`. The default is
/// static: it hands back clones of `spec.design` and `spec.offset` and does
/// not read the block states beyond a length sanity check.
fn block_geometry(
    &self,
    block_states: &[ParameterBlockState],
    spec: &ParameterBlockSpec,
) -> Result<(DesignMatrix, Array1<f64>), String> {
    assert!(block_states.len() <= isize::MAX as usize);
    let design = spec.design.clone();
    let offset = spec.offset.clone();
    Ok((design, offset))
}
/// Whether [`Self::block_geometry`] can change with the current block state.
///
/// The default implementation is static (returns `false`): the effective
/// geometry is just the stored `spec.design/spec.offset`, so the fit engine
/// can use those references directly without repeatedly cloning dense
/// matrices.
///
/// Families that override `block_geometry(...)` with state-dependent
/// behavior must override this to return `true`.
fn block_geometry_is_dynamic(&self) -> bool {
false
}
/// Optional directional derivative of the effective block geometry wrt the
/// current block coefficients.
///
/// The direction called `d_beta` below is passed as the `arr` argument.
///
/// For a block with effective predictor
///
/// ```text
/// eta(beta) = X(beta) beta + o(beta),
/// ```
///
/// the directional derivative along `d_beta` is
///
/// ```text
/// D eta[d_beta] = X d_beta + (D X[d_beta]) beta + D o[d_beta].
/// ```
///
/// For diagonal working-set REML derivatives this contributes to both:
///
/// ```text
/// D H[d_beta]
/// = (D X[d_beta])^T W X
/// + X^T W (D X[d_beta])
/// + X^T diag(D w[D eta[d_beta]]) X,
/// ```
///
/// and to the predictor drift fed into the weight directional derivative.
///
/// Default `None` means the family is declaring that the current block's
/// geometry has no coefficient-dependent drift beyond the base `X d_beta`
/// term. Families with dynamic `block_geometry` must implement this hook
/// when that declaration is false.
///
/// # Panics
///
/// The default implementation panics if `block_spec.name` is empty or if
/// `arr` contains a NaN.
fn block_geometry_directional_derivative(
&self,
block_states: &[ParameterBlockState],
idx: usize,
block_spec: &ParameterBlockSpec,
arr: &Array1<f64>,
) -> Result<Option<BlockGeometryDirectionalDerivative>, String> {
assert!(block_states.len() <= isize::MAX as usize);
assert!(idx < usize::MAX);
assert!(!block_spec.name.is_empty());
assert!(arr.iter().all(|v| !v.is_nan()));
Ok(None)
}
/// Optional per-block coefficient projection applied after each block update.
///
/// Receives the block's updated coefficients by value and returns the
/// coefficients to keep. The default returns `beta` unchanged.
///
/// # Panics
///
/// The default implementation panics if `block_spec.name` is empty.
fn post_update_block_beta(
&self,
block_states: &[ParameterBlockState],
idx: usize,
block_spec: &ParameterBlockSpec,
beta: Array1<f64>,
) -> Result<Array1<f64>, String> {
assert!(block_states.len() <= isize::MAX as usize);
assert!(idx < usize::MAX);
assert!(!block_spec.name.is_empty());
Ok(beta)
}
/// Optional barrier-aware maximum feasible step size for a block update.
///
/// Given the current block state and a proposed step direction `delta`
/// (passed as the `arr` argument),
/// returns `Some(alpha_max)` where `alpha_max` is the largest step size
/// in `(0, 1]` such that `beta + alpha_max * delta` remains strictly
/// feasible with respect to any implicit barrier in the likelihood.
///
/// Families whose log-likelihood contains natural log-barrier terms
/// (e.g. `log(h')` in transformation-normal) should implement this to
/// prevent the line search from evaluating the likelihood at infeasible
/// points. A fraction-to-boundary safety factor (e.g. 0.995) should be
/// applied internally.
///
/// Returns `None` if no barrier constraint applies (the default).
///
/// # Panics
///
/// The default implementation panics if `arr` contains a NaN.
fn max_feasible_step_size(
&self,
block_states: &[ParameterBlockState],
idx: usize,
arr: &Array1<f64>,
) -> Result<Option<f64>, String> {
assert!(block_states.len() <= isize::MAX as usize);
assert!(idx < usize::MAX);
assert!(arr.iter().all(|v| !v.is_nan()));
Ok(None)
}
/// Optional linear inequality constraints for a block update:
/// `A * beta_block >= b`.
///
/// Default `None`: no constraints are declared for the block.
///
/// # Panics
///
/// The default implementation panics if `block_spec.name` is empty.
fn block_linear_constraints(
&self,
block_states: &[ParameterBlockState],
idx: usize,
block_spec: &ParameterBlockSpec,
) -> Result<Option<LinearInequalityConstraints>, String> {
assert!(block_states.len() <= isize::MAX as usize);
assert!(idx < usize::MAX);
assert!(!block_spec.name.is_empty());
Ok(None)
}
/// Optional exact directional derivative of a block's ExactNewton Hessian.
///
/// Returns `Some(dH)` where:
/// - `dH` is the directional derivative of the block Hessian with respect to
/// the provided coefficient-space direction `d_beta` (the `arr` argument)
/// at current state.
/// - shape is `(p_block, p_block)`.
///
/// Default `None` means no exact directional Hessian drift is available.
/// Exact REML/LAML derivative paths that require this term should treat
/// `None` as unavailable rather than silently substituting zero.
///
/// # Panics
///
/// The default implementation panics if `arr` contains a NaN.
// NOTE(review): `idx` presumably selects the block within `block_states` — confirm against callers.
fn exact_newton_hessian_directional_derivative(
&self,
block_states: &[ParameterBlockState],
idx: usize,
arr: &Array1<f64>,
) -> Result<Option<Array2<f64>>, String> {
assert!(block_states.len() <= isize::MAX as usize);
assert!(idx < usize::MAX);
assert!(arr.iter().all(|v| !v.is_nan()));
Ok(None)
}
/// Optional exact second directional derivative of a block's ExactNewton Hessian.
///
/// Returns `Some(d2H)` where:
/// - `d2H` is `D²_beta H_L[u, v]` for the provided block-local
/// coefficient-space directions (the `arr` and `arr2` arguments).
/// - shape is `(p_block, p_block)`.
///
/// Generic single-block REML/LAML Hessian evaluation requires this term for
/// `BlockWorkingSet::ExactNewton` blocks; `None` means the exact second
/// Hessian drift is unavailable. The default returns `None`.
///
/// # Panics
///
/// The default implementation panics if `arr` or `arr2` contains a NaN.
// NOTE(review): presumably `arr` is `u` and `arr2` is `v` — confirm against callers.
fn exact_newton_hessian_second_directional_derivative(
&self,
block_states: &[ParameterBlockState],
idx: usize,
arr: &Array1<f64>,
arr2: &Array1<f64>,
) -> Result<Option<Array2<f64>>, String> {
assert!(block_states.len() <= isize::MAX as usize);
assert!(idx < usize::MAX);
assert!(arr.iter().all(|v| !v.is_nan()));
assert!(arr2.iter().all(|v| !v.is_nan()));
Ok(None)
}
/// Optional exact joint coefficient-space Hessian across all blocks.
///
/// Returns the unpenalized matrix `H_L = -nabla^2 log L` in the flattened block order.
///
/// This is the **observed** (actual) Hessian of the log-likelihood at the mode,
/// NOT the expected Fisher information. The outer REML/LAML evaluator requires
/// the observed Hessian for the exact Laplace approximation (see response.md
/// Section 3). Since this method returns the actual second derivative of log L,
/// it is correct by construction.
///
/// For families using `BlockWorkingSet::Diagonal` (IRLS-style updates), the
/// per-block Hessian is `X'WX` where `W` is the working weight. For canonical links
/// `W_obs = W_Fisher`, but for non-canonical links the working weight should include
/// the observed-information correction `W_obs = W_Fisher - (y-mu)*B`.
///
/// The default delegates to `exact_newton_joint_hessian_from_exact_blocks`.
fn exact_newton_joint_hessian(
&self,
block_states: &[ParameterBlockState],
) -> Result<Option<Array2<f64>>, String> {
// Default block-diagonal assembly from per-block ExactNewton hessians.
// This is the inner-fit-side default and is *intentionally* not gated
// by `likelihood_blocks_uncoupled()`: the inner joint-Newton loop only
// uses this Hessian as a Newton-direction surrogate that is
// immediately validated by the line-search + objective decrease, so
// even if the family is coupled, an under-resolved block-diagonal
// direction will simply backtrack instead of corrupting the outer
// REML score. The strict coupling gate lives one layer up, on
// `exact_newton_joint_hessian_with_specs`, where outer REML trace
// algebra would silently produce wrong answers from a missing
// cross-block term.
exact_newton_joint_hessian_from_exact_blocks(self, block_states)
}
/// Optional exact joint log-likelihood / score evaluation in flattened
/// coefficient space without building per-block Hessian working sets.
///
/// Default `None`: the family provides no such evaluation. The default
/// does not inspect `block_states` or `block_specs` beyond a length sanity
/// check.
fn exact_newton_joint_gradient_evaluation(
&self,
block_states: &[ParameterBlockState],
block_specs: &[ParameterBlockSpec],
) -> Result<Option<ExactNewtonJointGradientEvaluation>, String> {
assert!(block_states.len() <= isize::MAX as usize);
assert!(block_specs.len() <= isize::MAX as usize);
Ok(None)
}
/// Optional exact directional derivative of the joint coefficient-space Hessian.
///
/// Returns `Some(dH)` where `dH` is the directional derivative of the
/// unpenalized joint Hessian `H = -∇² log L` along the flattened
/// coefficient-space direction `d_beta_flat`.
///
/// The default delegates to
/// `exact_newton_joint_hessian_directional_derivative_from_blocks`.
fn exact_newton_joint_hessian_directional_derivative(
&self,
block_states: &[ParameterBlockState],
d_beta_flat: &Array1<f64>,
) -> Result<Option<Array2<f64>>, String> {
exact_newton_joint_hessian_directional_derivative_from_blocks(
self,
block_states,
d_beta_flat,
)
}
/// Optional exact second directional derivative of the joint Hessian.
///
/// Returns `Some(d2H)` where `d2H` is:
///
/// ```text
/// D²H[u, v] = d/dε d/dδ H(beta + εu + δv) |_{ε=δ=0}
/// ```
///
/// for flattened coefficient-space directions `u = d_beta_u_flat`,
/// `v = d_betav_flat`.
///
/// The default delegates to
/// `exact_newton_joint_hessiansecond_directional_derivative_from_blocks`.
fn exact_newton_joint_hessiansecond_directional_derivative(
&self,
block_states: &[ParameterBlockState],
d_beta_u_flat: &Array1<f64>,
d_betav_flat: &Array1<f64>,
) -> Result<Option<Array2<f64>>, String> {
exact_newton_joint_hessiansecond_directional_derivative_from_blocks(
self,
block_states,
d_beta_u_flat,
d_betav_flat,
)
}
/// Optional per-evaluation workspace for exact joint Hessian operators and
/// directional derivatives.
///
/// Families with expensive cache construction can override this to build
/// shared state once and reuse it across the repeated `dH[v]` / `d²H[u,v]`
/// calls made by the unified outer evaluator.
///
/// Default `None`: no workspace is provided.
fn exact_newton_joint_hessian_workspace(
&self,
block_states: &[ParameterBlockState],
block_specs: &[ParameterBlockSpec],
) -> Result<Option<Arc<dyn ExactNewtonJointHessianWorkspace>>, String> {
assert!(block_states.len() <= isize::MAX as usize);
assert!(block_specs.len() <= isize::MAX as usize);
Ok(None)
}
/// Outer-aware variant of [`Self::exact_newton_joint_hessian_workspace`].
///
/// Families that consume the optional outer-only stratified row subsample
/// (`options.outer_score_subsample`) override this method so the joint
/// Hessian workspace can be constructed with the subsample mask attached.
/// Generic families can stick with the default implementation, which
/// validates `options` and then simply forwards to the legacy no-options
/// method without otherwise using the
/// options. This keeps full backward compatibility with existing
/// implementors while letting the marginal-slope families thread the
/// subsample down into the cached per-evaluation joint-Hessian directional
/// derivative paths.
fn exact_newton_joint_hessian_workspace_with_options(
&self,
states: &[ParameterBlockState],
specs: &[ParameterBlockSpec],
options: &BlockwiseFitOptions,
) -> Result<Option<Arc<dyn ExactNewtonJointHessianWorkspace>>, String> {
assert_valid_options(options, "exact Newton joint Hessian workspace");
self.exact_newton_joint_hessian_workspace(states, specs)
}
/// Batched analytic-gradient hook (optional).
///
/// A family that can factor its joint Hessian once and stream row-block
/// leverages returns all K per-θ_j gradient contributions
/// ([`BatchedOuterGradientTerms`]) from one amortized pass, rather than
/// having each `tr(H⁻¹ · ∂H/∂θ_j)` computed independently.
///
/// # Cost amortization
///
/// ```text
/// Generic per-θ_j path: O(K · n · p²)   (K independent dense traces)
/// Batched path:         O(n · p²)       (single factor + leverage stream)
///                     + O(K · n · m²)   (per-row block-diagonal accumulators)
/// ```
///
/// Here `m` is the per-row predictor dimension: `m = 2` for GAMLSS
/// location-scale, `m = 1` for scalar GLMs. At large scale with K ≈ 15,
/// p ≈ 64, m = 2 the batched path is ≈ K·p²/(p² + K·m²) ≈ 15× cheaper.
///
/// # Default
///
/// The default validates its arguments and returns `Ok(None)`, in which
/// case the unified outer gradient evaluator falls back to its generic
/// per-coordinate path. Families with row-coupled likelihoods (GAMLSS
/// location-scale, marginal-slope) should override.
///
/// An implementation may also answer `Ok(None)` for ψ-coordinates whose
/// design-drift is too involved for a batched leverage form, leaving those
/// cases to the generic path.
///
/// # Errors
///
/// The default propagates any error from
/// `validate_hessian_workspace_ready`.
fn batched_outer_gradient_terms(
    &self,
    block_states: &[ParameterBlockState],
    specs: &[ParameterBlockSpec],
    derivative_blocks: &[Vec<CustomFamilyBlockPsiDerivative>],
    rho: &Array1<f64>,
    options: &BlockwiseFitOptions,
    hessian_workspace: Option<Arc<dyn ExactNewtonJointHessianWorkspace>>,
) -> Result<Option<BatchedOuterGradientTerms>, String> {
    let context = "batched outer gradient terms";
    assert_valid_blockspecs(specs, context);
    assert_states_match_specs(block_states, specs, context);
    assert_derivative_blocks_match_specs(derivative_blocks, specs, context);
    assert_rho_matches_specs(rho, specs, context);
    assert_valid_options(options, context);
    validate_hessian_workspace_ready(&hessian_workspace, context)?;
    Ok(None)
}
/// Batched analytic-Hessian / HVP hook (optional).
///
/// Hessian-side counterpart of [`Self::batched_outer_gradient_terms`]: a
/// family able to share a single factorization, row-leverage stream, or
/// directional θθ kernel across all explicit outer-Hessian terms returns the
/// exact profiled Hessian here. The evaluator consults this hook only for
/// Hessian-capable families and only once the inner mode has been fitted.
///
/// # Default
///
/// The default validates its arguments and then wraps the operator from
/// [`Self::outer_hyper_hessian_operator`] as a `HessianResult::Operator`.
/// When the family supplies no operator the result is `Ok(None)`, which
/// leaves unsupported families on their existing exact path.
///
/// # Errors
///
/// The default propagates any error from
/// `validate_hessian_workspace_ready`.
fn batched_outer_hessian_terms(
    &self,
    block_states: &[ParameterBlockState],
    specs: &[ParameterBlockSpec],
    derivative_blocks: &[Vec<CustomFamilyBlockPsiDerivative>],
    rho: &Array1<f64>,
    hessian_workspace: Option<Arc<dyn ExactNewtonJointHessianWorkspace>>,
) -> Result<Option<BatchedOuterHessianTerms>, String> {
    let context = "batched outer Hessian terms";
    assert_valid_blockspecs(specs, context);
    assert_states_match_specs(block_states, specs, context);
    assert_derivative_blocks_match_specs(derivative_blocks, specs, context);
    assert_rho_matches_specs(rho, specs, context);
    validate_hessian_workspace_ready(&hessian_workspace, context)?;
    let Some(operator) = self.outer_hyper_hessian_operator(specs) else {
        return Ok(None);
    };
    Ok(Some(BatchedOuterHessianTerms {
        outer_hessian: crate::solver::outer_strategy::HessianResult::Operator(operator),
    }))
}
/// Explicit name for the inner coefficient-space Hessian HVP capability.
///
/// Kept separate from outer hyper-Hessian capabilities so CTN/GAMLSS row
/// operators do not accidentally advertise pairwise θθ calculus as cheap.
///
/// Default: `false`, after validating `specs`.
fn inner_coefficient_hessian_hvp_available(&self, specs: &[ParameterBlockSpec]) -> bool {
assert_valid_blockspecs(specs, "inner coefficient Hessian HVP availability");
false
}
/// Capability flag for an inner joint-workspace gradient.
///
/// Default: `false`, after validating `specs`.
// NOTE(review): presumably reports whether the family's joint Hessian workspace can
// supply the gradient for the inner solve — confirm against the call sites.
fn inner_joint_workspace_gradient_available(&self, specs: &[ParameterBlockSpec]) -> bool {
assert_valid_blockspecs(specs, "inner joint workspace gradient availability");
false
}
/// Opt families in to the matrix-free inner-Newton/PCG path on top of the
/// generic `use_joint_matrix_free_path` heuristic.
///
/// `use_joint_matrix_free_path` is tuned for families with cheap per-row
/// work where dense `O(n·p²)` assembly is itself the bottleneck and HVPs
/// cost the same. Families with very expensive per-row work (e.g. BMS flex
/// streaming cell partitions + flex-jet evaluations per row) can override
/// this to force the operator path even at moderate `p`, because each HVP
/// reuses the row stream once and PCG converges in a handful of iters.
/// Default `false` keeps the heuristic untouched for everyone else; the
/// default validates `specs` before returning.
fn prefers_matrix_free_inner_joint(
&self,
specs: &[ParameterBlockSpec],
states: &[ParameterBlockState],
) -> bool {
assert_valid_blockspecs(specs, "matrix-free inner-joint preference");
assert!(states.len() <= isize::MAX as usize);
false
}
/// Capability flag; presumably reports whether the family can serve the
/// log-likelihood from an inner joint workspace (inferred from the name
/// only — TODO confirm against the call sites).
///
/// The default checks `specs` and reports `false`.
fn inner_joint_workspace_log_likelihood_available(&self, specs: &[ParameterBlockSpec]) -> bool {
    assert_valid_blockspecs(specs, "inner joint workspace log-likelihood availability");
    false
}
/// Capability flag for a genuine profiled outer Hessian-vector product.
///
/// A family should answer `true` only if it really has an outer HVP over
/// θ = (ρ, ψ) that does not enumerate every θ_i θ_j pair.
///
/// The default checks `specs` and reports `false`.
fn outer_hyper_hessian_hvp_available(&self, specs: &[ParameterBlockSpec]) -> bool {
    assert_valid_blockspecs(specs, "outer hyper-Hessian HVP availability");
    false
}
/// Capability flag for the dense profiled outer Hessian.
///
/// Generic custom-family pairwise derivative paths are dense-capable by
/// default, hence the default answer is `true` (after checking `specs`).
/// A family that only supports inner HVPs should override this whenever
/// dense θθ assembly is not a valid capability on its path.
fn outer_hyper_hessian_dense_available(&self, specs: &[ParameterBlockSpec]) -> bool {
    assert_valid_blockspecs(specs, "outer hyper-Hessian dense availability");
    true
}
/// Exact outer Hessian over θ = (ρ, ψ), supplied by the family as an
/// operator.
///
/// A family overrides this hook and returns `Some(op)` when it can produce
/// the full profiled outer Hessian as a matrix-free Hv operator built from
/// its own directional θθ kernels and trace algebra, instead of the
/// generic per-pair enumeration. The unified REML/LAML evaluator wires
/// that operator into [`HessianResult::Operator`] through the
/// [`HessianDerivativeProvider::family_outer_hessian_operator`] hook that
/// the family installs on its provider. Consumers only ever see a generic
/// `Arc<dyn OuterHessianOperator>` (matvec / dim / mul_mat /
/// is_cheap_to_materialize).
///
/// The default checks `specs` and returns `None`, which keeps the family
/// on the existing pairwise assembly path. This hook is the architectural
/// contract through which CTN, survival (Gompertz-Makeham + timewiggle),
/// GAMLSS location-scale, and Bernoulli marginal-slope families plug
/// their directional outer-HVP operators into one shared surface.
fn outer_hyper_hessian_operator(
    &self,
    specs: &[ParameterBlockSpec],
) -> Option<Arc<dyn crate::solver::outer_strategy::OuterHessianOperator>> {
    assert_valid_blockspecs(specs, "outer hyper-Hessian operator");
    None
}
/// Spec-aware exact joint Hessian (optional).
///
/// The outer hyper-derivative code works from the realized block specs,
/// whereas a given family instance may or may not cache those realized
/// designs internally; this hook lets the outer code hand the specs in.
///
/// # Objective and inner mode
///
/// The profiled/Laplace outer objective is
///
///     J(theta)
///       = V(beta(theta), theta)
///       + 0.5 log|H(beta(theta), theta)|
///       - 0.5 log|S(theta)|_+,
///
/// evaluated at the fitted inner mode, which is defined by
///
///     F(beta, theta) := D_beta V(beta, theta) = 0,
///     H(beta, theta) := F_beta(beta, theta) = H_L(beta, theta) + S(theta).
///
/// # Pure rho directions
///
/// When the likelihood carries no explicit rho-dependence, the fixed-beta
/// forcing for a pure rho direction is
///
///     g_k := F_{rho_k} = A_k beta,
///     A_k := dS/drho_k,
///
/// and differentiating stationarity yields the exact joint mode response
///
///     H u_k = -g_k,
///     u_k   = d beta / d rho_k.
///
/// The solve for `u_k` has to use the full joint Hessian `H` even when
/// `A_k` lives in a single penalty block, because off-diagonal likelihood
/// curvature can couple the blocks. The first outer derivative is then
///
///     dJ/drho_k
///       = 0.5 beta^T A_k beta
///       + 0.5 tr(H^{-1}(A_k + D_beta H_L[u_k]))
///       - 0.5 tr(S^+ A_k).
///
/// # Psi directions
///
/// When psi moves the realized penalties, the same spec-aware hook must
/// be able to rebuild H(beta, theta), D_beta H[u], and D_beta^2 H[u, v]
/// from the current realized specs, so that the generic joint assembler
/// can form
///
///     dot H_i = H_i + D_beta H[beta_i],
///     ddot H_ij
///       = H_ij + T_i[beta_j] + T_j[beta_i]
///       + D_beta H[beta_ij] + D_beta^2 H[beta_i, beta_j].
///
/// Binomial location-scale, with
///
///     q = -eta_t exp(-eta_ls),
///
/// is an example of exactly this coupled structure: the penalty forcing
/// is block-local, yet the fitted mode response and the resulting
/// `D_beta H_L[u_k]` drift are joint objects. If the realized `specs`
/// already hold the designs needed to build them, the outer code should
/// use them directly instead of dropping to a weaker blockwise surrogate
/// merely because the family instance did not cache the same designs.
///
/// # Default dispatch
///
/// The default delegates to `exact_newton_joint_hessian`, gated as
/// follows:
///
/// - Single block, or `likelihood_blocks_uncoupled() = true`: the family
///   Hessian is used, with `exact_newton_joint_hessian_from_working_sets`
///   as the fallback when the family returns `None`. The fallback is
///   restricted to this case because it yields a strictly block-diagonal
///   joint Hessian, which silently drops the cross-block
///   `∂²L/∂β_a∂β_b` terms of coupled likelihoods (GAMLSS μ-σ, marginal
///   slope, survival location-scale, etc.).
/// - Multi-block with `has_explicit_joint_hessian() = true`: the family
///   result is returned as is, `None` included.
/// - Multi-block otherwise: the family result is returned only when it
///   has nonzero off-diagonal blocks; anything else becomes `None`, so
///   the higher layer can surface a loud "joint outer path required"
///   error rather than silently using block-diagonal curvature.
fn exact_newton_joint_hessian_with_specs(
    &self,
    block_states: &[ParameterBlockState],
    specs: &[ParameterBlockSpec],
) -> Result<Option<Array2<f64>>, String> {
    // Case 1 — single block, or the family declared
    // `likelihood_blocks_uncoupled`. There is no cross-block coupling, so
    // the working-sets block-diagonal IS exact and is a valid fallback
    // whenever the family override returns `None`.
    if specs.len() <= 1 || self.likelihood_blocks_uncoupled() {
        return match self.exact_newton_joint_hessian(block_states)? {
            None => exact_newton_joint_hessian_from_working_sets(self, block_states, specs),
            family_hessian => Ok(family_hessian),
        };
    }

    // Case 2 — multi-block coupled family with
    // `has_explicit_joint_hessian = true`. The family override is the only
    // trusted joint Hessian. If it returns `None` (e.g. the dense form is
    // too large for memory at large scale), `None` is propagated:
    // substituting the working-sets block-diagonal would silently drop
    // the cross-block ∂²L/∂β_a∂β_b curvature that only the family can
    // provide — exactly the corruption this gate exists to prevent.
    if self.has_explicit_joint_hessian() {
        return self.exact_newton_joint_hessian(block_states);
    }

    // Case 3 — multi-block coupled family that did NOT set the explicit
    // marker. The marker exists because the trait cannot reflect on
    // whether `exact_newton_joint_hessian` was overridden: its *default*
    // impl assembles a strictly block-diagonal matrix from per-block exact
    // blocks, which would silently drop cross-block ∂²L/∂β_a∂β_b
    // curvature for a coupled likelihood. The marker is not the only
    // signal, though. A family that genuinely overrides the joint Hessian
    // with true coupled curvature returns a matrix with *nonzero
    // off-diagonal blocks*, something the block-diagonal default can
    // never produce, so that structure is detected and trusted. A
    // block-diagonal result cannot be told apart from the default for a
    // coupled family and therefore stays gated to `None`.
    let candidate = self.exact_newton_joint_hessian(block_states)?;
    Ok(candidate.filter(|hessian| joint_hessian_has_cross_block_coupling(hessian, block_states)))
}
/// Structural-coupling probe used by the `_with_specs` joint dispatch
/// gates.
///
/// Answers whether the matrix returned by the family's
/// `exact_newton_joint_hessian` is genuinely coupled (has nonzero
/// off-diagonal blocks) rather than being the trait's block-diagonal
/// default. This marker-free signal lets the engine trust a coupled
/// multi-block family that overrode the joint Hessian without also
/// hand-setting `has_explicit_joint_hessian()`.
///
/// Returns `Ok(false)` when no joint Hessian is available or when it is
/// block-diagonal; an error from the family Hessian is propagated.
fn joint_hessian_is_structurally_coupled(
    &self,
    block_states: &[ParameterBlockState],
) -> Result<bool, String> {
    let joint = self.exact_newton_joint_hessian(block_states)?;
    Ok(joint.is_some_and(|hessian| joint_hessian_has_cross_block_coupling(&hessian, block_states)))
}
/// Declares that the log-likelihood Hessian is block-diagonal in the
/// joint coefficient vector, i.e. `∂²L/∂β_a∂β_b = 0` for every pair of
/// distinct blocks `a ≠ b`.
///
/// The default is `false`: coupling is assumed, which is the safe answer.
/// A family whose blocks share no η/W coupling overrides this to `true`
/// and thereby opts into the default working-set joint-Hessian assembly
/// for multi-block specs.
fn likelihood_blocks_uncoupled(&self) -> bool {
    false
}
/// Marker: the family explicitly overrides `exact_newton_joint_hessian`
/// (or its `_with_specs` variant) and that override returns the *true*
/// coupled joint Hessian, not the trait's block-diagonal default.
///
/// The default is `false`. Production families that override
/// `exact_newton_joint_hessian` with their analytic coupled curvature
/// must return `true` here, so the outer-REML path can trust the override
/// downstream of `exact_newton_joint_hessian_with_specs`. Override status
/// cannot be detected by reflection from inside the trait, which is why
/// this marker serves as the contract signal.
fn has_explicit_joint_hessian(&self) -> bool {
    false
}
/// Whether inner/outer solves for this family need the full-span Jeffreys
/// curvature `H_Φ` and score `∇Φ`.
///
/// The default is `true`. That keeps the existing
/// separation/near-singular robustness for every family the term was
/// historically armed for (probit/binomial, GAMLSS location-scale, BMS,
/// survival marginal-slope).
///
/// Override to `false` for a family that, by construction, has no
/// separation/under-identification regime. The canonical example is a
/// continuous-response monotone-transformation family such as
/// `TransformationNormalFamily`: its Fisher information is `O(n)` on
/// every identified direction at every working point, so the Jeffreys
/// gate would always smooth-step to zero anyway. For such a family the
/// term is pure overhead. Each evaluation runs `p` directional
/// derivatives of the joint Hessian (`O(n·p²)` per call for the SCOP
/// directional derivative), and it is invoked several times per inner
/// cycle plus once per outer evaluation. At large scale (`p=144`,
/// `n=20000`) that overhead dominates the per-cycle cost and exhausts the
/// CI budget long before the inner Newton converges, while adding
/// essentially nothing to the converged gradient and curvature.
fn joint_jeffreys_term_required(&self) -> bool {
    true
}
/// Whether the coupled-joint inner Newton should also apply its
/// self-vanishing Levenberg–Marquardt damping `μ` when the penalized
/// Hessian is FULL-RANK but ILL-CONDITIONED (cond > `COND_NEWTON_SAFETY`),
/// instead of only when it is rank-deficient (`nullity > 0`).
///
/// The default is `false`, which keeps binary / AFT / other families
/// byte-identical. Survival marginal-slope overrides to `true` (#808:
/// full rank, but cond ≈ 5.8e6). Because the self-vanishing μ only shapes
/// the trajectory, the converged β stays unbiased and the log-slope
/// target is preserved.
///
/// The switch is a per-family trait override so that the shared
/// spectral-range solver remains byte-identical for every other family.
/// That matters in particular for AFT (`survival_location_scale`), whose
/// intercept-only-scale fits can be high-cond and which a shared
/// (unconditional) gate would regress (#735/#736).
fn levenberg_on_ill_conditioning(&self) -> bool {
    false
}
/// Internal helper: may the outer-REML `_with_specs` defaults trust the
/// inner fit's block-diagonal-from-blocks output for this family?
///
/// The answer is `true` when any one of these holds:
/// - there is at most one block, so cross-block coupling is impossible;
/// - the family declared its blocks uncoupled in the likelihood Hessian
///   (`likelihood_blocks_uncoupled` ⇒ the block-diagonal IS exact);
/// - the family has an explicit joint-Hessian override
///   (`has_explicit_joint_hessian` ⇒ the result of
///   `exact_newton_joint_hessian` is the true coupled Hessian, not the
///   block-diagonal default).
fn outer_default_trustworthy_for_joint_hessian(&self, specs: &[ParameterBlockSpec]) -> bool {
    if specs.len() <= 1 {
        return true;
    }
    self.likelihood_blocks_uncoupled() || self.has_explicit_joint_hessian()
}
/// Scale-aware exact joint curvature for the outer REML calculus
/// (optional).
///
/// A family whose exact derivatives can overflow may return a uniformly
/// rescaled Hessian, bundled with the metadata that keeps every outer
/// path consistent:
///
/// - `hessian`: the scale-stabilized unpenalized joint Hessian;
/// - `rho_curvature_scale`: the uniform factor applied to every ρ-driven
///   penalty Hessian derivative in H-dependent trace / solve terms;
/// - `hessian_logdet_correction`: the additive correction that recovers
///   `log|H_exact|` from `log|H_scaled|`.
///
/// The scale is evaluation-local metadata. Within one evaluation callers
/// must apply the same factor to `H`, `dH`, `d²H`, and the penalized trace
/// operators, but the scale itself is never differentiated.
///
/// A family that overrides this hook must also make
/// `exact_newton_outer_curvature_directional_derivative[_with_specs]` and
/// `exact_newton_outer_curvature_second_directional_derivative[_with_specs]`
/// return their derivatives in that same scaled curvature space.
///
/// The default returns `Ok(None)`.
fn exact_newton_outer_curvature(
    &self,
    block_states: &[ParameterBlockState],
) -> Result<Option<ExactNewtonOuterCurvature>, String> {
    assert!(block_states.len() <= isize::MAX as usize);
    Ok(None)
}
/// First directional derivative paired with
/// `exact_newton_outer_curvature` (optional).
///
/// The default forwards to
/// `exact_newton_joint_hessian_directional_derivative` with the same
/// arguments.
fn exact_newton_outer_curvature_directional_derivative(
    &self,
    block_states: &[ParameterBlockState],
    d_beta_flat: &Array1<f64>,
) -> Result<Option<Array2<f64>>, String> {
    self.exact_newton_joint_hessian_directional_derivative(block_states, d_beta_flat)
}
/// Variant of `exact_newton_outer_curvature_directional_derivative` that
/// also receives the block specs.
///
/// The default does not use `block_specs` beyond a length sanity check
/// and forwards to the spec-free method.
fn exact_newton_outer_curvature_directional_derivative_with_specs(
    &self,
    block_states: &[ParameterBlockState],
    block_specs: &[ParameterBlockSpec],
    d_beta_flat: &Array1<f64>,
) -> Result<Option<Array2<f64>>, String> {
    assert!(block_specs.len() <= isize::MAX as usize);
    self.exact_newton_outer_curvature_directional_derivative(block_states, d_beta_flat)
}
/// Second directional derivative paired with
/// `exact_newton_outer_curvature` (optional).
///
/// The default forwards to
/// `exact_newton_joint_hessiansecond_directional_derivative` with the
/// same state and the two directions in the same order.
fn exact_newton_outer_curvature_second_directional_derivative(
    &self,
    block_states: &[ParameterBlockState],
    d_beta_u_flat: &Array1<f64>,
    d_beta_v_flat: &Array1<f64>,
) -> Result<Option<Array2<f64>>, String> {
    self.exact_newton_joint_hessiansecond_directional_derivative(
        block_states,
        d_beta_u_flat,
        d_beta_v_flat,
    )
}
/// Variant of `exact_newton_outer_curvature_second_directional_derivative`
/// that also receives the block specs.
///
/// The default does not use `block_specs` beyond a length sanity check
/// and forwards to the spec-free method.
fn exact_newton_outer_curvature_second_directional_derivative_with_specs(
    &self,
    block_states: &[ParameterBlockState],
    block_specs: &[ParameterBlockSpec],
    d_beta_u_flat: &Array1<f64>,
    d_beta_v_flat: &Array1<f64>,
) -> Result<Option<Array2<f64>>, String> {
    assert!(block_specs.len() <= isize::MAX as usize);
    self.exact_newton_outer_curvature_second_directional_derivative(
        block_states,
        d_beta_u_flat,
        d_beta_v_flat,
    )
}
/// Spec-aware exact first directional derivative of the joint Hessian
/// (optional).
///
/// Spec-aware counterpart of
/// `exact_newton_joint_hessian_directional_derivative`. For a flattened
/// coefficient-space direction `u` it yields the exact joint
/// likelihood-curvature drift
///
///     D_beta H_L[u].
///
/// In the profiled Laplace gradient this drift enters once the exact
/// joint mode response has been solved:
///
///     H u_k = -A_k beta,
///     dot H_k = A_k + D_beta H_L[u_k].
///
/// Families able to rebuild the exact joint geometry from `specs` should
/// override this together with `exact_newton_joint_hessian_with_specs`.
///
/// The default returns `Ok(None)` for a multi-block family that is
/// neither declared uncoupled, nor marked via
/// `has_explicit_joint_hessian`, nor structurally coupled.
fn exact_newton_joint_hessian_directional_derivative_with_specs(
    &self,
    block_states: &[ParameterBlockState],
    specs: &[ParameterBlockSpec],
    d_beta_flat: &Array1<f64>,
) -> Result<Option<Array2<f64>>, String> {
    // The trust dispatch mirrors `exact_newton_joint_hessian_with_specs`.
    // Both the default `_directional_derivative` and the
    // `_from_working_sets` helper build a block-diagonal `D_β H[u]`; for a
    // coupled family that silently drops the cross-block
    // `∂²L_ab/∂β_a∂β_b · u_b` rows which drive the outer mode-response
    // correction.
    if specs.len() <= 1 || self.likelihood_blocks_uncoupled() {
        // Block-diagonal is exact here, so the working-sets derivative is
        // an acceptable fallback when the family has none of its own.
        let family_drift =
            self.exact_newton_joint_hessian_directional_derivative(block_states, d_beta_flat)?;
        return match family_drift {
            None => exact_newton_joint_hessian_directional_derivative_from_working_sets(
                self,
                block_states,
                specs,
                d_beta_flat,
            ),
            found => Ok(found),
        };
    }

    // Marked, or structurally detected as coupled (see
    // `exact_newton_joint_hessian_with_specs`): only then is the family's
    // own directional derivative the trusted cross-block `D_β H[u]`. The
    // structural probe runs only when the marker is not set.
    let family_is_trusted = self.has_explicit_joint_hessian()
        || self.joint_hessian_is_structurally_coupled(block_states)?;
    if !family_is_trusted {
        return Ok(None);
    }
    self.exact_newton_joint_hessian_directional_derivative(block_states, d_beta_flat)
}
/// Spec-aware exact second directional derivative of the joint Hessian
/// (optional).
///
/// Spec-aware counterpart of
/// `exact_newton_joint_hessiansecond_directional_derivative`. For rho/rho
/// entries of the outer Hessian it provides the exact joint second-order
/// likelihood-curvature drift
///
///     D_beta^2 H_L[u_l, u_k].
///
/// That drift is combined with the first-order drift
///
///     dot H_k = A_k + D_beta H_L[u_k]
///
/// and with the second mode response
///
///     H u_{k,l}
///       = -(A_k u_l + A_l u_k + B_{k,l} beta + D_beta H_L[u_l] u_k)
///
/// in order to form
///
///     ddot H_{k,l}
///       = B_{k,l} + D_beta H_L[u_{k,l}] + D_beta^2 H_L[u_l, u_k].
fn exact_newton_joint_hessian_second_directional_derivative_with_specs(
    &self,
    block_states: &[ParameterBlockState],
    specs: &[ParameterBlockSpec],
    d_beta_u_flat: &Array1<f64>,
    d_betav_flat: &Array1<f64>,
) -> Result<Option<Array2<f64>>, String> {
    // Trust dispatch matching the Hessian and first-derivative paths. The
    // default of the delegated
    // `exact_newton_joint_hessiansecond_directional_derivative` is
    // block-diagonal-from-blocks, which is silently wrong for outer trace
    // assembly on coupled families. There is no working-sets fallback at
    // this order — every trusted case calls the same delegate — so one
    // helper predicate is enough. It is supplemented by the marker-free
    // structural probe, so that an auto-routed coupled family (one that
    // returns a genuinely off-diagonal joint Hessian without setting the
    // explicit marker) is trusted consistently across all three
    // derivative orders. The probe runs only when the predicate fails.
    let family_is_trusted = self.outer_default_trustworthy_for_joint_hessian(specs)
        || self.joint_hessian_is_structurally_coupled(block_states)?;
    if family_is_trusted {
        self.exact_newton_joint_hessiansecond_directional_derivative(
            block_states,
            d_beta_u_flat,
            d_betav_flat,
        )
    } else {
        Ok(None)
    }
}
/// Joint multi-block outer-hyper surrogate Hessian over the flattened
/// coefficient vector (optional).
///
/// Intended for families whose inner working representation in
/// `evaluate(...)` is block-diagonal/diagonal while their outer profiled
/// smoothing derivatives remain joint, because the fitted mode response
/// couples the blocks. The generic blockwise outer-hyper surrogate sees
/// nothing but per-block working sets and so cannot reconstruct the
/// missing cross-block curvature by itself.
///
/// A family that can build a mathematically valid joint surrogate
/// `H_L(beta)` for the current realized `specs` may override this hook
/// along with the two directional-derivative hooks that accompany it.
/// Generic code then runs the same joint rho-calculus as on the exact
/// path, only on the family-supplied surrogate curvature rather than on
/// the exact Newton Hessian.
///
/// The default reuses the spec-aware exact joint curvature whenever the
/// family already provides it. That is the mathematically correct repair
/// for the old broken multi-block blockwise surrogate path: a family that
/// knows the full coupled Hessian and its beta-drifts should have that
/// joint information used, instead of generic code pretending per-block
/// working sets suffice.
fn joint_outer_hyper_surrogate_hessian_with_specs(
    &self,
    block_states: &[ParameterBlockState],
    specs: &[ParameterBlockSpec],
) -> Result<Option<Array2<f64>>, String> {
    self.exact_newton_joint_hessian_with_specs(block_states, specs)
}
/// First beta-directional derivative of the joint surrogate outer-hyper
/// Hessian (optional).
///
/// The default forwards to
/// `exact_newton_joint_hessian_directional_derivative_with_specs`.
fn joint_outer_hyper_surrogate_hessian_directional_derivative_with_specs(
    &self,
    block_states: &[ParameterBlockState],
    specs: &[ParameterBlockSpec],
    d_beta_flat: &Array1<f64>,
) -> Result<Option<Array2<f64>>, String> {
    self.exact_newton_joint_hessian_directional_derivative_with_specs(
        block_states,
        specs,
        d_beta_flat,
    )
}
/// Second beta-directional derivative of the joint surrogate outer-hyper
/// Hessian (optional).
///
/// The default forwards to
/// `exact_newton_joint_hessian_second_directional_derivative_with_specs`.
fn joint_outer_hyper_surrogate_hessian_second_directional_derivative_with_specs(
    &self,
    block_states: &[ParameterBlockState],
    specs: &[ParameterBlockSpec],
    d_beta_u_flat: &Array1<f64>,
    d_betav_flat: &Array1<f64>,
) -> Result<Option<Array2<f64>>, String> {
    self.exact_newton_joint_hessian_second_directional_derivative_with_specs(
        block_states,
        specs,
        d_beta_u_flat,
        d_betav_flat,
    )
}
/// Exact directional derivative of the diagonal working weights along a
/// predictor-space direction `d_eta`, for `BlockWorkingSet::Diagonal`
/// (optional).
///
/// For diagonal working-set blocks with
///
///     J = X^T W X + S
///
/// this callback provides the `dw` term of
///
///     D_beta J[u] = X^T diag(dw) X.
///
/// `arr` presumably carries the direction `d_eta` and `idx` the block
/// index — TODO confirm against an implementor.
///
/// The default returns `Ok(None)`, meaning that no exact working-weight
/// directional derivative is available. Exact REML/LAML derivative paths
/// should not quietly substitute zero for it unless the family really has
/// constant working weights.
///
/// # Panics
///
/// The default asserts that `idx` is not `usize::MAX` and that `arr`
/// contains no NaN.
fn diagonalworking_weights_directional_derivative(
    &self,
    block_states: &[ParameterBlockState],
    idx: usize,
    arr: &Array1<f64>,
) -> Result<Option<Array1<f64>>, String> {
    assert!(block_states.len() <= isize::MAX as usize);
    assert!(idx < usize::MAX);
    assert!(arr.iter().all(|v| !v.is_nan()));
    Ok(None)
}
/// Exact second directional derivative of the diagonal working weights
/// (optional).
///
/// Provides the `d²w` term used by the static-design, single-block
/// generic fallback Hessian drift:
///
///     D²_beta H_L[u, v] = X^T diag(D²w[D eta_u, D eta_v]) X.
///
/// `arr` and `arr2` presumably carry the two predictor-space directions
/// and `idx` the block index — TODO confirm against an implementor.
///
/// A family with coefficient-dependent block geometry has to use an exact
/// Newton Hessian path or a joint outer path until second-order geometry
/// hooks exist; while building `d²H`, the generic diagonal fallback
/// rejects nonzero first-order geometry.
///
/// The default returns `Ok(None)`.
///
/// # Panics
///
/// The default asserts that `idx` is not `usize::MAX` and that neither
/// `arr` nor `arr2` contains NaN.
fn diagonalworking_weights_second_directional_derivative(
    &self,
    block_states: &[ParameterBlockState],
    idx: usize,
    arr: &Array1<f64>,
    arr2: &Array1<f64>,
) -> Result<Option<Array1<f64>>, String> {
    assert!(block_states.len() <= isize::MAX as usize);
    assert!(idx < usize::MAX);
    assert!(arr.iter().all(|v| !v.is_nan()));
    assert!(arr2.iter().all(|v| !v.is_nan()));
    Ok(None)
}
/// Exact first-order joint psi terms over the flattened coefficient
/// vector (optional).
///
/// A family with coupled exact-joint curvature has to express its psi
/// objects in the same flattened coefficient space as the existing joint
/// Hessian hooks:
///
///     objective_psi = V_psi^explicit,
///     score_psi     = g_psi^explicit,
///     hessian_psi   = H_psi^explicit.
///
/// Generic code takes it from there: it adds the realized penalty
/// surface, solves
///
///     beta_i = -H^{-1} g_i,
///
/// builds
///
///     dot H_i = H_i + D_beta H[beta_i],
///
/// and feeds those objects into the unified profiled/Laplace gradient
///
///     J_i = V_i + 0.5 tr(H^{-1} dot H_i) - 0.5 partial_i log|S(theta)|_+.
///
/// The current block-local exact-Newton psi hooks do not suffice for a
/// full joint hyper Hessian on coupled families, so joint exact-joint
/// hyper evaluation must go through this flattened-coefficient hook.
///
/// `idx` presumably selects the psi coordinate — TODO confirm. The
/// default returns `Ok(None)` and asserts that `idx` is not `usize::MAX`.
fn exact_newton_joint_psi_terms(
    &self,
    block_states: &[ParameterBlockState],
    block_specs: &[ParameterBlockSpec],
    derivative_blocks: &[Vec<CustomFamilyBlockPsiDerivative>],
    idx: usize,
) -> Result<Option<ExactNewtonJointPsiTerms>, String> {
    assert!(block_states.len() <= isize::MAX as usize);
    assert!(block_specs.len() <= isize::MAX as usize);
    assert!(derivative_blocks.len() <= isize::MAX as usize);
    assert!(idx < usize::MAX);
    Ok(None)
}
/// Exact second-order joint psi terms over the flattened coefficient
/// vector (optional).
///
/// Given two outer coordinates theta_i and theta_j, the exact
/// profiled/Laplace Hessian needs the fixed-beta second partials
///
///     V_{ij}^explicit,  g_{ij}^explicit,  H_{ij}^explicit.
///
/// On psi/psi blocks this callback returns those explicit family terms in
/// flattened coefficient coordinates; the penalty contributions and the
/// profile/Laplace corrections are added by generic code.
///
/// `idx` and `idx2` presumably select the two psi coordinates — TODO
/// confirm. The default returns `Ok(None)` and asserts that neither index
/// is `usize::MAX`.
fn exact_newton_joint_psisecond_order_terms(
    &self,
    block_states: &[ParameterBlockState],
    block_specs: &[ParameterBlockSpec],
    derivative_blocks: &[Vec<CustomFamilyBlockPsiDerivative>],
    idx: usize,
    idx2: usize,
) -> Result<Option<ExactNewtonJointPsiSecondOrderTerms>, String> {
    assert!(block_states.len() <= isize::MAX as usize);
    assert!(block_specs.len() <= isize::MAX as usize);
    assert!(derivative_blocks.len() <= isize::MAX as usize);
    assert!(idx < usize::MAX);
    assert!(idx2 < usize::MAX);
    Ok(None)
}
/// Per-evaluation workspace for exact joint ψ derivatives (optional).
///
/// A family whose exact ψ calculus is expensive can override this hook,
/// precompute the shared state once per outer evaluation, and then serve
/// both of the following from that single cached workspace:
///
/// - the exact fixed-β ψψ second-order terms, and
/// - the exact mixed β/ψ Hessian drifts `D_β H_ψ[u]`.
///
/// If no workspace is returned, generic code falls back to the direct ψ
/// hooks of this trait. The default returns `Ok(None)`.
fn exact_newton_joint_psi_workspace(
    &self,
    block_states: &[ParameterBlockState],
    block_specs: &[ParameterBlockSpec],
    derivative_blocks: &[Vec<CustomFamilyBlockPsiDerivative>],
) -> Result<Option<Arc<dyn ExactNewtonJointPsiWorkspace>>, String> {
    assert!(block_states.len() <= isize::MAX as usize);
    assert!(block_specs.len() <= isize::MAX as usize);
    assert!(derivative_blocks.len() <= isize::MAX as usize);
    Ok(None)
}
/// Options-aware (outer-aware) variant of
/// `exact_newton_joint_psi_workspace`.
///
/// A family that consumes the optional outer-only stratified row
/// subsample (`options.outer_score_subsample`) overrides this method so
/// its workspace can be built with the subsample mask attached. This lets
/// the marginal-slope families thread the subsample down into the cached
/// per-evaluation ψ calculus.
///
/// Generic families can keep the default: it validates `options`, then
/// forwards to the legacy no-options method without otherwise using the
/// options, which preserves full backward compatibility with existing
/// implementors.
fn exact_newton_joint_psi_workspace_with_options(
    &self,
    states: &[ParameterBlockState],
    specs: &[ParameterBlockSpec],
    derivs: &[Vec<CustomFamilyBlockPsiDerivative>],
    options: &BlockwiseFitOptions,
) -> Result<Option<Arc<dyn ExactNewtonJointPsiWorkspace>>, String> {
    assert_valid_options(options, "exact Newton joint psi workspace");
    self.exact_newton_joint_psi_workspace(states, specs, derivs)
}
/// Whether the exact joint ψ workspace should be built for first-order ψ
/// terms too, i.e. during outer gradient evaluation.
///
/// The default is `false`, so that gradient-only outer evaluations do not
/// force every family to pay the workspace setup cost. A family whose
/// expensive shared state is reused by both the first- and second-order ψ
/// calculus can opt in by returning `true`.
fn exact_newton_joint_psi_workspace_for_first_order_terms(&self) -> bool {
    false
}
/// Mixed beta/psi Hessian drift `D_beta H_psi[u]` (optional).
///
/// This is the `T_i[u]` object that the full exact joint profiled Hessian
/// otherwise lacks:
///
///     ddot H_{ij}
///       = H_{ij}
///       + D_beta H_i[beta_j]
///       + D_beta H_j[beta_i]
///       + D_beta H[beta_{ij}]
///       + D_beta^2 H[beta_i, beta_j].
///
/// With i = psi_a, this hook provides D_beta H_{psi_a}[u].
///
/// The direct hook is dense-only. A family able to keep the drift in an
/// operator-backed or block-local form should expose it via
/// `exact_newton_joint_psi_workspace()` instead.
///
/// `idx` presumably selects the psi coordinate and `arr` carries the
/// direction `u` — TODO confirm. The default returns `Ok(None)` and
/// asserts that `idx` is not `usize::MAX` and that `arr` has no NaN.
fn exact_newton_joint_psihessian_directional_derivative(
    &self,
    block_states: &[ParameterBlockState],
    block_specs: &[ParameterBlockSpec],
    derivative_blocks: &[Vec<CustomFamilyBlockPsiDerivative>],
    idx: usize,
    arr: &Array1<f64>,
) -> Result<Option<Array2<f64>>, String> {
    assert!(block_states.len() <= isize::MAX as usize);
    assert!(block_specs.len() <= isize::MAX as usize);
    assert!(derivative_blocks.len() <= isize::MAX as usize);
    assert!(idx < usize::MAX);
    assert!(arr.iter().all(|v| !v.is_nan()));
    Ok(None)
}
/// Selects how the log-determinant of the penalized Hessian, and its
/// derivatives, treat eigenvalues that fall below the numerical-stability
/// floor. See [`PseudoLogdetMode`].
///
/// The default is `Smooth`, the stable choice for full-rank Hessians.
///
/// A family whose model structure carries a numerical null-space
/// direction should override to `HardPseudo`. An example is a multi-block
/// GAMLSS wiggle model, where `q = q_0 + B(q_0)^⊤ β_w` is not identified
/// from a threshold shift. With `HardPseudo` the null direction drops out
/// of the REML cost and of its gradient consistently, instead of leaking
/// a spurious first-order contribution through whatever basis the
/// eigensolver happens to pick inside the null space.
fn pseudo_logdet_mode(&self) -> PseudoLogdetMode {
    PseudoLogdetMode::Smooth
}
}
/// Tag describing the scope of an outer-evaluation context.
///
/// It separates a real outer derivative evaluation, in which
/// auto-subsample may install a fresh stratified mask and emit phase
/// prints, from an inner coefficient line-search trial, in which the
/// family has to reuse the outer row measure and auto-subsample must
/// therefore stay disabled.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum EvalScope {
    /// A real outer derivative evaluation. ρ has advanced, so the
    /// auto-subsample install paths are allowed to build or refresh a
    /// mask keyed on this ρ.
    OuterDerivative,
    /// An inner coefficient trial (joint-Newton / line-search) at fixed
    /// outer ρ. The row measure has to stay identical to that of the
    /// surrounding outer eval, so auto-subsample must not install a fresh
    /// mask.
    InnerCoefficient,
}
/// Context that the outer smoothing optimizer publishes for every
/// downstream family evaluation.
///
/// It bundles the current outer ρ, a monotonic per-outer-eval id, and the
/// [`EvalScope`] tag that gates auto-subsample installation. The bug this
/// guards against is described in the field doc of
/// [`BlockwiseFitOptions::outer_eval_context`].
#[derive(Debug, Clone)]
pub struct OuterEvalContext {
    /// Current outer ρ.
    pub rho: Arc<Array1<f64>>,
    /// Monotonic id of the outer evaluation this context belongs to.
    pub eval_id: usize,
    /// Scope tag used to gate auto-subsample installation.
    pub scope: EvalScope,
}
/// Options controlling a blockwise custom-family fit: inner-solve budget and
/// tolerance, outer smoothing-optimizer limits, ridge stabilization, the
/// REML/outer-Hessian/covariance switches, outer-score subsampling, the
/// persistent warm-start cache sessions, and the seed-screening controls.
///
/// `BlockwiseFitOptions::default()` yields the production defaults; see the
/// per-field docs for what each knob changes.
#[derive(Clone)]
pub struct BlockwiseFitOptions {
/// Full budget of inner cycles. The adaptive per-outer-iteration cap
/// (`outer_inner_max_iterations`) is clamped to at most this value.
pub inner_max_cycles: usize,
/// Tolerance for the inner solve (presumably its convergence tolerance —
/// the consuming code is not in this section).
pub inner_tol: f64,
/// Iteration limit for the outer smoothing optimizer (presumably a hard
/// cap — the consuming code is not in this section).
pub outer_max_iter: usize,
/// Tolerance for the outer smoothing optimizer.
pub outer_tol: f64,
/// Weight floor; defaults to `CUSTOM_FAMILY_WEIGHT_FLOOR`.
pub minweight: f64,
/// Ridge magnitude; defaults to `CUSTOM_FAMILY_RIDGE_FLOOR`. Which terms
/// it enters is governed by `ridge_policy`.
pub ridge_floor: f64,
/// Shared ridge semantics used by solve/quadratic/logdet terms.
pub ridge_policy: RidgePolicy,
/// If true, outer smoothing optimization uses a Laplace/REML-style objective:
/// -loglik + penalty + 0.5(log|H| - log|S|_+)
/// where H is blockwise working curvature and S is blockwise penalty.
pub use_remlobjective: bool,
/// If false, the outer smoothing optimizer uses exact gradients but does
/// not request an analytic outer Hessian from the family.
pub use_outer_hessian: bool,
/// If false, skip post-fit joint covariance assembly.
pub compute_covariance: bool,
/// Shared cap engaged during seed screening so cost-only evaluations can
/// stop inner iterations early without affecting the full solve.
pub screening_max_inner_iterations: Option<Arc<AtomicUsize>>,
/// Shared cap engaged during regular outer iterations. Unlike screening,
/// this is only a budget: capped solves still have to earn the ordinary
/// KKT certificate before derivatives may be exposed.
pub outer_inner_max_iterations: Option<Arc<AtomicUsize>>,
/// Optional line-search objective ceiling for lazy log-likelihood-only
/// evaluations. Families whose per-row log-likelihood contributions are
/// non-positive may stop once the partial negative log-likelihood is already
/// above this ceiling, because the unvisited rows cannot improve the trial
/// objective enough to be accepted. Default `None` preserves exact full-sum
/// behavior and is the only mode used outside backtracking rejection tests.
pub early_exit_threshold: Option<f64>,
/// Stable public API for installing outer-score subsampling.
///
/// Optional stratified row subsample used by outer-only score/gradient
/// passes. When `Some(s)`, outer score/gradient hot loops should iterate
/// only over `s.rows` and multiply each contribution by that row's
/// Horvitz-Thompson inverse-inclusion weight. Inner-PIRLS and final
/// covariance passes always run on the full data, so this field is
/// consulted only by outer-only call sites. Default `None` preserves the
/// full-data behavior. Wrapping in `Arc` keeps `Clone` cheap across the
/// many places `BlockwiseFitOptions` is duplicated per-eval.
pub outer_score_subsample:
Option<Arc<crate::families::marginal_slope_shared::OuterScoreSubsample>>,
/// Gate for marginal-slope families to auto-derive a stratified
/// outer-score subsample at large scale (see
/// [`crate::families::marginal_slope_shared::auto_outer_score_subsample`]).
///
/// **Default `true`.** Auto-subsampling makes the early rho-gradient
/// evaluations unbiased stochastic estimators with bounded relative
/// variance (≈ 1 % at the conservative defaults), then the family switches
/// back to full-data gradients for the remaining outer iterations. That
/// keeps large marginal-slope fits fast during the high-motion part of the
/// trajectory while preserving the default tight `outer_tol` polish on
/// exact gradients. For small datasets the auto path declines to install a
/// mask and the fit remains full-data throughout.
///
/// When `outer_score_subsample` is already `Some(...)` the auto
/// path is bypassed entirely (caller-provided masks always win).
pub auto_outer_subsample: bool,
/// Outer-evaluation context populated by the smoothing optimizer at
/// the top of each real outer derivative evaluation. Used by
/// auto-subsample install paths to key the stratified mask on the
/// outer ρ rather than the inner β proxy: during the inner trust-
/// region / coefficient line search β changes on every trial step,
/// so keying on β re-fires phase prints (and re-shuffles the mask)
/// inside a single outer eval. Keying on (rho, eval_id) instead
/// keeps the mask stable across the inner Newton at one ρ, and
/// suppresses auto-subsample entirely on inner trial evaluations via
/// the [`EvalScope::InnerCoefficient`] tag set by
/// [`coefficient_line_search_options`].
///
/// `None` preserves legacy behavior (no context — install paths fall
/// back to "no auto-subsample"). Default `None`.
pub outer_eval_context: Option<OuterEvalContext>,
/// Optional persistent warm-start cache session. When `Some`, the
/// outer smoothing optimizer consults the on-disk cache before
/// starting (to seed θ from the last accepted iterate) and writes
/// checkpoints + a final entry on completion. When `None`, the fit
/// runs cold and writes nothing — the default for unit tests and
/// any caller that pinned a deterministic optimum.
///
/// The session is opened at the workflow-level `fit_model`
/// dispatcher so every family flows through one chokepoint; family
/// code never has to remember to wire it. This mirrors the standard
/// REML cache wiring in `solver/estimate.rs:2701`.
pub cache_session: Option<Arc<crate::cache::Session>>,
/// Optional mirror sessions that receive a copy of the final-result
/// finalize() write. Used by the workflow dispatcher to broadcast a
/// converged ρ to additional keyspace(s) — notably the data-
/// independent seed prefix — so future fits with related structure
/// can warm-start from this run. Writes still pass through the session
/// rate limiter, so mirroring checkpoints does not add unbounded I/O.
pub cache_mirror_sessions: Vec<Arc<crate::cache::Session>>,
/// Optional bundle of cross-block (full-width) penalties, paired with
/// their current `log λ` values from the outer ρ vector. When `Some`,
/// the inner joint-Newton primitives add the contributions
///
/// * objective: `½ Σ_j exp(ρ_j) βᵀ S_j β`
/// * gradient: `Σ_j exp(ρ_j) S_j β`
/// * Hessian: `Σ_j exp(ρ_j) S_j`
///
/// in addition to the per-block penalty stack assembled from
/// `ParameterBlockSpec.penalties`. The per-block path is unchanged.
/// `None` preserves legacy behaviour for every existing caller.
pub joint_penalties: Option<Arc<crate::families::joint_penalty::JointPenaltyBundle>>,
/// Whether the outer smoothing optimizer screens the explicit
/// `initial_rho` seed through the seed-screening cascade before the
/// solver starts.
///
/// **Default `true`** — the general path benefits from ranking the
/// initial seed against the generated exploration seeds via cheap
/// capped proxy fits.
///
/// A caller sets this `false` when `initial_rho` is already the correct,
/// identified optimum for its regime so that re-screening it adds only
/// cost. The survival location-scale constant-scale (parametric-AFT)
/// path uses this: its time-warp ρ seed is pinned AT the inner ρ box
/// bound (the affine-baseline limit), where the REML/LAML profile is a
/// dead-flat unidentified ridge. Running the screening cascade there
/// drives each proxy fit (and, when every capped stage collapses to
/// non-finite cost, the uncapped final stage) into a full inner solve on
/// the near-singular flat Hessian — the source of the multi-minute
/// no-iteration-log stall (#736, #735, #721). Skipping screening lets the
/// already-correct seed flow straight to the outer solver, which certifies
/// box-constraint stationarity at iteration 0. Genuinely flexible regimes
/// (smooth scale / spatial) leave this `true` and keep full screening.
pub screen_initial_rho: bool,
/// Set ONLY while the inner solve is invoked from the seed-screening proxy
/// (`custom_family_seed_screening_proxy_labeled`), which RANKS candidate
/// seeds by their penalized objective and never produces the final fit.
///
/// When `true`, the inner joint-Newton skips the full per-axis
/// Jeffreys/Firth curvature (`custom_family_joint_jeffreys_term`'s
/// `for k in 0..p` directional-derivative loop, O(p · per-axis-Hdot) per
/// cycle), keeping ONLY the cheap value-only Jeffreys term
/// (`custom_family_joint_jeffreys_value`, one reduced-info eigendecomposition)
/// in the screening score. The per-axis gradient/curvature is what the inner
/// Newton step needs to *converge* a near-separating fit; the screening proxy
/// is capped and only ranks, so it does not need step convergence — it needs
/// a finite, separation-aware score cheaply. For a K-block coupled family
/// (Dirichlet/multinomial) each per-axis directional derivative is itself
/// O(K²·n·p), so running the full term for every cascade candidate over the
/// joint width `p` is the wrong cost class and made the coupled fit
/// non-completing during screening alone (gam#729/#808). The actual fit
/// (after a seed is selected) runs with this `false`, so the load-bearing
/// Firth curvature is fully present where it matters.
///
/// **Default `false`** — only the screening proxy sets it `true`.
pub seed_screening: bool,
}
/// Inner-cycle budget that `BlockwiseFitOptions::default()` installs as
/// `inner_max_cycles`.
pub const DEFAULT_CUSTOM_FAMILY_INNER_MAX_CYCLES: usize = 1200;
impl Default for BlockwiseFitOptions {
fn default() -> Self {
Self {
// Large-scale custom-family marginal-slope fits can have a
// long, monotone joint-Newton tail: objective and step size keep
// shrinking, but the exact KKT residual may need several hundred
// additional cycles after the old 300-cycle cap. The outer
// REML/LAML derivative path is correct only at a stationary inner
// mode, so a merely descended iterate must not be accepted as
// converged. Use a production-sized cap by default and rely on the
// KKT/objective certificates to exit early for well-conditioned
// Gaussian, logistic, and small-n fits.
inner_max_cycles: DEFAULT_CUSTOM_FAMILY_INNER_MAX_CYCLES,
inner_tol: 1e-6,
outer_max_iter: 60,
outer_tol: 1e-5,
minweight: CUSTOM_FAMILY_WEIGHT_FLOOR,
// `ridge_floor` is an ExplicitPrior in the canonical
// stabilization ledger taxonomy (`StabilizationKind::ExplicitPrior`):
// its δ enters the quadratic term, the Laplace Hessian, and the
// penalty log-determinant — `ridge_policy` below is the live
// policy that confirms which terms it lands in. The default
// pos-part policy enables every inclusion flag, so callers
// wanting solver-only damping should construct a custom policy
// (or, preferably, a `StabilizationLedger::numerical_perturbation`)
// rather than reusing this field.
ridge_floor: CUSTOM_FAMILY_RIDGE_FLOOR,
ridge_policy: RidgePolicy::explicit_stabilization_pospart(),
use_remlobjective: true,
// Default ON: families expose exact outer Hessians whenever their
// analytic dense or operator representation is implemented.
use_outer_hessian: true,
compute_covariance: false,
screening_max_inner_iterations: None,
outer_inner_max_iterations: None,
seed_screening: false,
early_exit_threshold: None,
outer_score_subsample: None,
auto_outer_subsample: true,
outer_eval_context: None,
cache_session: None,
cache_mirror_sessions: Vec::new(),
joint_penalties: None,
screen_initial_rho: true,
}
}
}
/// Outcome of one inner (fixed-ρ) blockwise solve: the per-block states, the
/// scalar pieces of the penalized objective, solver bookkeeping, and optional
/// diagnostics consumed by the outer REML/LAML evaluator.
#[derive(Clone)]
pub struct BlockwiseInnerResult {
/// Per-block solver state at the returned iterate (one entry per block).
pub block_states: Vec<ParameterBlockState>,
/// Per-block active-set index lists, `None` where a block has none.
/// NOTE(review): the meaning of the indices is defined by the solver,
/// which is not visible in this section.
pub active_sets: Vec<Option<Vec<usize>>>,
/// Log-likelihood at the returned iterate.
pub log_likelihood: f64,
/// Penalty contribution to the penalized objective.
pub penalty_value: f64,
/// Number of inner cycles spent.
pub cycles: usize,
/// Whether the inner solve reported convergence.
pub converged: bool,
/// Log-determinant term for the Hessian; enters the objective as
/// `+0.5 * block_logdet_h` when requested.
pub block_logdet_h: f64,
/// Log-determinant term for the penalty; enters the objective as
/// `-0.5 * block_logdet_s` when requested.
pub block_logdet_s: f64,
/// Cached assembled penalty matrices S(ρ) = Σ_k exp(ρ_k) S_k per block.
/// Avoids redundant re-assembly in the outer objective evaluation.
pub s_lambdas: Vec<Array2<f64>>,
/// Optional shared joint-Hessian workspace produced by the solve.
pub joint_workspace: Option<Arc<dyn ExactNewtonJointHessianWorkspace>>,
/// Projected KKT residual at the converged inner iterate, propagated to
/// the unified evaluator's `InnerAssembly::kkt_residual` for the
/// outer REML/LAML scoring path. `None` when the solver path doesn't
/// produce a typed KKT diagnostic (blockwise NR fallback, eager-stop).
pub kkt_residual: Option<crate::estimate::reml::unified::ProjectedKktResidual>,
/// Active linear-inequality constraint rows at the converged inner
/// iterate. When `Some`, the unified evaluator builds the
/// constraint-aware kernel `K_T = K_S − K_S Aᵀ (A K_S Aᵀ)⁻¹ A K_S`
/// for per-coordinate mode responses `v_k = ∂β/∂ρ_k`.
pub active_constraints:
Option<Arc<crate::estimate::reml::unified::ActiveLinearConstraintBlock>>,
}
/// Hand-written `Debug` for [`BlockwiseInnerResult`].
///
/// Plain-data fields are printed in full. The three opaque diagnostics
/// (`joint_workspace`, `kkt_residual`, `active_constraints`) are rendered as
/// `Some("<…>")` / `None` presence markers, so a debug dump shows whether each
/// one was produced without requiring `Debug` on the underlying types.
impl std::fmt::Debug for BlockwiseInnerResult {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("BlockwiseInnerResult")
            .field("block_states", &self.block_states)
            .field("active_sets", &self.active_sets)
            .field("log_likelihood", &self.log_likelihood)
            .field("penalty_value", &self.penalty_value)
            .field("cycles", &self.cycles)
            .field("converged", &self.converged)
            .field("block_logdet_h", &self.block_logdet_h)
            .field("block_logdet_s", &self.block_logdet_s)
            .field("s_lambdas", &self.s_lambdas)
            .field(
                "joint_workspace",
                &self.joint_workspace.as_ref().map(|_| "<workspace>"),
            )
            // Previously omitted: without these two markers the dump gave no
            // hint whether the KKT / active-constraint diagnostics existed.
            .field(
                "kkt_residual",
                &self.kkt_residual.as_ref().map(|_| "<kkt-residual>"),
            )
            .field(
                "active_constraints",
                &self
                    .active_constraints
                    .as_ref()
                    .map(|_| "<active-constraints>"),
            )
            .finish()
    }
}
/// Warm-start seed carried between inner solves: the outer ρ it was recorded
/// at, the per-block coefficients, the per-block active sets, and optionally
/// a cached summary of the inner solve that produced it.
#[derive(Clone)]
struct ConstrainedWarmStart {
/// Outer ρ associated with this seed. Seeds rebuilt from a cached β alone
/// carry an all-zero placeholder of the right length.
rho: Array1<f64>,
/// Coefficient vector for each block, in block order.
block_beta: Vec<Array1<f64>>,
/// Per-block active-set index lists (`None` where a block has none).
active_sets: Vec<Option<Vec<usize>>>,
/// Cached inner-solve summary; `None` when only coefficients are known or
/// when the summary was deliberately stripped.
cached_inner: Option<CachedInnerMode>,
}
/// Snapshot of the non-coefficient parts of a [`BlockwiseInnerResult`], kept
/// alongside a warm start. The scalar fields are also what gets copied to and
/// from the persistent on-disk summary; the three optional diagnostics are
/// not restored from disk (they are set to `None` on load).
#[derive(Clone)]
struct CachedInnerMode {
/// Log-likelihood at the cached inner iterate.
log_likelihood: f64,
/// Penalty contribution at the cached inner iterate.
penalty_value: f64,
/// Inner cycles the cached solve used; drives the adaptive outer cap.
cycles: usize,
/// Whether the cached solve reported convergence.
converged: bool,
/// Hessian log-determinant term of the cached solve.
block_logdet_h: f64,
/// Penalty log-determinant term of the cached solve.
block_logdet_s: f64,
/// Optional joint-Hessian workspace shared with the originating result.
joint_workspace: Option<Arc<dyn ExactNewtonJointHessianWorkspace>>,
/// Optional projected KKT residual copied from the originating result.
kkt_residual: Option<crate::estimate::reml::unified::ProjectedKktResidual>,
/// Optional active linear-constraint block copied from the originating result.
active_constraints: Option<Arc<crate::estimate::reml::unified::ActiveLinearConstraintBlock>>,
}
/// Passes a warm start through only when its stored ρ has the same length as
/// the ρ currently being evaluated; any other seed is dropped.
fn screened_outer_warm_start<'a>(
    warm_start: Option<&'a ConstrainedWarmStart>,
    rho: &Array1<f64>,
) -> Option<&'a ConstrainedWarmStart> {
    match warm_start {
        Some(seed) if seed.rho.len() == rho.len() => Some(seed),
        _ => None,
    }
}
/// Returns `true` when the seed's flat ρ equals the concatenation of the
/// per-block log-λ vectors, element for element.
///
/// A total-length mismatch is rejected up front; otherwise each block is
/// compared against its slice of `seed.rho`, stopping at the first
/// difference.
fn warm_start_matches_block_log_lambdas(
    seed: &ConstrainedWarmStart,
    block_log_lambdas: &[Array1<f64>],
) -> bool {
    let total_len: usize = block_log_lambdas.iter().map(|block| block.len()).sum();
    if seed.rho.len() != total_len {
        return false;
    }
    // `cursor` walks the flat ρ vector one block at a time.
    let mut cursor = 0usize;
    block_log_lambdas.iter().all(|block| {
        let start = cursor;
        cursor += block.len();
        seed.rho.slice(s![start..cursor]) == block.view()
    })
}
fn cached_inner_mode_from_result(result: &BlockwiseInnerResult) -> CachedInnerMode {
CachedInnerMode {
log_likelihood: result.log_likelihood,
penalty_value: result.penalty_value,
cycles: result.cycles,
converged: result.converged,
block_logdet_h: result.block_logdet_h,
block_logdet_s: result.block_logdet_s,
joint_workspace: result.joint_workspace.clone(),
kkt_residual: result.kkt_residual.clone(),
active_constraints: result.active_constraints.clone(),
}
}
/// Builds a warm start from a finished inner solve: clones ρ, every block's
/// β and the active sets, and attaches the cached inner summary.
fn constrained_warm_start_from_inner(
    rho: &Array1<f64>,
    inner: &BlockwiseInnerResult,
) -> ConstrainedWarmStart {
    let mut block_beta = Vec::with_capacity(inner.block_states.len());
    for state in &inner.block_states {
        block_beta.push(state.beta.clone());
    }
    let cached_inner = cached_inner_mode_from_result(inner);
    ConstrainedWarmStart {
        rho: rho.clone(),
        block_beta,
        active_sets: inner.active_sets.clone(),
        cached_inner: Some(cached_inner),
    }
}
/// Rebuilds a warm start from a flat cached β.
///
/// The flat vector is split into per-block pieces using each spec's design
/// column count. The resulting seed has an all-zero ρ of length `rho_dim`,
/// no active sets and no cached inner summary.
///
/// # Errors
/// Fails when `beta`'s length differs from the total block width, or when
/// the non-finite guard on the cached β rejects it.
fn constrained_warm_start_from_cached_beta(
    rho_dim: usize,
    specs: &[ParameterBlockSpec],
    beta: &Array1<f64>,
) -> Result<ConstrainedWarmStart, EstimationError> {
    let expected: usize = specs.iter().map(|spec| spec.design.ncols()).sum();
    if beta.len() != expected {
        crate::bail_invalid_estim!(
            "cached inner beta has length {}, but custom-family blocks require length {}",
            beta.len(),
            expected
        );
    }
    crate::families::marginal_slope_shared::bail_if_cached_beta_non_finite(beta)?;

    // Carve the flat vector into consecutive per-block slices.
    let mut cursor = 0usize;
    let block_beta = specs
        .iter()
        .map(|spec| {
            let start = cursor;
            cursor += spec.design.ncols();
            beta.slice(s![start..cursor]).to_owned()
        })
        .collect();

    Ok(ConstrainedWarmStart {
        rho: Array1::zeros(rho_dim),
        block_beta,
        active_sets: vec![None; specs.len()],
        cached_inner: None,
    })
}
/// Penalized objective of an inner result, optionally including the
/// `+½·logdet_h` and `−½·logdet_s` REML terms.
///
/// The assembled pieces are handed to `checked_penalizedobjective`, which
/// produces the final value or an error tagged with `context`.
fn inner_penalized_objective(
    inner: &BlockwiseInnerResult,
    include_logdet_h: bool,
    include_logdet_s: bool,
    context: &str,
) -> Result<f64, String> {
    let half_logdet_h = if include_logdet_h {
        0.5 * inner.block_logdet_h
    } else {
        0.0
    };
    let half_logdet_s = if include_logdet_s {
        0.5 * inner.block_logdet_s
    } else {
        0.0
    };
    let reml_term = half_logdet_h - half_logdet_s;
    checked_penalizedobjective(
        inner.log_likelihood,
        inner.penalty_value,
        reml_term,
        context,
    )
}
/// EFS evaluation returned when the inner solve did not converge.
///
/// The cost is the inner penalized objective, the step vector is all zeros
/// (length `theta_dim`) and every optional payload is `None`. The returned
/// tuple also carries a warm start built from `inner` and the flag `false`.
///
/// # Errors
/// Propagates the error from the penalized-objective computation.
fn nonconverged_outer_efs_result(
    inner: &BlockwiseInnerResult,
    rho: &Array1<f64>,
    theta_dim: usize,
    include_logdet_h: bool,
    include_logdet_s: bool,
    context: &str,
) -> Result<
    (
        crate::solver::outer_strategy::EfsEval,
        ConstrainedWarmStart,
        bool,
    ),
    String,
> {
    let cost = inner_penalized_objective(inner, include_logdet_h, include_logdet_s, context)?;
    let eval = crate::solver::outer_strategy::EfsEval {
        cost,
        steps: vec![0.0; theta_dim],
        beta: None,
        psi_gradient: None,
        psi_indices: None,
        inner_hessian_scale: None,
        logdet_enclosure_gap: None,
    };
    let warm_start = constrained_warm_start_from_inner(rho, inner);
    Ok((eval, warm_start, false))
}
/// When ψ derivatives are in play, returns a copy of the warm start with its
/// cached inner summary removed; otherwise (or when no warm start exists)
/// returns `None`.
fn warm_start_without_cached_inner_for_psi_derivatives(
    warm_start: Option<&ConstrainedWarmStart>,
    has_psi_derivatives: bool,
) -> Option<ConstrainedWarmStart> {
    if has_psi_derivatives {
        let mut stripped = warm_start?.clone();
        stripped.cached_inner = None;
        Some(stripped)
    } else {
        None
    }
}
/// Feeds a 1-D view into the fingerprint: its length first, then every
/// element in order.
fn hash_cf_array_view(hasher: &mut Fingerprinter, values: ndarray::ArrayView1<'_, f64>) {
    hasher.write_usize(values.len());
    values
        .iter()
        .copied()
        .for_each(|entry| hasher.write_f64(entry));
}
/// Feeds a 2-D array into the fingerprint: row count, column count, then
/// every element in the array's iteration order.
fn hash_cf_array2(hasher: &mut Fingerprinter, values: &Array2<f64>) {
    let (rows, cols) = (values.nrows(), values.ncols());
    hasher.write_usize(rows);
    hasher.write_usize(cols);
    values
        .iter()
        .copied()
        .for_each(|entry| hasher.write_f64(entry));
}
/// Feeds a design matrix into the fingerprint without materializing it whole.
///
/// The shape is written first. Rows are then pulled in chunks sized to
/// roughly 8 MiB (at least 1 and at most 4096 rows per chunk) and each chunk
/// is hashed as a 2-D array.
///
/// # Errors
/// Returns a message when a row chunk cannot be produced.
fn hash_cf_design_matrix(hasher: &mut Fingerprinter, design: &DesignMatrix) -> Result<(), String> {
    const CHUNK_BYTE_BUDGET: usize = 8 * 1024 * 1024;
    const MAX_CHUNK_ROWS: usize = 4096;

    let n_rows = design.nrows();
    let n_cols = design.ncols();
    hasher.write_usize(n_rows);
    hasher.write_usize(n_cols);

    // `.max(1)` keeps the division below well-defined for a zero-column design.
    let row_bytes = n_cols.saturating_mul(std::mem::size_of::<f64>()).max(1);
    let rows_per_chunk = (CHUNK_BYTE_BUDGET / row_bytes).clamp(1, MAX_CHUNK_ROWS);

    let mut start = 0usize;
    while start < n_rows {
        let end = (start + rows_per_chunk).min(n_rows);
        let chunk = design
            .try_row_chunk(start..end)
            .map_err(|e| format!("custom-family persistent warm-start design hash failed: {e}"))?;
        hash_cf_array2(hasher, &chunk);
        start = end;
    }
    Ok(())
}
/// Feeds a penalty matrix into the fingerprint.
///
/// Each variant writes a distinguishing tag string followed by its payload;
/// the wrapper variants (`Labeled`, `Fixed`) recurse into the wrapped penalty.
fn hash_cf_penalty(hasher: &mut Fingerprinter, penalty: &PenaltyMatrix) {
    match penalty {
        PenaltyMatrix::Dense(dense) => {
            hasher.write_str("dense");
            hash_cf_array2(hasher, dense);
        }
        PenaltyMatrix::KroneckerFactored {
            left: lhs,
            right: rhs,
        } => {
            hasher.write_str("kron");
            hash_cf_array2(hasher, lhs);
            hash_cf_array2(hasher, rhs);
        }
        PenaltyMatrix::Blockwise {
            local: block,
            col_range: cols,
            total_dim: width,
        } => {
            hasher.write_str("blockwise");
            hasher.write_usize(cols.start);
            hasher.write_usize(cols.end);
            hasher.write_usize(*width);
            hash_cf_array2(hasher, block);
        }
        PenaltyMatrix::Labeled {
            label: tag,
            inner: wrapped,
        } => {
            hasher.write_str("labeled");
            hasher.write_str(tag);
            hash_cf_penalty(hasher, wrapped);
        }
        PenaltyMatrix::Fixed {
            log_lambda: pinned,
            inner: wrapped,
        } => {
            hasher.write_str("fixed");
            // The pinned log-λ is hashed through its raw bit pattern.
            hasher.write_u64(pinned.to_bits());
            hash_cf_penalty(hasher, wrapped);
        }
    }
}
/// Derives the persistent warm-start cache key (`"cf-<hex>"`) for a
/// custom-family fit.
///
/// The fingerprint folds in, in this exact order: a fixed domain tag, the
/// cache schema tag, the family's type name, the family-provided fingerprint,
/// every block spec (name, design matrix, offset, penalties, nullspace dims,
/// initial log-λ), and a subset of the fit options. The byte stream is
/// order-sensitive, so reordering any write changes every key.
///
/// Returns `None` when the family declines to provide a fingerprint or when
/// hashing a design matrix fails; callers then run without a persistent key.
///
/// NOTE(review): `joint_penalties`, `screen_initial_rho`, `seed_screening`,
/// the inner-iteration caps, `outer_eval_context` and the cache sessions are
/// not written here. `outer_score_subsample` contributes only its presence,
/// not its contents. Whether the family fingerprint (which receives
/// `options`) covers any of these cannot be determined from this function —
/// confirm before relying on the key to separate such configurations.
fn persistent_custom_family_key<F: CustomFamily + ?Sized>(
family: &F,
specs: &[ParameterBlockSpec],
options: &BlockwiseFitOptions,
) -> Option<String> {
let mut hasher = Fingerprinter::new();
hasher.write_str("gamfit-persistent-block-warm-start");
hasher.write_str(&crate::solver::persistent_warm_start::cache_schema_tag());
hasher.write_str(type_name::<F>());
// `?` here is the "family opts out of persistence" exit.
hasher.write_str(&family.persistent_warm_start_fingerprint(specs, options)?);
hasher.write_usize(specs.len());
for spec in specs {
hasher.write_str(&spec.name);
// A design that cannot be chunk-read disables the key instead of failing the fit.
hash_cf_design_matrix(&mut hasher, &spec.design).ok()?;
hash_cf_array_view(&mut hasher, spec.offset.view());
hasher.write_usize(spec.penalties.len());
for penalty in &spec.penalties {
hash_cf_penalty(&mut hasher, penalty);
}
hasher.write_usize(spec.nullspace_dims.len());
for &dim in &spec.nullspace_dims {
hasher.write_usize(dim);
}
hash_cf_array_view(&mut hasher, spec.initial_log_lambdas.view());
}
hasher.write_usize(options.inner_max_cycles);
hasher.write_f64(options.inner_tol);
hasher.write_usize(options.outer_max_iter);
hasher.write_f64(options.outer_tol);
hasher.write_f64(options.minweight);
hasher.write_f64(options.ridge_floor);
// The ridge policy is hashed through its `Debug` rendering.
hasher.write_str(&format!("{:?}", options.ridge_policy));
hasher.write_bool(options.use_remlobjective);
hasher.write_bool(options.use_outer_hessian);
hasher.write_bool(options.compute_covariance);
// Presence flag first, then the value, so `None` and `Some(x)` cannot collide.
hasher.write_bool(options.early_exit_threshold.is_some());
if let Some(value) = options.early_exit_threshold {
hasher.write_f64(value);
}
hasher.write_bool(options.outer_score_subsample.is_some());
hasher.write_bool(options.auto_outer_subsample);
Some(format!("cf-{}", hasher.finish_hex()))
}
/// Shape summary used by the persistent cache: the row count of the first
/// block's design (0 when there are no blocks), then every block's name and
/// design column count, in block order.
fn custom_family_cache_shape(specs: &[ParameterBlockSpec]) -> (usize, Vec<String>, Vec<usize>) {
    let n_rows = specs.first().map_or(0, |spec| spec.design.nrows());
    let mut block_names = Vec::with_capacity(specs.len());
    let mut block_dims = Vec::with_capacity(specs.len());
    for spec in specs {
        block_names.push(spec.name.clone());
        block_dims.push(spec.design.ncols());
    }
    (n_rows, block_names, block_dims)
}
/// Tries to restore a warm start from the persistent block cache.
///
/// Returns `(key, warm_start)`:
/// * `(None, None)` when no cache key can be derived for this fit;
/// * `(Some(key), None)` when nothing is stored under the key or the stored
///   record is incompatible with the current shape / ρ length;
/// * `(Some(key), Some(seed))` when a compatible record was restored.
fn load_persistent_custom_family_warm_start<F: CustomFamily + ?Sized>(
    family: &F,
    specs: &[ParameterBlockSpec],
    options: &BlockwiseFitOptions,
    rho_len: usize,
) -> (Option<String>, Option<ConstrainedWarmStart>) {
    let key = match persistent_custom_family_key::<F>(family, specs, options) {
        Some(key) => key,
        None => return (None, None),
    };
    let (n_rows, block_names, block_dims) = custom_family_cache_shape(specs);
    let loaded = load_block_record(&key);
    let record = match loaded {
        Some(record)
            if record.is_compatible(&key, n_rows, &block_names, &block_dims, rho_len) =>
        {
            record
        }
        _ => return (Some(key), None),
    };

    let active_sets = normalize_active_sets(record.active_sets);
    // Only the scalar summary is stored on disk. The workspace, KKT residual
    // and active-constraint block are never serialized, so a restored seed
    // carries `None` for all three and the unified evaluator's IFT correction
    // runs on its no-data branch until a fresh joint-Newton pass rebuilds them.
    let cached_inner = record.inner.map(|summary| CachedInnerMode {
        log_likelihood: summary.log_likelihood,
        penalty_value: summary.penalty_value,
        cycles: summary.cycles,
        converged: summary.converged,
        block_logdet_h: summary.block_logdet_h,
        block_logdet_s: summary.block_logdet_s,
        joint_workspace: None,
        kkt_residual: None,
        active_constraints: None,
    });
    let inner_status = match cached_inner.as_ref() {
        None => "missing",
        Some(inner) if inner.converged => "converged",
        Some(_) => "partial",
    };
    log::info!(
        "[warm-start-cache] restored custom-family persistent warm start key={key} inner={inner_status}"
    );

    let block_beta = record
        .block_beta
        .into_iter()
        .map(Array1::from_vec)
        .collect();
    let restored = ConstrainedWarmStart {
        rho: Array1::from_vec(record.rho),
        block_beta,
        active_sets,
        cached_inner,
    };
    (Some(key), Some(restored))
}
/// Converts a warm start's cached inner summary into its on-disk form.
///
/// Yields `None` when there is no cached summary or when any of the four
/// scalar quantities (log-likelihood, penalty value, both log-determinants)
/// is non-finite.
fn persistent_block_inner_summary(
    warm_start: &ConstrainedWarmStart,
) -> Option<PersistentBlockInnerSummary> {
    let cached = warm_start.cached_inner.as_ref()?;
    let scalars = [
        cached.log_likelihood,
        cached.penalty_value,
        cached.block_logdet_h,
        cached.block_logdet_s,
    ];
    if !scalars.iter().all(|value| value.is_finite()) {
        return None;
    }
    Some(PersistentBlockInnerSummary {
        log_likelihood: cached.log_likelihood,
        penalty_value: cached.penalty_value,
        cycles: cached.cycles,
        converged: cached.converged,
        block_logdet_h: cached.block_logdet_h,
        block_logdet_s: cached.block_logdet_s,
    })
}
/// Best-effort write of a warm start to the persistent block cache.
///
/// Nothing is written when there is no key, when the warm start's block
/// layout disagrees with `specs`, when any β or ρ entry is non-finite, or
/// when ρ is saturated (see the gate below). A failed store is logged as a
/// warning and otherwise ignored.
fn store_persistent_custom_family_warm_start(
    key: Option<&str>,
    specs: &[ParameterBlockSpec],
    warm_start: &ConstrainedWarmStart,
) {
    let key = match key {
        Some(key) => key,
        None => return,
    };
    let (n_rows, block_names, block_dims) = custom_family_cache_shape(specs);

    // Shape and finiteness gate.
    let layout_ok = warm_start.block_beta.len() == block_dims.len();
    let betas_ok = layout_ok
        && warm_start
            .block_beta
            .iter()
            .zip(block_dims.iter())
            .all(|(beta, dim)| beta.len() == *dim && beta.iter().all(|v| v.is_finite()));
    let rho_ok = warm_start.rho.iter().all(|v| v.is_finite());
    if !(betas_ok && rho_ok) {
        return;
    }

    // Saturation gate: a ρ that reached the outer optimizer's box
    // (|ρ_i| ≥ 9) is never persisted. Such an iterate is either sitting on a
    // legitimately active bound or is a non-converged intermediate, and in
    // both cases it is poor seed material because the load-side clamp drags
    // it back into the interior anyway (see `outer_strategy.rs`
    // `[CACHE] hit-clamp`).
    const SATURATION_THRESHOLD: f64 = 9.0;
    let saturated = warm_start
        .rho
        .iter()
        .any(|&v| v.abs() >= SATURATION_THRESHOLD);
    if saturated {
        log::debug!(
            "[warm-start-cache] skip persist custom-family key={} \
             reason=rho-saturated threshold=±{:.1} rho_inf_norm={:.3e}",
            key,
            SATURATION_THRESHOLD,
            warm_start
                .rho
                .iter()
                .fold(0.0_f64, |acc, &v| acc.max(v.abs())),
        );
        return;
    }

    let mut record =
        PersistentBlockWarmStartRecord::new(key.to_string(), n_rows, block_names, block_dims);
    record.updated_unix_secs = record.created_unix_secs;
    record.rho = warm_start.rho.to_vec();
    record.block_beta = warm_start
        .block_beta
        .iter()
        .map(|beta| beta.to_vec())
        .collect();
    record.active_sets = warm_start.active_sets.clone();
    record.inner = persistent_block_inner_summary(warm_start);
    match store_block_record(&record) {
        Ok(_) => {}
        Err(err) => {
            log::warn!("[warm-start-cache] failed to persist custom-family warm start: {err}");
        }
    }
}
/// Extra inner cycles granted on top of the previous solve's cycle count
/// when the next per-outer-iteration cap is derived.
const CUSTOM_OUTER_INNER_CAP_MARGIN: usize = 5;
/// Re-derives the shared inner-iteration cap for the next outer iteration.
///
/// Does nothing when no shared cap is installed. Otherwise the cap is set to
/// the full inner budget when there is no cached inner summary, or when the
/// gradient norm has dropped below 1 % of the first recorded norm. In every
/// other case it is derived from the previous solve: `cycles + margin` after
/// a converged solve, `max(2·cycles, cycles + margin)` after a non-converged
/// one, clamped to `[1, full budget]`.
///
/// `initial_gradient_norm` is filled in with the first finite, strictly
/// positive gradient norm seen and left untouched afterwards.
fn update_custom_outer_inner_cap_from_warm_start(
    options: &BlockwiseFitOptions,
    warm_start: &ConstrainedWarmStart,
    gradient_norm: Option<f64>,
    initial_gradient_norm: &mut Option<f64>,
) {
    let outer_cap = match options.outer_inner_max_iterations.as_ref() {
        Some(cap) => cap,
        None => return,
    };
    let full_budget = options.inner_max_cycles.max(1);
    let cached_inner = match warm_start.cached_inner.as_ref() {
        Some(cached) => cached,
        None => {
            outer_cap.store(full_budget, Ordering::Relaxed);
            return;
        }
    };

    if let Some(norm) = gradient_norm.filter(|value| value.is_finite() && *value > 0.0) {
        // Records the first usable norm as the reference; later calls reuse it.
        let initial = *initial_gradient_norm.get_or_insert(norm);
        if initial > 0.0 && norm / initial < 0.01 {
            outer_cap.store(full_budget, Ordering::Relaxed);
            return;
        }
    }

    let padded = cached_inner
        .cycles
        .saturating_add(CUSTOM_OUTER_INNER_CAP_MARGIN);
    let proposed = if cached_inner.converged {
        padded
    } else {
        cached_inner.cycles.saturating_mul(2).max(padded)
    };
    let next_cap = proposed.clamp(1, full_budget);
    outer_cap.store(next_cap, Ordering::Relaxed);
}
/// Ingredients of a finished blockwise fit, bundled so the final fit object
/// can be assembled from them (the field docs below refer to
/// `blockwise_fit_from_parts` as the consumer).
pub struct BlockwiseFitResultParts {
/// Final per-block solver states.
pub block_states: Vec<ParameterBlockState>,
/// Log-likelihood at the final iterate.
pub log_likelihood: f64,
/// Final log smoothing parameters.
pub log_lambdas: Array1<f64>,
/// Final smoothing parameters; presumably `exp(log_lambdas)` elementwise
/// — TODO confirm against the constructor's validation.
pub lambdas: Array1<f64>,
/// Conditional coefficient covariance, when it was computed.
pub covariance_conditional: Option<Array2<f64>>,
/// Penalty term of the final objective.
/// NOTE(review): the exact definition of "stable" is not visible here.
pub stable_penalty_term: f64,
/// Penalized objective at the final iterate.
pub penalized_objective: f64,
/// Number of outer iterations performed.
pub outer_iterations: usize,
/// `None` = no gradient measured at termination (cache-hit, gradient-free,
/// or trivial early-exit); `Some(g)` = measured norm. `outer_converged`
/// is the authoritative convergence signal.
pub outer_gradient_norm: Option<f64>,
/// First-order optimality certificate from the outer smoothing solve
/// (#934); `None` when no outer ran (fixed-λ, one-cycle probe) or the
/// audit could not evaluate.
pub criterion_certificate: Option<crate::solver::outer_strategy::CriterionCertificate>,
/// Inner cycles spent (presumably by the final inner solve — TODO confirm).
pub inner_cycles: usize,
/// Authoritative outer convergence flag.
pub outer_converged: bool,
/// Converged fit geometry, when available.
pub geometry: Option<FitGeometry>,
/// Effective degrees of freedom computed by the caller in the *reduced*
/// (canonical) coefficient space, where the penalized Hessian is full rank,
/// as `(edf_total, edf_by_penalty, block_edf)`. The trace edf is invariant
/// under the canonical reparameterization, so computing it in the reduced
/// space and reporting it on the raw fit is exact — and it avoids the
/// `tr((H_raw + εI)⁻¹ S_raw)` blow-up that a rank-deficient raw-lifted
/// Hessian (zero rows/cols on canonicalization-dropped directions) would
/// otherwise inject. `None` when the caller has no reduced geometry (e.g.
/// the one-cycle inner probe), in which case `blockwise_fit_from_parts`
/// falls back to computing edf from whatever geometry it was handed.
pub precomputed_edf: Option<(f64, Vec<f64>, Vec<f64>)>,
}
/// Checks that a block state's `beta` and `eta` contain only finite values.
///
/// `beta` is checked first. The first failure is returned as the validator's
/// error rendered to a `String`, tagged `"{label}.beta"` or `"{label}.eta"`.
fn validate_parameter_block_state_finiteness(
    label: &str,
    state: &ParameterBlockState,
) -> Result<(), String> {
    let beta_label = format!("{label}.beta");
    if let Err(err) = validate_all_finite_estimation(&beta_label, state.beta.iter().copied()) {
        return Err(err.to_string());
    }
    let eta_label = format!("{label}.eta");
    if let Err(err) = validate_all_finite_estimation(&eta_label, state.eta.iter().copied()) {
        return Err(err.to_string());
    }
    Ok(())
}
/// Verifies that `lambdas[i] == exp(log_lambdas[i])` for every index, up to a
/// relative tolerance of `1e-10` (absolute `1e-10` when `exp(log_lambda) < 1`).
///
/// NaN handling is explicit: a NaN in either array is reported as an
/// inconsistency. The previous form, `(lambda - expected).abs() > tolerance`,
/// evaluates to `false` for NaN and so let such pairs through silently. When
/// `exp(log_lambda)` is not finite (overflow to `+inf`), the pair is accepted
/// only if `lambda` is exactly equal to it, because the tolerance is then
/// infinite and would accept any finite `lambda`.
///
/// # Errors
/// * A `CustomFamilyError::DimensionMismatch` (converted to `String`) when
///   the two arrays differ in length.
/// * A message naming `label` and the offending index for the first
///   inconsistent pair.
fn validate_lambda_pair_consistency(
    log_lambdas: &Array1<f64>,
    lambdas: &Array1<f64>,
    label: &str,
) -> Result<(), String> {
    if log_lambdas.len() != lambdas.len() {
        return Err(CustomFamilyError::DimensionMismatch {
            reason: format!(
                "{label} length mismatch: log_lambdas={}, lambdas={}",
                log_lambdas.len(),
                lambdas.len()
            ),
        }
        .into());
    }
    for (idx, (&log_lambda, &lambda)) in log_lambdas.iter().zip(lambdas.iter()).enumerate() {
        let expected = log_lambda.exp();
        let consistent = if expected.is_finite() {
            let tolerance = 1e-10 * expected.abs().max(1.0);
            // `<=` is false for NaN, so a NaN `lambda` lands in the error branch.
            (lambda - expected).abs() <= tolerance
        } else {
            // `expected` is ±inf or NaN: only exact equality counts, and NaN
            // never compares equal.
            lambda == expected
        };
        if !consistent {
            return Err(format!(
                "{label}[{idx}] inconsistent with exp(log_lambda): got {lambda}, expected {expected}",
            ));
        }
    }
    Ok(())
}
/// Effective degrees of freedom for a converged blockwise custom-family fit,
/// computed from the joint penalized Hessian `H = X'W_HX + S(λ)` and the
/// per-penalty matrices `S_k`:
///
/// ```text
/// edf_total   = (p − Σ_k λ_k · tr(H⁻¹ S_k))                       clamped to [0, p]
/// edf_penalty = (block_cols − λ_k · tr(H⁻¹ S_k))                  clamped to [0, block_cols]
/// block_edf   = (block_cols − Σ_{k in block} λ_k · tr(H⁻¹ S_k))   clamped to [0, block_cols]
/// ```
///
/// where `block_cols` is the design column count of the block that owns
/// penalty `k`. A penalty whose `λ_k` is not strictly positive contributes a
/// zero trace term.
///
/// `S_k` here is the *unscaled* penalty (its `λ_k` factor is applied here).
/// Each `S_k.to_dense()` may come back either already embedded in the joint
/// `p × p` layout or as a local `block_cols × block_cols` matrix; the local
/// form is placed at the owning block's column range before the trace solve,
/// and any other shape is an error. The trace solve always runs in the full
/// joint space.
///
/// The custom-family path (CTN transformation-normal, Dirichlet, …) builds its
/// fit through `blockwise_fit_from_parts` and previously left `inference` at
/// `None`, so `edf_total` was unavailable for every custom family even though
/// the converged geometry already carries the penalized Hessian. This mirrors
/// the survival-path repair (`survival_transformation_edf`, #565) for the
/// blockwise engine: the same trace formula, factorized with the same
/// ridge-retry stabilization so a marginally indefinite Hessian at a boundary
/// optimum still yields a usable trace instead of dropping inference.
///
/// `edf_penalty` is returned aligned 1:1 with the flattened `lambdas`
/// (one entry per penalty across all blocks), matching the
/// `FitInference::edf_by_block` ↔ `lambdas` length invariant. The per-block
/// aggregate edf (for `FittedBlock::edf`) is the block's column count minus
/// the total `λ · trace` spent by that block's penalties, so an unpenalized
/// block contributes its full column count.
///
/// Returns `(edf_total, edf_by_penalty, block_edf)`.
///
/// # Errors
/// Returns a message when the Hessian is not square or its size differs from
/// the total block width, when `lambdas` does not have one entry per penalty,
/// when the Hessian cannot be factorized after the ridge retries, when a
/// penalty materializes to an unexpected shape, when a trace solve fails, or
/// when any resulting edf is non-finite.
fn custom_family_blockwise_edf(
penalized_hessian: &Array2<f64>,
specs: &[ParameterBlockSpec],
lambdas: &ndarray::ArrayView1<'_, f64>,
) -> Result<(f64, Vec<f64>, Vec<f64>), String> {
let p = penalized_hessian.nrows();
let total_cols: usize = specs.iter().map(|s| s.design.ncols()).sum();
if penalized_hessian.ncols() != p || total_cols != p {
return Err(format!(
"custom-family edf: penalized Hessian {}x{} inconsistent with total block width {}",
penalized_hessian.nrows(),
penalized_hessian.ncols(),
total_cols
));
}
let expected_rho: usize = specs.iter().map(|s| s.penalties.len()).sum();
if lambdas.len() != expected_rho {
return Err(format!(
"custom-family edf: lambdas length {} does not match total penalty count {}",
lambdas.len(),
expected_rho
));
}
let h_sym = SymmetricMatrix::Dense(penalized_hessian.clone());
// Sparse-aware factorization with ridge retry (mirrors estimate.rs and
// survival_transformation_edf): a boundary-constrained optimum can leave
// the penalized Hessian marginally indefinite, in which case we add the
// smallest diagonal shift that restores definiteness so the trace solve
// succeeds rather than dropping inference for the whole fit.
//
// Retry schedule: first attempt with no ridge, then `scale * 1e-10`, then
// ×10 per retry, giving up after 8 failed factorizations.
// NOTE(review): if `scale` is 0 then `min_step` is 0 and the ridge never
// grows, so every retry factorizes the same matrix. Likewise a failed
// `addridge` silently falls back to the un-ridged matrix.
let factor = {
let scale = h_sym.max_abs_diag();
let min_step = scale * 1e-10;
let mut ridge = 0.0_f64;
let mut attempts = 0_usize;
loop {
let candidate = if ridge > 0.0 {
h_sym.addridge(ridge).unwrap_or_else(|_| h_sym.clone())
} else {
h_sym.clone()
};
if let Ok(f) = candidate.factorize() {
break f;
}
attempts += 1;
if attempts >= 8 {
return Err(
"custom-family edf: penalized Hessian could not be factorized".to_string(),
);
}
ridge = if ridge <= 0.0 { min_step } else { ridge * 10.0 };
}
};
let mut edf_by_penalty = vec![0.0_f64; expected_rho];
let mut block_edf = Vec::with_capacity(specs.len());
let mut total_trace = 0.0_f64;
// Running offsets into the flattened penalty list and the joint column layout.
let mut penalty_offset = 0usize;
let mut block_col_start = 0usize;
for spec in specs.iter() {
let block_cols = spec.design.ncols();
let mut block_edf_acc = block_cols as f64;
for (local_k, penalty) in spec.penalties.iter().enumerate() {
let global_k = penalty_offset + local_k;
let lambda = lambdas[global_k];
// Embed S_k into the full p×p joint layout. `PenaltyMatrix::to_dense`
// returns the *local* block matrix for the `Dense` variant but the
// already-embedded full-width matrix for `Blockwise`/`Kronecker`, so
// dispatch on the materialized dimension: a local (block_cols-wide)
// penalty is placed at this block's column range, a full-width
// penalty is used as-is (mirrors `survival_transformation_edf`'s
// explicit block placement).
let s_local = penalty.to_dense();
let mut s_full = Array2::<f64>::zeros((p, p));
if s_local.nrows() == p && s_local.ncols() == p {
s_full.assign(&s_local);
} else if s_local.nrows() == block_cols && s_local.ncols() == block_cols {
let r = block_col_start..block_col_start + block_cols;
s_full.slice_mut(ndarray::s![r.clone(), r]).assign(&s_local);
} else {
return Err(format!(
"custom-family edf: penalty {global_k} materialized to {}x{}, expected {p}x{p} or {block_cols}x{block_cols}",
s_local.nrows(),
s_local.ncols()
));
}
// tr(H⁻¹ S_k) via H Z = S_k, summing the diagonal of Z.
let z = factor.solvemulti(&s_full).map_err(|e| {
format!("custom-family edf trace solve failed for penalty {global_k}: {e}")
})?;
let mut trace = 0.0_f64;
for d in 0..p {
trace += z[[d, d]];
}
// A non-positive (or NaN) λ contributes nothing to the trace budget.
let lam_trace = if lambda > 0.0 { lambda * trace } else { 0.0 };
total_trace += lam_trace;
// Per-penalty edf is bounded by the columns this penalty acts on,
// i.e. its block's column count (a `Blockwise` penalty reports the
// full joint width from `dim()`, so cap at `block_cols`, not `dim()`).
let penalty_cols = block_cols as f64;
let edf_k = (penalty_cols - lam_trace).clamp(0.0, penalty_cols);
edf_by_penalty[global_k] = edf_k;
// The block's edf is the column count minus the total trace this
// block's penalties spend (so multiple penalties on one block
// compose), clamped to the block's column count.
block_edf_acc -= lam_trace;
}
block_edf.push(block_edf_acc.clamp(0.0, block_cols as f64));
penalty_offset += spec.penalties.len();
block_col_start += block_cols;
}
let edf_total = (p as f64 - total_trace).clamp(0.0, p as f64);
// `f64::clamp` passes NaN through, so a NaN trace is caught here.
if !edf_total.is_finite()
|| edf_by_penalty.iter().any(|v| !v.is_finite())
|| block_edf.iter().any(|v| !v.is_finite())
{
return Err("custom-family edf: non-finite effective degrees of freedom".to_string());
}
Ok((edf_total, edf_by_penalty, block_edf))
}
/// Effective degrees of freedom evaluated in the reduced (canonical) space of
/// a converged fit, destined for `BlockwiseFitResultParts::precomputed_edf`.
///
/// In the reduced geometry the penalized Hessian is full rank and
/// `canonical.reduced_specs` hold the pulled-back penalties `T_iᵀ S_k T_i`, so
/// the trace edf is evaluated exactly there (no rank-deficiency ridge bias).
/// The trace edf does not depend on the canonical reparameterization, which is
/// why the `edf_total` / per-penalty / per-block numbers obtained here are
/// reported unchanged on the lifted raw fit.
///
/// Returns `None` when `reduced_geometry` is absent, and also when the edf
/// computation itself fails (the failure is logged as a warning). In both
/// cases the caller leaves `precomputed_edf` unset so the raw-geometry
/// fallback applies.
fn reduced_blockwise_edf(
    reduced_geometry: Option<&FitGeometry>,
    canonical: &crate::solver::identifiability_canonical::CanonicalSpecs,
    lambdas: &Array1<f64>,
) -> Option<(f64, Vec<f64>, Vec<f64>)> {
    let Some(geom) = reduced_geometry else {
        return None;
    };
    let outcome = custom_family_blockwise_edf(
        geom.penalized_hessian.as_array(),
        &canonical.reduced_specs,
        &lambdas.view(),
    );
    // A failed trace computation is downgraded to a warning; it never aborts
    // the surrounding fit.
    if let Err(err) = &outcome {
        log::warn!(
            "[custom-family inference] reduced-space effective degrees of freedom unavailable: {err}"
        );
    }
    outcome.ok()
}
/// Build a `UnifiedFitResult` from blockwise-specific fields.
pub fn blockwise_fit_from_parts(
parts: BlockwiseFitResultParts,
specs: &[ParameterBlockSpec],
) -> Result<crate::solver::estimate::UnifiedFitResult, String> {
let BlockwiseFitResultParts {
block_states,
log_likelihood,
log_lambdas,
lambdas,
covariance_conditional,
stable_penalty_term,
penalized_objective,
outer_iterations,
outer_gradient_norm,
criterion_certificate,
inner_cycles,
outer_converged,
geometry,
precomputed_edf,
} = parts;
if block_states.is_empty() {
return Err(CustomFamilyError::UnsupportedConfiguration {
reason: "blockwise fit requires at least one block state".to_string(),
}
.into());
}
ensure_finite_scalar_estimation("blockwise_fit.log_likelihood", log_likelihood)
.map_err(|e| e.to_string())?;
validate_all_finite_estimation("blockwise_fit.log_lambdas", log_lambdas.iter().copied())
.map_err(|e| e.to_string())?;
validate_all_finite_estimation("blockwise_fit.lambdas", lambdas.iter().copied())
.map_err(|e| e.to_string())?;
validate_lambda_pair_consistency(&log_lambdas, &lambdas, "blockwise_fit.lambdas")?;
ensure_finite_scalar_estimation("blockwise_fit.penalized_objective", penalized_objective)
.map_err(|e| e.to_string())?;
ensure_finite_scalar_estimation("blockwise_fit.stable_penalty_term", stable_penalty_term)
.map_err(|e| e.to_string())?;
if let Some(g) = outer_gradient_norm {
ensure_finite_scalar_estimation("blockwise_fit.outer_gradient_norm", g)
.map_err(|e| e.to_string())?;
}
if block_states.len() != specs.len() {
return Err(CustomFamilyError::DimensionMismatch {
reason: format!(
"blockwise_fit.block_states length ({}) does not match specs length ({})",
block_states.len(),
specs.len()
),
}
.into());
}
let n = specs[0].design.nrows();
let total_p = block_states
.iter()
.map(|state| state.beta.len())
.sum::<usize>();
for (idx, state) in block_states.iter().enumerate() {
validate_parameter_block_state_finiteness(
&format!("blockwise_fit.block_states[{idx}]"),
state,
)?;
let expected_rows = specs[idx].solver_design().nrows();
if state.eta.len() != expected_rows {
return Err(CustomFamilyError::DimensionMismatch { reason: format!(
"blockwise_fit.block_states[{idx}] eta length mismatch: got {}, expected {} (solver design rows)",
state.eta.len(),
expected_rows
) }.into());
}
}
if let Some(cov) = covariance_conditional.as_ref() {
validate_all_finite_estimation("blockwise_fit.covariance_conditional", cov.iter().copied())
.map_err(|e| e.to_string())?;
let (rows, cols) = cov.dim();
if rows != total_p || cols != total_p {
return Err(CustomFamilyError::DimensionMismatch {
reason: format!(
"blockwise_fit.covariance_conditional must be {}x{}, got {}x{}",
total_p, total_p, rows, cols
),
}
.into());
}
}
if let Some(geom) = geometry.as_ref() {
geom.validate_numeric_finiteness()
.map_err(|e| e.to_string())?;
let (rows, cols) = geom.penalized_hessian.dim();
if rows != total_p || cols != total_p {
return Err(CustomFamilyError::DimensionMismatch {
reason: format!(
"blockwise_fit.geometry.penalized_hessian must be {}x{}, got {}x{}",
total_p, total_p, rows, cols
),
}
.into());
}
let geom_len = geom.working_weights.len();
if geom_len != geom.working_response.len() {
return Err(CustomFamilyError::DimensionMismatch { reason: format!(
"blockwise_fit.geometry working vector length mismatch: weights={}, response={}",
geom.working_weights.len(),
geom.working_response.len(),
) }.into());
}
if geom_len != n && (n == 0 || geom_len % n != 0) {
return Err(CustomFamilyError::DimensionMismatch { reason: format!(
"blockwise_fit.geometry.working_weights length mismatch: got {geom_len}, expected {n} or a stacked multiple of {n}",
) }.into());
}
if geom.working_response.len() != n && (n == 0 || geom.working_response.len() % n != 0) {
return Err(CustomFamilyError::DimensionMismatch { reason: format!(
"blockwise_fit.geometry.working_response length mismatch: got {}, expected {n} or a stacked multiple of {n}",
geom.working_response.len(),
) }.into());
}
}
// Build unified blocks from the blockwise states.
use crate::solver::estimate::{FittedBlock, FittedLinkState, UnifiedFitResultParts};
let expected_rho: usize = specs.iter().map(|s| s.penalties.len()).sum();
if lambdas.len() != expected_rho {
return Err(CustomFamilyError::DimensionMismatch { reason: format!(
"blockwise_fit.lambdas length ({}) does not match sum of per-block penalty counts ({})",
lambdas.len(),
expected_rho
) }.into());
}
// Effective degrees of freedom and the inference block. When the
// converged geometry carries the joint penalized Hessian we compute the
// mgcv trace edf `p − Σ_k λ_k·tr(H⁻¹ S_k)` here so every custom-family fit
// (CTN transformation-normal, Dirichlet, …) reports `edf_total` /
// per-block `edf` like the standard GAM path, instead of leaving inference
// unpopulated. A factorization failure is non-fatal: the fit still returns
// with `edf=0`/`inference=None` rather than aborting, but in practice the
// ridge-retry inside `custom_family_blockwise_edf` recovers any boundary
// indefiniteness.
let (edf_total_opt, edf_by_penalty, block_edf): (Option<f64>, Vec<f64>, Vec<f64>) =
match precomputed_edf {
// Reduced-space edf supplied by the caller (the principled path:
// the trace is computed where the Hessian is full rank, then
// reported on the raw fit — exact because the trace edf is
// reparameterization-invariant).
Some((edf_total, edf_by_penalty, block_edf)) => {
(Some(edf_total), edf_by_penalty, block_edf)
}
// Fallback: compute from whatever geometry we were handed. Used
// only when the caller did not precompute (no reduced geometry);
// the ridge-retry factorization makes this robust to a marginally
// indefinite Hessian.
None => match geometry.as_ref() {
Some(geom) => {
match custom_family_blockwise_edf(
geom.penalized_hessian.as_array(),
specs,
&lambdas.view(),
) {
Ok((edf_total, edf_by_penalty, block_edf)) => {
(Some(edf_total), edf_by_penalty, block_edf)
}
Err(err) => {
log::warn!(
"[custom-family inference] effective degrees of freedom unavailable: {err}"
);
(None, Vec::new(), vec![0.0; block_states.len()])
}
}
}
None => (None, Vec::new(), vec![0.0; block_states.len()]),
},
};
let mut lambda_offset = 0usize;
let blocks: Vec<FittedBlock> = block_states
.iter()
.enumerate()
.map(|(i, bs)| {
let role = custom_family_block_role(&specs[i].name, i, block_states.len());
let k = specs[i].penalties.len();
let block_lambdas = lambdas
.slice(s![lambda_offset..lambda_offset + k])
.to_owned();
lambda_offset += k;
FittedBlock {
beta: bs.beta.clone(),
role,
edf: block_edf.get(i).copied().unwrap_or(0.0),
lambdas: block_lambdas,
}
})
.collect();
let deviance = -2.0 * log_likelihood;
// Assemble the inference block from the converged geometry. CTN and other
// custom families estimate their own likelihood scale, so the penalized
// Hessian is reported unscaled (dispersion = 1) — the EDF trace is
// dispersion-free, and downstream covariance scaling pairs `H` with the
// family's own dispersion where needed.
let inference = match (edf_total_opt, geometry.as_ref()) {
(Some(edf_total), Some(geom)) => Some(crate::solver::estimate::FitInference {
edf_by_block: edf_by_penalty,
edf_total,
smoothing_correction: None,
penalized_hessian: geom.penalized_hessian.clone(),
working_weights: geom.working_weights.clone(),
working_response: geom.working_response.clone(),
reparam_qs: None,
dispersion: crate::solver::estimate::Dispersion::Known(1.0),
beta_covariance: None,
beta_standard_errors: None,
beta_covariance_corrected: None,
beta_standard_errors_corrected: None,
beta_covariance_frequentist: None,
coefficient_influence: None,
bias_correction_beta: None,
}),
_ => None,
};
crate::solver::estimate::UnifiedFitResult::try_from_parts(UnifiedFitResultParts {
blocks,
log_lambdas: log_lambdas.clone(),
lambdas: lambdas.clone(),
likelihood_family: None,
likelihood_scale: crate::types::LikelihoodScaleMetadata::Unspecified,
log_likelihood_normalization: crate::types::LogLikelihoodNormalization::UserProvided,
log_likelihood,
deviance,
reml_score: penalized_objective,
stable_penalty_term,
penalized_objective,
outer_iterations,
outer_converged,
outer_gradient_norm,
standard_deviation: 1.0,
covariance_conditional,
covariance_corrected: None,
inference,
fitted_link: FittedLinkState::Standard(None),
geometry,
block_states,
pirls_status: crate::pirls::PirlsStatus::Converged,
max_abs_eta: 0.0,
constraint_kkt: None,
artifacts: crate::solver::estimate::FitArtifacts {
pirls: None,
criterion_certificate,
..Default::default()
},
inner_cycles,
})
.map_err(|e| e.to_string())
}
/// Combine the pieces of the penalized objective and reject non-finite totals.
///
/// The objective is `-log_likelihood + penalty_value + reml_term`. When the
/// sum is finite it is returned; otherwise a
/// `CustomFamilyError::NumericalFailure` (converted to `String`) is returned
/// whose message is prefixed with `context` and lists every input together
/// with the resulting objective.
fn checked_penalizedobjective(
    log_likelihood: f64,
    penalty_value: f64,
    reml_term: f64,
    context: &str,
) -> Result<f64, String> {
    let objective = -log_likelihood + penalty_value + reml_term;
    if !objective.is_finite() {
        let failure = CustomFamilyError::NumericalFailure {
            reason: format!(
                "{context}: non-finite penalized objective \
                 (log_likelihood={log_likelihood}, penalty_value={penalty_value}, \
                 reml_term={reml_term}, objective={objective})"
            ),
        };
        return Err(failure.into());
    }
    Ok(objective)
}
/// Per-block derivative payload for one ψ hyperparameter coordinate.
///
/// NOTE(review): the field descriptions below are inferred from the field
/// names and types only; the code that consumes them is not visible here —
/// confirm against the call sites before relying on them.
#[derive(Clone)]
pub struct CustomFamilyBlockPsiDerivative {
/// Optional penalty index associated with this coordinate.
pub penalty_index: Option<usize>,
/// Dense first-derivative matrix; presumably the design derivative w.r.t. ψ.
pub x_psi: Array2<f64>,
/// Dense first-derivative matrix; presumably the penalty derivative w.r.t. ψ.
pub s_psi: Array2<f64>,
/// Optional `(index, dense matrix)` components of the first penalty derivative.
pub s_psi_components: Option<Vec<(usize, Array2<f64>)>>,
/// Same as `s_psi_components` but stored as `PenaltyMatrix` values.
pub s_psi_penalty_components: Option<Vec<(usize, PenaltyMatrix)>>,
/// Optional dense second derivatives of the design, one matrix per entry.
pub x_psi_psi: Option<Vec<Array2<f64>>>,
/// Optional dense second derivatives of the penalty, one matrix per entry.
pub s_psi_psi: Option<Vec<Array2<f64>>>,
/// Optional per-entry `(index, dense matrix)` second-derivative components.
pub s_psi_psi_components: Option<Vec<Vec<(usize, Array2<f64>)>>>,
/// Same as `s_psi_psi_components` but stored as `PenaltyMatrix` values.
pub s_psi_psi_penalty_components: Option<Vec<Vec<(usize, PenaltyMatrix)>>>,
/// Optional shared matrix-free derivative operator.
pub(crate) implicit_operator: Option<Arc<dyn CustomFamilyPsiDerivativeOperator>>,
/// Axis index paired with `implicit_operator` (presumably the axis passed
/// to its methods — confirm).
pub implicit_axis: usize,
/// Optional group identifier for the implicit operator.
pub implicit_group_id: Option<usize>,
}
/// Shared, reference-counted nested list of derivative payloads.
/// NOTE(review): the meaning of the two nesting levels is not visible in this
/// chunk — confirm.
pub(crate) type SharedDerivativeBlocks = Arc<Vec<Vec<CustomFamilyBlockPsiDerivative>>>;
impl CustomFamilyBlockPsiDerivative {
/// Public constructor for use in tests and external consumers.
/// Sets `implicit_operator` to `None`.
pub fn new(
penalty_index: Option<usize>,
x_psi: Array2<f64>,
s_psi: Array2<f64>,
s_psi_components: Option<Vec<(usize, Array2<f64>)>>,
x_psi_psi: Option<Vec<Array2<f64>>>,
s_psi_psi: Option<Vec<Array2<f64>>>,
s_psi_psi_components: Option<Vec<Vec<(usize, Array2<f64>)>>>,
) -> Self {
Self {
penalty_index,
x_psi,
s_psi,
s_psi_components,
s_psi_penalty_components: None,
x_psi_psi,
s_psi_psi,
s_psi_psi_components,
s_psi_psi_penalty_components: None,
implicit_operator: None,
implicit_axis: 0,
implicit_group_id: None,
}
}
}
/// Operator interface for ψ-derivatives of a design matrix that can be applied
/// without holding the full derivative matrix.
///
/// Implementations in this file take vectors of length `n_data()` on the
/// "transpose" side and vectors of length `p_out()` on the "forward" side.
/// Every fallible method reports failures as `BasisError`.
pub(crate) trait CustomFamilyPsiDerivativeOperator: Send + Sync + Any {
/// Returns `self` as `&dyn Any` so callers can downcast to the concrete type.
fn as_any(&self) -> &dyn Any;
/// Number of data rows of the derivative operator.
fn n_data(&self) -> usize;
/// Number of coefficient columns of the derivative operator.
fn p_out(&self) -> usize;
/// First-derivative transpose product for `axis`: maps a length-`n_data()`
/// vector `v` to a length-`p_out()` vector.
fn transpose_mul(
&self,
axis: usize,
v: &ArrayView1<'_, f64>,
) -> Result<Array1<f64>, crate::terms::basis::BasisError>;
/// First-derivative forward product for `axis`: maps a length-`p_out()`
/// vector `u` to a length-`n_data()` vector.
fn forward_mul(
&self,
axis: usize,
u: &ArrayView1<'_, f64>,
) -> Result<Array1<f64>, crate::terms::basis::BasisError>;
/// Transpose product with the second derivative taken twice along `axis`.
fn transpose_mul_second_diag(
&self,
axis: usize,
v: &ArrayView1<'_, f64>,
) -> Result<Array1<f64>, crate::terms::basis::BasisError>;
/// Transpose product with the mixed second derivative along `axis_d` and
/// `axis_e`.
fn transpose_mul_second_cross(
&self,
axis_d: usize,
axis_e: usize,
v: &ArrayView1<'_, f64>,
) -> Result<Array1<f64>, crate::terms::basis::BasisError>;
/// Forward product with the second derivative taken twice along `axis`.
fn forward_mul_second_diag(
&self,
axis: usize,
u: &ArrayView1<'_, f64>,
) -> Result<Array1<f64>, crate::terms::basis::BasisError>;
/// Forward product with the mixed second derivative along `axis_d` and
/// `axis_e`.
fn forward_mul_second_cross(
&self,
axis_d: usize,
axis_e: usize,
u: &ArrayView1<'_, f64>,
) -> Result<Array1<f64>, crate::terms::basis::BasisError>;
/// Dense rows `rows` of the first derivative for `axis`; the implementations
/// in this file return one output row per requested row and `p_out()` columns.
fn row_chunk_first(
&self,
axis: usize,
rows: Range<usize>,
) -> Result<Array2<f64>, crate::terms::basis::BasisError>;
/// Single-row specialization of `row_chunk_first`. Default implementation
/// delegates to `row_chunk_first(axis, row..row+1)` and copies the
/// resulting row into the output buffer; implementations that can avoid
/// the temporary matrix allocation should override this method.
fn row_vector_first_into(
&self,
axis: usize,
row: usize,
mut out: ArrayViewMut1<'_, f64>,
) -> Result<(), crate::terms::basis::BasisError> {
// One-row chunk, then copy its only row into the caller's buffer.
let chunk = self.row_chunk_first(axis, row..row + 1)?;
out.assign(&chunk.row(0));
Ok(())
}
/// Dense rows `rows` of the second derivative taken twice along `axis`.
fn row_chunk_second_diag(
&self,
axis: usize,
rows: Range<usize>,
) -> Result<Array2<f64>, crate::terms::basis::BasisError>;
/// Dense rows `rows` of the mixed second derivative along `axis_d` and
/// `axis_e`.
fn row_chunk_second_cross(
&self,
axis_d: usize,
axis_e: usize,
rows: Range<usize>,
) -> Result<Array2<f64>, crate::terms::basis::BasisError>;
/// Optional upcast to the dense materialization surface. Production exact
/// paths should prefer the analytic matvec / row-chunk methods above and
/// avoid forming the full derivative matrix; implementations that *do*
/// support dense materialization (used by diagnostics, tests, and
/// small-data fallbacks) should override this to return `Some(self)`.
fn as_materializable(&self) -> Option<&dyn MaterializablePsiDerivativeOperator> {
None
}
}
/// Diagnostic / small-data extension that exposes dense materialization of
/// `\partial X / \partial \psi`. Production exact-Hessian code MUST NOT depend
/// on dense second-derivative materialization; second-order paths use the
/// row-chunk and matvec methods on [`CustomFamilyPsiDerivativeOperator`].
pub(crate) trait MaterializablePsiDerivativeOperator:
CustomFamilyPsiDerivativeOperator
{
/// Returns the full dense first-derivative matrix for `axis`, or a
/// `BasisError` when it cannot be formed.
fn materialize_first(
&self,
axis: usize,
) -> Result<Array2<f64>, crate::terms::basis::BasisError>;
}
// Adapter: every trait method forwards to the function of the same name
// reached through the fully qualified
// `crate::terms::basis::ImplicitDesignPsiDerivative::` path.
// NOTE(review): this presumably resolves to inherent methods on that type (if
// it resolved to the trait method instead, each call would recurse) — confirm
// that the inherent methods exist with matching signatures.
impl CustomFamilyPsiDerivativeOperator for crate::terms::basis::ImplicitDesignPsiDerivative {
fn as_any(&self) -> &dyn Any {
self
}
fn n_data(&self) -> usize {
crate::terms::basis::ImplicitDesignPsiDerivative::n_data(self)
}
fn p_out(&self) -> usize {
crate::terms::basis::ImplicitDesignPsiDerivative::p_out(self)
}
fn transpose_mul(
&self,
axis: usize,
v: &ArrayView1<'_, f64>,
) -> Result<Array1<f64>, crate::terms::basis::BasisError> {
crate::terms::basis::ImplicitDesignPsiDerivative::transpose_mul(self, axis, v)
}
fn forward_mul(
&self,
axis: usize,
u: &ArrayView1<'_, f64>,
) -> Result<Array1<f64>, crate::terms::basis::BasisError> {
crate::terms::basis::ImplicitDesignPsiDerivative::forward_mul(self, axis, u)
}
fn row_chunk_first(
&self,
axis: usize,
rows: Range<usize>,
) -> Result<Array2<f64>, crate::terms::basis::BasisError> {
// The callee is first bound to an explicitly typed fn pointer, which pins
// the exact signature being called before forwarding the arguments.
let f: fn(
&crate::terms::basis::ImplicitDesignPsiDerivative,
usize,
Range<usize>,
) -> Result<Array2<f64>, crate::terms::basis::BasisError> =
crate::terms::basis::ImplicitDesignPsiDerivative::row_chunk_first;
f(self, axis, rows)
}
fn row_vector_first_into(
&self,
axis: usize,
row: usize,
out: ArrayViewMut1<'_, f64>,
) -> Result<(), crate::terms::basis::BasisError> {
// Overrides the trait default (which goes through `row_chunk_first`) by
// forwarding to the type's own single-row routine.
crate::terms::basis::ImplicitDesignPsiDerivative::row_vector_first_into(
self, axis, row, out,
)
}
fn row_chunk_second_diag(
&self,
axis: usize,
rows: Range<usize>,
) -> Result<Array2<f64>, crate::terms::basis::BasisError> {
// Same typed fn-pointer forwarding as `row_chunk_first`.
let f: fn(
&crate::terms::basis::ImplicitDesignPsiDerivative,
usize,
Range<usize>,
) -> Result<Array2<f64>, crate::terms::basis::BasisError> =
crate::terms::basis::ImplicitDesignPsiDerivative::row_chunk_second_diag;
f(self, axis, rows)
}
fn row_chunk_second_cross(
&self,
axis_d: usize,
axis_e: usize,
rows: Range<usize>,
) -> Result<Array2<f64>, crate::terms::basis::BasisError> {
// Same typed fn-pointer forwarding as `row_chunk_first`, with two axes.
let f: fn(
&crate::terms::basis::ImplicitDesignPsiDerivative,
usize,
usize,
Range<usize>,
) -> Result<Array2<f64>, crate::terms::basis::BasisError> =
crate::terms::basis::ImplicitDesignPsiDerivative::row_chunk_second_cross;
f(self, axis_d, axis_e, rows)
}
fn transpose_mul_second_diag(
&self,
axis: usize,
v: &ArrayView1<'_, f64>,
) -> Result<Array1<f64>, crate::terms::basis::BasisError> {
crate::terms::basis::ImplicitDesignPsiDerivative::transpose_mul_second_diag(self, axis, v)
}
fn transpose_mul_second_cross(
&self,
axis_d: usize,
axis_e: usize,
v: &ArrayView1<'_, f64>,
) -> Result<Array1<f64>, crate::terms::basis::BasisError> {
crate::terms::basis::ImplicitDesignPsiDerivative::transpose_mul_second_cross(
self, axis_d, axis_e, v,
)
}
fn forward_mul_second_diag(
&self,
axis: usize,
u: &ArrayView1<'_, f64>,
) -> Result<Array1<f64>, crate::terms::basis::BasisError> {
crate::terms::basis::ImplicitDesignPsiDerivative::forward_mul_second_diag(self, axis, u)
}
fn forward_mul_second_cross(
&self,
axis_d: usize,
axis_e: usize,
u: &ArrayView1<'_, f64>,
) -> Result<Array1<f64>, crate::terms::basis::BasisError> {
crate::terms::basis::ImplicitDesignPsiDerivative::forward_mul_second_cross(
self, axis_d, axis_e, u,
)
}
// This type supports dense materialization, so the upcast is available.
fn as_materializable(&self) -> Option<&dyn MaterializablePsiDerivativeOperator> {
Some(self)
}
}
// Dense-materialization surface for the implicit design derivative; forwards
// to the function of the same name reached through the fully qualified type
// path (NOTE(review): presumably the inherent method — confirm).
impl MaterializablePsiDerivativeOperator for crate::terms::basis::ImplicitDesignPsiDerivative {
fn materialize_first(
&self,
axis: usize,
) -> Result<Array2<f64>, crate::terms::basis::BasisError> {
crate::terms::basis::ImplicitDesignPsiDerivative::materialize_first(self, axis)
}
}
/// Wraps a block-local implicit ψ-derivative operator and presents it at the
/// full joint coefficient width: local columns occupy `global_range`, every
/// other column is zero.
pub(crate) struct EmbeddedImplicitPsiDerivativeOperator {
/// Shared block-local operator; its `p_out()` equals `global_range.len()`
/// (checked by `new`).
base: Arc<crate::terms::basis::ImplicitDesignPsiDerivative>,
/// Width of the full joint coefficient vector; reported as `p_out()`.
total_p: usize,
/// Column range the local block occupies inside the joint layout.
global_range: Range<usize>,
}
impl EmbeddedImplicitPsiDerivativeOperator {
    /// Wrap `base` so that it acts on columns `global_range` of a joint
    /// coefficient vector of width `total_p`.
    ///
    /// # Errors
    ///
    /// Returns a `CustomFamilyError::DimensionMismatch` message when the width
    /// of `base` differs from `global_range.len()`, or when `global_range`
    /// ends past `total_p`.
    pub(crate) fn new(
        base: Arc<crate::terms::basis::ImplicitDesignPsiDerivative>,
        global_range: Range<usize>,
        total_p: usize,
    ) -> Result<Self, String> {
        let local_width = base.p_out();
        let range_width = global_range.len();
        if local_width != range_width {
            let mismatch = CustomFamilyError::DimensionMismatch {
                reason: format!(
                    "embedded implicit psi operator width mismatch: got {}, expected {}",
                    local_width, range_width
                ),
            };
            return Err(mismatch.into());
        }
        if global_range.end > total_p {
            let mismatch = CustomFamilyError::DimensionMismatch {
                reason: format!(
                    "embedded implicit psi operator range {}..{} exceeds total width {total_p}",
                    global_range.start, global_range.end
                ),
            };
            return Err(mismatch.into());
        }
        Ok(Self {
            base,
            total_p,
            global_range,
        })
    }

    /// Scatter a block-local vector into a zero vector of length `total_p`,
    /// placing it at `global_range`.
    fn embed_vector(&self, local: Array1<f64>) -> Array1<f64> {
        let mut full = Array1::<f64>::zeros(self.total_p);
        let mut window = full.slice_mut(ndarray::s![self.global_range.clone()]);
        window.assign(&local);
        full
    }

    /// Extract the block-local coefficients from a joint coefficient vector.
    ///
    /// Fails with `BasisError::Other` (message prefixed by `context`) when `u`
    /// is not exactly `total_p` long.
    fn local_coeffs(
        &self,
        u: &ArrayView1<'_, f64>,
        context: &str,
    ) -> Result<Array1<f64>, crate::terms::basis::BasisError> {
        if u.len() == self.total_p {
            let window = u.slice(ndarray::s![self.global_range.clone()]);
            Ok(window.to_owned())
        } else {
            Err(crate::terms::basis::BasisError::Other(format!(
                "{context} expected coefficient length {}, got {}",
                self.total_p,
                u.len()
            )))
        }
    }
}
/// Joint-width view of the wrapped block-local operator.
///
/// Transpose products are computed by the base operator and scattered into a
/// zero vector of length `total_p`; forward products first extract the
/// block-local coefficients from the joint vector; row chunks are embedded
/// into `total_p` columns through `EmbeddedColumnBlock`.
impl CustomFamilyPsiDerivativeOperator for EmbeddedImplicitPsiDerivativeOperator {
    fn as_any(&self) -> &dyn Any {
        self
    }

    /// Row count of the wrapped operator.
    fn n_data(&self) -> usize {
        self.base.n_data()
    }

    /// Joint coefficient width (not the local block width).
    fn p_out(&self) -> usize {
        self.total_p
    }

    fn transpose_mul(
        &self,
        axis: usize,
        v: &ArrayView1<'_, f64>,
    ) -> Result<Array1<f64>, crate::terms::basis::BasisError> {
        let local = self.base.transpose_mul(axis, v)?;
        Ok(self.embed_vector(local))
    }

    fn forward_mul(
        &self,
        axis: usize,
        u: &ArrayView1<'_, f64>,
    ) -> Result<Array1<f64>, crate::terms::basis::BasisError> {
        let local = self.local_coeffs(u, "embedded implicit psi forward_mul")?;
        self.base.forward_mul(axis, &local.view())
    }

    fn transpose_mul_second_diag(
        &self,
        axis: usize,
        v: &ArrayView1<'_, f64>,
    ) -> Result<Array1<f64>, crate::terms::basis::BasisError> {
        let local = self.base.transpose_mul_second_diag(axis, v)?;
        Ok(self.embed_vector(local))
    }

    fn transpose_mul_second_cross(
        &self,
        axis_d: usize,
        axis_e: usize,
        v: &ArrayView1<'_, f64>,
    ) -> Result<Array1<f64>, crate::terms::basis::BasisError> {
        let local = self.base.transpose_mul_second_cross(axis_d, axis_e, v)?;
        Ok(self.embed_vector(local))
    }

    fn forward_mul_second_diag(
        &self,
        axis: usize,
        u: &ArrayView1<'_, f64>,
    ) -> Result<Array1<f64>, crate::terms::basis::BasisError> {
        let local = self.local_coeffs(u, "embedded implicit psi forward_mul_second_diag")?;
        self.base.forward_mul_second_diag(axis, &local.view())
    }

    fn forward_mul_second_cross(
        &self,
        axis_d: usize,
        axis_e: usize,
        u: &ArrayView1<'_, f64>,
    ) -> Result<Array1<f64>, crate::terms::basis::BasisError> {
        let local = self.local_coeffs(u, "embedded implicit psi forward_mul_second_cross")?;
        self.base
            .forward_mul_second_cross(axis_d, axis_e, &local.view())
    }

    fn row_chunk_first(
        &self,
        axis: usize,
        rows: Range<usize>,
    ) -> Result<Array2<f64>, crate::terms::basis::BasisError> {
        let local = self.base.row_chunk_first(axis, rows)?;
        Ok(EmbeddedColumnBlock::new(&local, self.global_range.clone(), self.total_p).materialize())
    }

    /// Writes one joint-width derivative row into `out`.
    ///
    /// `out` must be exactly `total_p` long. A mismatched buffer is reported
    /// as `BasisError::Other`, matching the coefficient-length check used by
    /// the forward products, instead of panicking in the slice (too short) or
    /// being silently accepted (too long).
    fn row_vector_first_into(
        &self,
        axis: usize,
        row: usize,
        mut out: ArrayViewMut1<'_, f64>,
    ) -> Result<(), crate::terms::basis::BasisError> {
        if out.len() != self.total_p {
            return Err(crate::terms::basis::BasisError::Other(format!(
                "embedded implicit psi row_vector_first_into expected output length {}, got {}",
                self.total_p,
                out.len()
            )));
        }
        // Columns outside the block are zero; the base operator fills the rest.
        out.fill(0.0);
        let local_slice = out.slice_mut(ndarray::s![self.global_range.clone()]);
        self.base.row_vector_first_into(axis, row, local_slice)
    }

    fn row_chunk_second_diag(
        &self,
        axis: usize,
        rows: Range<usize>,
    ) -> Result<Array2<f64>, crate::terms::basis::BasisError> {
        let local = self.base.row_chunk_second_diag(axis, rows)?;
        Ok(EmbeddedColumnBlock::new(&local, self.global_range.clone(), self.total_p).materialize())
    }

    fn row_chunk_second_cross(
        &self,
        axis_d: usize,
        axis_e: usize,
        rows: Range<usize>,
    ) -> Result<Array2<f64>, crate::terms::basis::BasisError> {
        let local = self.base.row_chunk_second_cross(axis_d, axis_e, rows)?;
        Ok(EmbeddedColumnBlock::new(&local, self.global_range.clone(), self.total_p).materialize())
    }

    /// Dense materialization is supported, so the upcast is available.
    fn as_materializable(&self) -> Option<&dyn MaterializablePsiDerivativeOperator> {
        Some(self)
    }
}
impl MaterializablePsiDerivativeOperator for EmbeddedImplicitPsiDerivativeOperator {
    /// Materializes the base operator's dense first derivative for `axis` and
    /// embeds it at `global_range` inside a `total_p`-column matrix.
    fn materialize_first(
        &self,
        axis: usize,
    ) -> Result<Array2<f64>, crate::terms::basis::BasisError> {
        let local = self.base.materialize_first(axis)?;
        let embedded = EmbeddedColumnBlock::new(&local, self.global_range.clone(), self.total_p);
        Ok(embedded.materialize())
    }
}
/// Non-allocating zero operator for `\partial X / \partial \psi` derivative
/// blocks whose ψ coordinate does not move the design matrix at all (e.g.
/// the spatial-adaptive overlay's mass / tension / stiffness / ε
/// hyperparameters, which act through the penalty stack alone).
///
/// All matvec/transpose_mul methods return zero vectors of the correct
/// length, all row-chunk methods return chunk-sized zero matrices. The
/// operator never allocates an `(n, p)` dense buffer, which saves ~1.45 GiB
/// at the large-scale spatial-adaptive overlay (n ≈ 320 000, p ≈ 101,
/// six hyperparameters).
pub(crate) struct ZeroPsiDerivativeOperator {
/// Number of data rows; reported by `n_data()`.
n: usize,
/// Number of coefficient columns; reported by `p_out()`.
p: usize,
}
impl ZeroPsiDerivativeOperator {
pub(crate) fn new(n: usize, p: usize) -> Self {
Self { n, p }
}
}
impl CustomFamilyPsiDerivativeOperator for ZeroPsiDerivativeOperator {
fn as_any(&self) -> &dyn Any {
self
}
fn n_data(&self) -> usize {
self.n
}
fn p_out(&self) -> usize {
self.p
}
fn transpose_mul(
&self,
idx: usize,
v: &ArrayView1<'_, f64>,
) -> Result<Array1<f64>, crate::terms::basis::BasisError> {
assert!(idx < usize::MAX);
assert_eq!(v.len(), self.n, "zero psi transpose_mul length mismatch");
Ok(Array1::<f64>::zeros(self.p))
}
fn forward_mul(
&self,
idx: usize,
u: &ArrayView1<'_, f64>,
) -> Result<Array1<f64>, crate::terms::basis::BasisError> {
assert!(idx < usize::MAX);
assert_eq!(u.len(), self.p, "zero psi forward_mul length mismatch");
Ok(Array1::<f64>::zeros(self.n))
}
fn transpose_mul_second_diag(
&self,
idx: usize,
v: &ArrayView1<'_, f64>,
) -> Result<Array1<f64>, crate::terms::basis::BasisError> {
assert!(idx < usize::MAX);
assert_eq!(
v.len(),
self.n,
"zero psi transpose_mul_second_diag length mismatch"
);
Ok(Array1::<f64>::zeros(self.p))
}
fn transpose_mul_second_cross(
&self,
idx: usize,
idx2: usize,
v: &ArrayView1<'_, f64>,
) -> Result<Array1<f64>, crate::terms::basis::BasisError> {
assert!(idx < usize::MAX);
assert!(idx2 < usize::MAX);
assert_eq!(
v.len(),
self.n,
"zero psi transpose_mul_second_cross length mismatch"
);
Ok(Array1::<f64>::zeros(self.p))
}
fn forward_mul_second_diag(
&self,
idx: usize,
u: &ArrayView1<'_, f64>,
) -> Result<Array1<f64>, crate::terms::basis::BasisError> {
assert!(idx < usize::MAX);
assert_eq!(
u.len(),
self.p,
"zero psi forward_mul_second_diag length mismatch"
);
Ok(Array1::<f64>::zeros(self.n))
}
fn forward_mul_second_cross(
&self,
idx: usize,
idx2: usize,
u: &ArrayView1<'_, f64>,
) -> Result<Array1<f64>, crate::terms::basis::BasisError> {
assert!(idx < usize::MAX);
assert!(idx2 < usize::MAX);
assert_eq!(
u.len(),
self.p,
"zero psi forward_mul_second_cross length mismatch"
);
Ok(Array1::<f64>::zeros(self.n))
}
fn row_chunk_first(
&self,
idx: usize,
rows: Range<usize>,
) -> Result<Array2<f64>, crate::terms::basis::BasisError> {
assert!(idx < usize::MAX);
assert!(
rows.start <= rows.end && rows.end <= self.n,
"zero psi row_chunk_first row range out of bounds"
);
Ok(Array2::<f64>::zeros((rows.end - rows.start, self.p)))
}
fn row_vector_first_into(
&self,
idx: usize,
row: usize,
mut out: ArrayViewMut1<'_, f64>,
) -> Result<(), crate::terms::basis::BasisError> {
assert!(idx < usize::MAX);
assert!(
row < self.n,
"zero psi row_vector_first_into row out of bounds"
);
assert_eq!(
out.len(),
self.p,
"zero psi row_vector_first_into output length mismatch"
);
out.fill(0.0);
Ok(())
}
fn row_chunk_second_diag(
&self,
idx: usize,
rows: Range<usize>,
) -> Result<Array2<f64>, crate::terms::basis::BasisError> {
assert!(idx < usize::MAX);
assert!(
rows.start <= rows.end && rows.end <= self.n,
"zero psi row_chunk_second_diag row range out of bounds"
);
Ok(Array2::<f64>::zeros((rows.end - rows.start, self.p)))
}
fn row_chunk_second_cross(
&self,
idx: usize,
idx2: usize,
rows: Range<usize>,
) -> Result<Array2<f64>, crate::terms::basis::BasisError> {
assert!(idx < usize::MAX);
assert!(idx2 < usize::MAX);
assert!(
rows.start <= rows.end && rows.end <= self.n,
"zero psi row_chunk_second_cross row range out of bounds"
);
Ok(Array2::<f64>::zeros((rows.end - rows.start, self.p)))
}
}
/// Vertically concatenate dense matrices that share a column count.
///
/// The output has as many columns as the first block (zero when `blocks` is
/// empty) and the sum of all block row counts as rows; blocks are copied in
/// order, top to bottom.
///
/// # Panics
///
/// Panics when a block's column count differs from the first block's.
fn stack_dense_row_blocks(blocks: &[Array2<f64>]) -> Array2<f64> {
    let height: usize = blocks.iter().map(|block| block.nrows()).sum();
    let width = blocks.first().map_or(0, |block| block.ncols());
    let mut out = Array2::<f64>::zeros((height, width));
    let mut offset = 0usize;
    for chunk in blocks.iter() {
        assert_eq!(chunk.ncols(), width);
        let next = offset + chunk.nrows();
        let mut window = out.slice_mut(ndarray::s![offset..next, ..]);
        window.assign(chunk);
        offset = next;
    }
    out
}
/// Dense, block-local ψ-derivative matrices for a single axis, presented at
/// the full joint coefficient width.
struct EmbeddedDensePsiDerivativeOperator {
/// The only axis this operator accepts; `validate_axis` rejects any other.
axis: usize,
/// Width of the full joint coefficient vector; reported as `p_out()`.
total_p: usize,
/// Column range the local block occupies inside the joint layout.
global_range: Range<usize>,
/// Block-local first-derivative matrix; its row count is `n_data()`.
first_local: Array2<f64>,
/// Block-local second-derivative matrix used by the `*_second_diag` methods.
second_diag_local: Array2<f64>,
/// Block-local mixed second-derivative matrices, keyed by the other axis
/// (the `axis_e` argument of the `*_second_cross` methods).
second_cross_local: HashMap<usize, Array2<f64>>,
}
impl EmbeddedDensePsiDerivativeOperator {
    /// Build an embedded dense operator for `axis`.
    ///
    /// # Errors
    ///
    /// Returns a `CustomFamilyError::DimensionMismatch` message when
    /// * `first_local`, `second_diag_local` or any entry of
    ///   `second_cross_local` does not have `global_range.len()` columns, or
    /// * `global_range` ends past `total_p`. Without this check the operator
    ///   would be constructed successfully and only panic later, when a vector
    ///   is embedded into / sliced out of the joint layout.
    fn new(
        axis: usize,
        total_p: usize,
        global_range: Range<usize>,
        first_local: Array2<f64>,
        second_diag_local: Array2<f64>,
        second_cross_local: HashMap<usize, Array2<f64>>,
    ) -> Result<Self, String> {
        let local_p = global_range.len();
        if first_local.ncols() != local_p {
            return Err(CustomFamilyError::DimensionMismatch {
                reason: format!(
                    "embedded dense psi operator first-derivative width mismatch: got {}, expected {local_p}",
                    first_local.ncols()
                ),
            }
            .into());
        }
        if second_diag_local.ncols() != local_p {
            return Err(CustomFamilyError::DimensionMismatch {
                reason: format!(
                    "embedded dense psi operator second-diag width mismatch: got {}, expected {local_p}",
                    second_diag_local.ncols()
                ),
            }
            .into());
        }
        for (cross_axis, local) in &second_cross_local {
            if local.ncols() != local_p {
                return Err(CustomFamilyError::DimensionMismatch {
                    reason: format!(
                        "embedded dense psi operator cross axis {cross_axis} width mismatch: got {}, expected {local_p}",
                        local.ncols()
                    ),
                }
                .into());
            }
        }
        // Same bound the embedded implicit operator enforces: the block must
        // fit inside the joint coefficient layout.
        if global_range.end > total_p {
            return Err(CustomFamilyError::DimensionMismatch {
                reason: format!(
                    "embedded dense psi operator range {}..{} exceeds total width {total_p}",
                    global_range.start, global_range.end
                ),
            }
            .into());
        }
        Ok(Self {
            axis,
            total_p,
            global_range,
            first_local,
            second_diag_local,
            second_cross_local,
        })
    }

    /// Accepts only the axis this operator was built for; any other axis is a
    /// `BasisError::Other` whose message is prefixed by `context`.
    fn validate_axis(
        &self,
        axis: usize,
        context: &str,
    ) -> Result<(), crate::terms::basis::BasisError> {
        if axis == self.axis {
            Ok(())
        } else {
            Err(crate::terms::basis::BasisError::Other(format!(
                "{context} expected axis {}, got {axis}",
                self.axis
            )))
        }
    }

    /// Scatter a block-local vector into a zero vector of length `total_p`,
    /// placing it at `global_range`.
    fn embed_vector(&self, local: Array1<f64>) -> Array1<f64> {
        let mut out = Array1::<f64>::zeros(self.total_p);
        out.slice_mut(ndarray::s![self.global_range.clone()])
            .assign(&local);
        out
    }

    /// Extract the block-local coefficients from a joint coefficient vector.
    ///
    /// Fails with `BasisError::Other` (message prefixed by `context`) when `u`
    /// is not exactly `total_p` long.
    fn local_coeffs(
        &self,
        u: &ArrayView1<'_, f64>,
        context: &str,
    ) -> Result<Array1<f64>, crate::terms::basis::BasisError> {
        if u.len() != self.total_p {
            return Err(crate::terms::basis::BasisError::Other(format!(
                "{context} expected coefficient length {}, got {}",
                self.total_p,
                u.len()
            )));
        }
        Ok(u.slice(ndarray::s![self.global_range.clone()]).to_owned())
    }

    /// Look up the mixed second-derivative matrix stored for `axis_e`.
    ///
    /// Fails with `BasisError::Other` (message prefixed by `context`) when no
    /// matrix was supplied for that axis.
    fn cross_local(
        &self,
        axis_e: usize,
        context: &str,
    ) -> Result<&Array2<f64>, crate::terms::basis::BasisError> {
        self.second_cross_local.get(&axis_e).ok_or_else(|| {
            crate::terms::basis::BasisError::Other(format!(
                "{context} is missing cross-derivative data for axis {}",
                axis_e
            ))
        })
    }
}
impl CustomFamilyPsiDerivativeOperator for EmbeddedDensePsiDerivativeOperator {
fn as_any(&self) -> &dyn Any {
self
}
fn n_data(&self) -> usize {
self.first_local.nrows()
}
fn p_out(&self) -> usize {
self.total_p
}
fn transpose_mul(
&self,
axis: usize,
v: &ArrayView1<'_, f64>,
) -> Result<Array1<f64>, crate::terms::basis::BasisError> {
self.validate_axis(axis, "embedded dense psi transpose_mul")?;
if v.len() != self.n_data() {
return Err(crate::terms::basis::BasisError::Other(format!(
"embedded dense psi transpose_mul expected {} rows, got {}",
self.n_data(),
v.len()
)));
}
Ok(self.embed_vector(crate::faer_ndarray::fast_atv(&self.first_local, v)))
}
fn forward_mul(
&self,
axis: usize,
u: &ArrayView1<'_, f64>,
) -> Result<Array1<f64>, crate::terms::basis::BasisError> {
self.validate_axis(axis, "embedded dense psi forward_mul")?;
Ok(self
.first_local
.dot(&self.local_coeffs(u, "embedded dense psi forward_mul")?))
}
fn transpose_mul_second_diag(
&self,
axis: usize,
v: &ArrayView1<'_, f64>,
) -> Result<Array1<f64>, crate::terms::basis::BasisError> {
self.validate_axis(axis, "embedded dense psi transpose_mul_second_diag")?;
if v.len() != self.second_diag_local.nrows() {
return Err(crate::terms::basis::BasisError::Other(format!(
"embedded dense psi transpose_mul_second_diag expected {} rows, got {}",
self.second_diag_local.nrows(),
v.len()
)));
}
Ok(self.embed_vector(crate::faer_ndarray::fast_atv(&self.second_diag_local, v)))
}
fn transpose_mul_second_cross(
&self,
axis_d: usize,
axis_e: usize,
v: &ArrayView1<'_, f64>,
) -> Result<Array1<f64>, crate::terms::basis::BasisError> {
self.validate_axis(axis_d, "embedded dense psi transpose_mul_second_cross")?;
let local = self.cross_local(axis_e, "embedded dense psi transpose_mul_second_cross")?;
if v.len() != local.nrows() {
return Err(crate::terms::basis::BasisError::Other(format!(
"embedded dense psi transpose_mul_second_cross expected {} rows, got {}",
local.nrows(),
v.len()
)));
}
Ok(self.embed_vector(crate::faer_ndarray::fast_atv(local, v)))
}
fn forward_mul_second_diag(
&self,
axis: usize,
u: &ArrayView1<'_, f64>,
) -> Result<Array1<f64>, crate::terms::basis::BasisError> {
self.validate_axis(axis, "embedded dense psi forward_mul_second_diag")?;
Ok(self
.second_diag_local
.dot(&self.local_coeffs(u, "embedded dense psi forward_mul_second_diag")?))
}
fn forward_mul_second_cross(
&self,
axis_d: usize,
axis_e: usize,
u: &ArrayView1<'_, f64>,
) -> Result<Array1<f64>, crate::terms::basis::BasisError> {
self.validate_axis(axis_d, "embedded dense psi forward_mul_second_cross")?;
Ok(self
.cross_local(axis_e, "embedded dense psi forward_mul_second_cross")?
.dot(&self.local_coeffs(u, "embedded dense psi forward_mul_second_cross")?))
}
fn row_chunk_first(
&self,
axis: usize,
rows: Range<usize>,
) -> Result<Array2<f64>, crate::terms::basis::BasisError> {
self.validate_axis(axis, "embedded dense psi row_chunk_first")?;
let local = self.first_local.slice(ndarray::s![rows, ..]).to_owned();
Ok(EmbeddedColumnBlock::new(&local, self.global_range.clone(), self.total_p).materialize())
}
fn row_vector_first_into(
&self,
axis: usize,
row: usize,
mut out: ArrayViewMut1<'_, f64>,
) -> Result<(), crate::terms::basis::BasisError> {
self.validate_axis(axis, "embedded dense psi row_vector_first_into")?;
if row >= self.first_local.nrows() {
return Err(crate::terms::basis::BasisError::Other(format!(
"embedded dense psi row_vector_first_into row {row} out of bounds for {}",
self.first_local.nrows()
)));
}
if out.len() != self.total_p {
return Err(crate::terms::basis::BasisError::Other(format!(
"embedded dense psi row_vector_first_into expected length {}, got {}",
self.total_p,
out.len()
)));
}
out.fill(0.0);
out.slice_mut(ndarray::s![self.global_range.clone()])
.assign(&self.first_local.row(row));
Ok(())
}
fn row_chunk_second_diag(
&self,
axis: usize,
rows: Range<usize>,
) -> Result<Array2<f64>, crate::terms::basis::BasisError> {
self.validate_axis(axis, "embedded dense psi row_chunk_second_diag")?;
let local = self
.second_diag_local
.slice(ndarray::s![rows, ..])
.to_owned();
Ok(EmbeddedColumnBlock::new(&local, self.global_range.clone(), self.total_p).materialize())
}
fn row_chunk_second_cross(
&self,
axis_d: usize,
axis_e: usize,
rows: Range<usize>,
) -> Result<Array2<f64>, crate::terms::basis::BasisError> {
self.validate_axis(axis_d, "embedded dense psi row_chunk_second_cross")?;
let local = self
.cross_local(axis_e, "embedded dense psi row_chunk_second_cross")?
.slice(ndarray::s![rows, ..])
.to_owned();
Ok(EmbeddedColumnBlock::new(&local, self.global_range.clone(), self.total_p).materialize())
}
fn as_materializable(&self) -> Option<&dyn MaterializablePsiDerivativeOperator> {
Some(self)
}
}
impl MaterializablePsiDerivativeOperator for EmbeddedDensePsiDerivativeOperator {
    /// Builds the dense first-derivative design with the local block placed in
    /// the `global_range` columns of a `total_p`-wide matrix.
    ///
    /// # Errors
    /// Fails when `axis` is not this operator's axis.
    fn materialize_first(
        &self,
        axis: usize,
    ) -> Result<Array2<f64>, crate::terms::basis::BasisError> {
        self.validate_axis(axis, "embedded dense psi materialize_first")?;
        let columns = self.global_range.clone();
        let embedded = EmbeddedColumnBlock::new(&self.first_local, columns, self.total_p);
        Ok(embedded.materialize())
    }
}
/// Builds a shared embedded dense psi-derivative operator from borrowed blocks.
///
/// All matrices are cloned into the new operator. A missing
/// `second_cross_local` list yields an empty cross-derivative collection.
///
/// # Errors
/// Propagates any error reported by `EmbeddedDensePsiDerivativeOperator::new`.
pub(crate) fn build_embedded_dense_psi_operator(
    first_local: &Array2<f64>,
    second_diag_local: &Array2<f64>,
    second_cross_local: Option<&Vec<(usize, Array2<f64>)>>,
    global_range: Range<usize>,
    total_p: usize,
    axis: usize,
) -> Result<Arc<dyn CustomFamilyPsiDerivativeOperator>, String> {
    let cross_blocks = match second_cross_local {
        Some(entries) => entries
            .iter()
            .map(|(cross_axis, block)| (*cross_axis, block.clone()))
            .collect(),
        None => Default::default(),
    };
    let operator = EmbeddedDensePsiDerivativeOperator::new(
        axis,
        total_p,
        global_range,
        first_local.clone(),
        second_diag_local.clone(),
        cross_blocks,
    )?;
    Ok(Arc::new(operator))
}
/// Psi-derivative operator that lifts a base operator through a stack of time
/// bases using row-wise Kronecker products, one row block per time basis.
struct RowwiseKroneckerPsiDerivativeOperator {
/// Base operator being lifted; its row count must equal `n_per_block`.
base: Arc<dyn CustomFamilyPsiDerivativeOperator>,
/// One time basis per row block; all share the shape `n_per_block x p_time`.
time_bases: Vec<Arc<Array2<f64>>>,
/// Rows in each block (rows of every time basis).
n_per_block: usize,
/// Columns of every time basis.
p_time: usize,
/// Lifted coefficient width, `base.p_out() * p_time`.
p_out: usize,
}
impl RowwiseKroneckerPsiDerivativeOperator {
fn new(
base: Arc<dyn CustomFamilyPsiDerivativeOperator>,
time_bases: Vec<Arc<Array2<f64>>>,
) -> Result<Self, String> {
let first = time_bases.first().ok_or_else(|| {
"rowwise kronecker psi operator needs at least one time basis".to_string()
})?;
let n_per_block = first.nrows();
let p_time = first.ncols();
for (idx, basis) in time_bases.iter().enumerate() {
if basis.nrows() != n_per_block || basis.ncols() != p_time {
return Err(CustomFamilyError::DimensionMismatch { reason: format!(
"rowwise kronecker psi operator time basis {idx} shape mismatch: got {}x{}, expected {}x{}",
basis.nrows(),
basis.ncols(),
n_per_block,
p_time
) }.into());
}
}
if base.n_data() != n_per_block {
return Err(CustomFamilyError::DimensionMismatch { reason: format!(
"rowwise kronecker psi operator base row mismatch: got {}, expected {n_per_block}",
base.n_data()
) }.into());
}
Ok(Self {
p_out: base.p_out() * p_time,
base,
time_bases,
n_per_block,
p_time,
})
}
fn split_time_columns(&self, u: &ArrayView1<'_, f64>) -> Vec<Array1<f64>> {
let p_base = self.base.p_out();
assert_eq!(u.len(), self.p_out);
let mut cols = vec![Array1::<f64>::zeros(p_base); self.p_time];
for j in 0..p_base {
for t in 0..self.p_time {
cols[t][j] = u[j * self.p_time + t];
}
}
cols
}
fn lifted_row_chunk_with_base<F>(
&self,
rows: Range<usize>,
mut base_chunk: F,
) -> Result<Array2<f64>, crate::terms::basis::BasisError>
where
F: FnMut(Range<usize>) -> Result<Array2<f64>, crate::terms::basis::BasisError>,
{
if rows.start > rows.end || rows.end > self.n_data() {
return Err(crate::terms::basis::BasisError::Other(format!(
"rowwise kronecker psi row chunk {}..{} out of bounds for {} rows",
rows.start,
rows.end,
self.n_data()
)));
}
if rows.is_empty() {
return Ok(Array2::<f64>::zeros((0, self.p_out)));
}
let first_block = rows.start / self.n_per_block;
let last_block = (rows.end - 1) / self.n_per_block;
let mut blocks = Vec::with_capacity(last_block + 1 - first_block);
for block_idx in first_block..=last_block {
let block_global_start = block_idx * self.n_per_block;
let local_start = rows.start.saturating_sub(block_global_start);
let local_end = (rows.end - block_global_start).min(self.n_per_block);
let local_rows = local_start..local_end;
let base = base_chunk(local_rows.clone())?;
let time = self.time_bases[block_idx]
.slice(ndarray::s![local_rows, ..])
.to_owned();
blocks.push(dense_rowwise_kronecker(base.view(), time.view()));
}
Ok(stack_dense_row_blocks(&blocks))
}
/// Canonical transpose-direction lifted matvec: for each time column `t`,
/// weight `v` by the time basis column, delegate to the base operator via
/// `base_op`, and scatter the per-base accumulator into the lifted layout.
fn lifted_transpose_mul_with_base<F>(
&self,
v: &ArrayView1<'_, f64>,
mut base_op: F,
) -> Result<Array1<f64>, crate::terms::basis::BasisError>
where
F: FnMut(&ArrayView1<'_, f64>) -> Result<Array1<f64>, crate::terms::basis::BasisError>,
{
assert_eq!(v.len(), self.n_data());
let p_base = self.base.p_out();
let mut out = Array1::<f64>::zeros(self.p_out);
for t in 0..self.p_time {
let mut accum = Array1::<f64>::zeros(p_base);
for (block_idx, time_basis) in self.time_bases.iter().enumerate() {
let row_start = block_idx * self.n_per_block;
let row_end = row_start + self.n_per_block;
let weighted = &v.slice(ndarray::s![row_start..row_end]).to_owned()
* &time_basis.column(t).to_owned();
accum += &base_op(&weighted.view())?;
}
for j in 0..p_base {
out[j * self.p_time + t] = accum[j];
}
}
Ok(out)
}
/// Canonical forward-direction lifted matvec: split `u` into per-time-column
/// coefficient vectors, delegate each to the base operator via `base_op`, and
/// accumulate the time-basis-weighted contributions into the block rows.
fn lifted_forward_mul_with_base<F>(
&self,
u: &ArrayView1<'_, f64>,
mut base_op: F,
) -> Result<Array1<f64>, crate::terms::basis::BasisError>
where
F: FnMut(&ArrayView1<'_, f64>) -> Result<Array1<f64>, crate::terms::basis::BasisError>,
{
let time_cols = self.split_time_columns(u);
let mut out = Array1::<f64>::zeros(self.n_data());
for (t, coeffs) in time_cols.iter().enumerate() {
let base_eval = base_op(&coeffs.view())?;
for (block_idx, time_basis) in self.time_bases.iter().enumerate() {
let row_start = block_idx * self.n_per_block;
let row_end = row_start + self.n_per_block;
let contrib = &base_eval * &time_basis.column(t).to_owned();
let mut out_block = out.slice_mut(ndarray::s![row_start..row_end]);
out_block += &contrib;
}
}
Ok(out)
}
}
impl CustomFamilyPsiDerivativeOperator for RowwiseKroneckerPsiDerivativeOperator {
    fn as_any(&self) -> &dyn Any {
        self
    }

    /// Total rows across all stacked time-basis blocks.
    fn n_data(&self) -> usize {
        self.n_per_block * self.time_bases.len()
    }

    /// Width of the lifted coefficient vector.
    fn p_out(&self) -> usize {
        self.p_out
    }

    /// Lifted transposed product using the base operator's first derivative.
    fn transpose_mul(
        &self,
        axis: usize,
        v: &ArrayView1<'_, f64>,
    ) -> Result<Array1<f64>, crate::terms::basis::BasisError> {
        let base = &self.base;
        self.lifted_transpose_mul_with_base(v, |block_values| {
            base.transpose_mul(axis, block_values)
        })
    }

    /// Lifted forward product using the base operator's first derivative.
    fn forward_mul(
        &self,
        axis: usize,
        u: &ArrayView1<'_, f64>,
    ) -> Result<Array1<f64>, crate::terms::basis::BasisError> {
        let base = &self.base;
        self.lifted_forward_mul_with_base(u, |time_coeffs| base.forward_mul(axis, time_coeffs))
    }

    /// Lifted transposed product using the base operator's same-axis second
    /// derivative.
    fn transpose_mul_second_diag(
        &self,
        axis: usize,
        v: &ArrayView1<'_, f64>,
    ) -> Result<Array1<f64>, crate::terms::basis::BasisError> {
        let base = &self.base;
        self.lifted_transpose_mul_with_base(v, |block_values| {
            base.transpose_mul_second_diag(axis, block_values)
        })
    }

    /// Lifted transposed product using the base operator's cross second
    /// derivative for the axis pair `(axis_d, axis_e)`.
    fn transpose_mul_second_cross(
        &self,
        axis_d: usize,
        axis_e: usize,
        v: &ArrayView1<'_, f64>,
    ) -> Result<Array1<f64>, crate::terms::basis::BasisError> {
        let base = &self.base;
        self.lifted_transpose_mul_with_base(v, |block_values| {
            base.transpose_mul_second_cross(axis_d, axis_e, block_values)
        })
    }

    /// Lifted forward product using the base operator's same-axis second
    /// derivative.
    fn forward_mul_second_diag(
        &self,
        axis: usize,
        u: &ArrayView1<'_, f64>,
    ) -> Result<Array1<f64>, crate::terms::basis::BasisError> {
        let base = &self.base;
        self.lifted_forward_mul_with_base(u, |time_coeffs| {
            base.forward_mul_second_diag(axis, time_coeffs)
        })
    }

    /// Lifted forward product using the base operator's cross second derivative
    /// for the axis pair `(axis_d, axis_e)`.
    fn forward_mul_second_cross(
        &self,
        axis_d: usize,
        axis_e: usize,
        u: &ArrayView1<'_, f64>,
    ) -> Result<Array1<f64>, crate::terms::basis::BasisError> {
        let base = &self.base;
        self.lifted_forward_mul_with_base(u, |time_coeffs| {
            base.forward_mul_second_cross(axis_d, axis_e, time_coeffs)
        })
    }

    /// Materializes lifted first-derivative rows `rows`.
    fn row_chunk_first(
        &self,
        axis: usize,
        rows: Range<usize>,
    ) -> Result<Array2<f64>, crate::terms::basis::BasisError> {
        let base = &self.base;
        self.lifted_row_chunk_with_base(rows, |block_rows| base.row_chunk_first(axis, block_rows))
    }

    /// Materializes lifted same-axis second-derivative rows `rows`.
    fn row_chunk_second_diag(
        &self,
        axis: usize,
        rows: Range<usize>,
    ) -> Result<Array2<f64>, crate::terms::basis::BasisError> {
        let base = &self.base;
        self.lifted_row_chunk_with_base(rows, |block_rows| {
            base.row_chunk_second_diag(axis, block_rows)
        })
    }

    /// Materializes lifted cross second-derivative rows `rows` for the axis pair
    /// `(axis_d, axis_e)`.
    fn row_chunk_second_cross(
        &self,
        axis_d: usize,
        axis_e: usize,
        rows: Range<usize>,
    ) -> Result<Array2<f64>, crate::terms::basis::BasisError> {
        let base = &self.base;
        self.lifted_row_chunk_with_base(rows, |block_rows| {
            base.row_chunk_second_cross(axis_d, axis_e, block_rows)
        })
    }

    fn as_materializable(&self) -> Option<&dyn MaterializablePsiDerivativeOperator> {
        Some(self)
    }
}
impl MaterializablePsiDerivativeOperator for RowwiseKroneckerPsiDerivativeOperator {
    /// Materializes the full lifted first-derivative design by forming the
    /// row-wise Kronecker product of the dense base derivative with every time
    /// basis and stacking the resulting row blocks.
    ///
    /// # Errors
    /// Fails when the base operator cannot be materialized or its own
    /// materialization fails.
    fn materialize_first(
        &self,
        axis: usize,
    ) -> Result<Array2<f64>, crate::terms::basis::BasisError> {
        let Some(materializable) = self.base.as_materializable() else {
            return Err(crate::terms::basis::BasisError::Other(
                "rowwise kronecker psi operator: base operator does not support materialization"
                    .to_string(),
            ));
        };
        let base_dense = materializable.materialize_first(axis)?;
        let mut lifted: Vec<Array2<f64>> = Vec::with_capacity(self.time_bases.len());
        for time_basis in &self.time_bases {
            lifted.push(dense_rowwise_kronecker(base_dense.view(), time_basis.view()));
        }
        Ok(stack_dense_row_blocks(&lifted))
    }
}
/// Wraps `base` and `time_bases` in a shared row-wise Kronecker psi operator.
///
/// # Errors
/// Propagates the validation error from
/// `RowwiseKroneckerPsiDerivativeOperator::new`.
pub(crate) fn build_rowwise_kronecker_psi_operator(
    base: Arc<dyn CustomFamilyPsiDerivativeOperator>,
    time_bases: Vec<Arc<Array2<f64>>>,
) -> Result<Arc<dyn CustomFamilyPsiDerivativeOperator>, String> {
    let lifted = RowwiseKroneckerPsiDerivativeOperator::new(base, time_bases)?;
    Ok(Arc::new(lifted))
}
/// Row-windowed handle on the first psi-derivative of a design, evaluated
/// through a shared implicit operator rather than a stored dense matrix.
#[derive(Clone)]
pub(crate) struct CustomFamilyPsiDesignAction {
/// Shared operator that performs the derivative products.
operator: Arc<dyn CustomFamilyPsiDerivativeOperator>,
/// Axis passed to every operator call.
axis: usize,
/// Window of operator rows exposed by this action.
row_range: Range<usize>,
/// Expected coefficient length (operator output width).
p: usize,
}
impl CustomFamilyPsiDesignAction {
    /// Builds an action over `row_range` from the implicit operator carried by
    /// `deriv`.
    ///
    /// # Errors
    /// Fails when `row_range` is inverted or extends past `total_rows`, or when
    /// `deriv` has no implicit operator of shape `total_rows x p`.
    pub(crate) fn from_first_derivative(
        deriv: &CustomFamilyBlockPsiDerivative,
        total_rows: usize,
        p: usize,
        row_range: Range<usize>,
        label: &str,
    ) -> Result<Self, String> {
        // An inverted window would make `nrows()` underflow later on.
        if row_range.start > row_range.end {
            return Err(CustomFamilyError::DimensionMismatch {
                reason: format!(
                    "{label} row range {}..{} is inverted",
                    row_range.start, row_range.end
                ),
            }
            .into());
        }
        if row_range.end > total_rows {
            return Err(CustomFamilyError::DimensionMismatch {
                reason: format!(
                    "{label} row range {}..{} exceeds total rows {total_rows}",
                    row_range.start, row_range.end
                ),
            }
            .into());
        }
        if let Some(op) = deriv.implicit_operator.as_ref()
            && op.n_data() == total_rows
            && op.p_out() == p
        {
            return Ok(Self {
                operator: Arc::clone(op),
                axis: deriv.implicit_axis,
                row_range,
                p,
            });
        }
        Err(CustomFamilyError::UnsupportedConfiguration { reason: format!(
            "{label} is missing an implicit x_psi operator with shape {}x{}; got dense payload {}x{} instead",
            total_rows,
            p,
            deriv.x_psi.nrows(),
            deriv.x_psi.ncols(),
        ) }.into())
    }

    /// Always `true`: this action is only ever backed by an implicit operator.
    pub(crate) fn is_implicit(&self) -> bool {
        true
    }

    /// Number of rows in this action's window.
    pub(crate) fn nrows(&self) -> usize {
        self.row_range.len()
    }

    /// Returns a new action restricted to `row_range`, interpreted relative to
    /// this action's own window.
    ///
    /// # Errors
    /// Fails when `row_range` is inverted or extends past `nrows()`.
    pub(crate) fn slice_rows(&self, row_range: Range<usize>) -> Result<Self, String> {
        if row_range.start > row_range.end {
            return Err(CustomFamilyError::DimensionMismatch {
                reason: format!(
                    "psi design row range {}..{} is inverted",
                    row_range.start, row_range.end
                ),
            }
            .into());
        }
        if row_range.end > self.nrows() {
            return Err(CustomFamilyError::DimensionMismatch {
                reason: format!(
                    "psi design row range {}..{} exceeds available rows {}",
                    row_range.start,
                    row_range.end,
                    self.nrows()
                ),
            }
            .into());
        }
        Ok(Self {
            operator: Arc::clone(&self.operator),
            axis: self.axis,
            row_range: self.absolute_rows(row_range),
            p: self.p,
        })
    }

    /// Applies the derivative design to `u` and returns the rows in this
    /// action's window.
    ///
    /// # Panics
    /// Panics when `u` does not have `p` entries or the operator reports an
    /// error.
    pub(crate) fn forward_mul(&self, u: ArrayView1<'_, f64>) -> Array1<f64> {
        assert_eq!(u.len(), self.p);
        self.operator
            .forward_mul(self.axis, &u)
            .expect("radial scalar evaluation failed during implicit psi forward_mul")
            .slice(ndarray::s![self.row_range.clone()])
            .to_owned()
    }

    /// Applies the transposed derivative design to `v`, which covers only this
    /// action's window; rows outside the window are treated as zero.
    ///
    /// # Panics
    /// Panics when `v` does not have `nrows()` entries or the operator reports
    /// an error.
    pub(crate) fn transpose_mul(&self, v: ArrayView1<'_, f64>) -> Array1<f64> {
        assert_eq!(v.len(), self.nrows());
        if self.row_range.start == 0 && self.row_range.end == self.operator.n_data() {
            // The window spans the whole operator, so no padding is needed.
            self.operator
                .transpose_mul(self.axis, &v)
                .expect("radial scalar evaluation failed during implicit psi transpose_mul")
        } else {
            let mut expanded = Array1::<f64>::zeros(self.operator.n_data());
            expanded
                .slice_mut(ndarray::s![self.row_range.clone()])
                .assign(&v);
            self.operator
                .transpose_mul(self.axis, &expanded.view())
                .expect("radial scalar evaluation failed during implicit psi transpose_mul")
        }
    }

    /// Translates a window-relative row range into operator row indices.
    fn absolute_rows(&self, rows: Range<usize>) -> Range<usize> {
        (self.row_range.start + rows.start)..(self.row_range.start + rows.end)
    }

    /// Materializes the window-relative rows `rows` of the derivative design.
    ///
    /// # Errors
    /// Fails when `rows` is inverted or extends past `nrows()`, or when the
    /// operator reports an error.
    pub(crate) fn row_chunk(&self, rows: Range<usize>) -> Result<Array2<f64>, String> {
        if rows.start > rows.end {
            return Err(CustomFamilyError::DimensionMismatch {
                reason: format!(
                    "psi design row range {}..{} is inverted",
                    rows.start, rows.end
                ),
            }
            .into());
        }
        if rows.end > self.nrows() {
            return Err(CustomFamilyError::DimensionMismatch {
                reason: format!(
                    "psi design row range {}..{} exceeds available rows {}",
                    rows.start,
                    rows.end,
                    self.nrows()
                ),
            }
            .into());
        }
        self.operator
            .row_chunk_first(self.axis, self.absolute_rows(rows))
            .map_err(|e| e.to_string())
    }

    /// Materializes the single window-relative row `row`.
    ///
    /// # Errors
    /// Fails when `row` is not below `nrows()` or the operator reports an error.
    pub(crate) fn row_vector(&self, row: usize) -> Result<Array1<f64>, String> {
        if row >= self.nrows() {
            return Err(CustomFamilyError::DimensionMismatch {
                reason: format!(
                    "psi design row {row} exceeds available rows {}",
                    self.nrows()
                ),
            }
            .into());
        }
        let absolute_row = self.row_range.start + row;
        let mut out = Array1::<f64>::zeros(self.p);
        self.operator
            .row_vector_first_into(self.axis, absolute_row, out.view_mut())
            .map_err(|e| e.to_string())?;
        Ok(out)
    }
}
/// Selects which second-derivative family of the operator an action uses.
#[derive(Clone, Copy)]
enum CustomFamilyPsiSecondDesignLevel {
/// Second derivative taken twice along the same axis.
Diag(usize),
/// Mixed second derivative along two different axes, in call order.
Cross(usize, usize),
}
/// Row-windowed handle on a second psi-derivative of a design, evaluated
/// through a shared implicit operator.
#[derive(Clone)]
pub(crate) struct CustomFamilyPsiSecondDesignAction {
/// Shared operator that performs the derivative products.
operator: Arc<dyn CustomFamilyPsiDerivativeOperator>,
/// Which second derivative (same-axis or cross) to request.
level: CustomFamilyPsiSecondDesignLevel,
/// Window of operator rows exposed by this action.
row_range: Range<usize>,
/// Expected coefficient length (operator output width).
p: usize,
}
impl CustomFamilyPsiSecondDesignAction {
    /// Builds a second-derivative action for the pair (`deriv_i`, `deriv_j`).
    ///
    /// Returns `Ok(None)` when `deriv_i` carries no implicit operator or when
    /// the two derivatives do not share an implicit group. Equal implicit axes
    /// select the same-axis derivative; different axes select the cross
    /// derivative.
    ///
    /// # Errors
    /// Fails when `row_range` is inverted or extends past `total_rows`, or when
    /// the operator's shape is not `total_rows x p`.
    pub(crate) fn from_second_derivative(
        deriv_i: &CustomFamilyBlockPsiDerivative,
        deriv_j: &CustomFamilyBlockPsiDerivative,
        total_rows: usize,
        p: usize,
        row_range: Range<usize>,
        label: &str,
    ) -> Result<Option<Self>, String> {
        // An inverted window would make `nrows()` underflow later on.
        if row_range.start > row_range.end {
            return Err(CustomFamilyError::DimensionMismatch {
                reason: format!(
                    "{label} row range {}..{} is inverted",
                    row_range.start, row_range.end
                ),
            }
            .into());
        }
        if row_range.end > total_rows {
            return Err(CustomFamilyError::DimensionMismatch {
                reason: format!(
                    "{label} row range {}..{} exceeds total rows {total_rows}",
                    row_range.start, row_range.end
                ),
            }
            .into());
        }
        let Some(op) = deriv_i.implicit_operator.as_ref() else {
            return Ok(None);
        };
        if op.n_data() != total_rows || op.p_out() != p {
            return Err(CustomFamilyError::UnsupportedConfiguration {
                reason: format!(
                    "{label} is missing an implicit x_psi_psi operator with shape {}x{}",
                    total_rows, p
                ),
            }
            .into());
        }
        let same_group = deriv_i.implicit_group_id.is_some()
            && deriv_i.implicit_group_id == deriv_j.implicit_group_id;
        if !same_group {
            return Ok(None);
        }
        let level = if deriv_i.implicit_axis == deriv_j.implicit_axis {
            CustomFamilyPsiSecondDesignLevel::Diag(deriv_i.implicit_axis)
        } else {
            CustomFamilyPsiSecondDesignLevel::Cross(deriv_i.implicit_axis, deriv_j.implicit_axis)
        };
        Ok(Some(Self {
            operator: Arc::clone(op),
            level,
            row_range,
            p,
        }))
    }

    /// Number of rows in this action's window.
    pub(crate) fn nrows(&self) -> usize {
        self.row_range.len()
    }

    /// Applies the second-derivative design to `u` and returns the rows in this
    /// action's window.
    ///
    /// # Panics
    /// Panics when `u` does not have `p` entries or the operator reports an
    /// error.
    pub(crate) fn forward_mul(&self, u: ArrayView1<'_, f64>) -> Array1<f64> {
        assert_eq!(u.len(), self.p);
        let out = match self.level {
            CustomFamilyPsiSecondDesignLevel::Diag(axis) => self
                .operator
                .forward_mul_second_diag(axis, &u)
                .expect("radial scalar evaluation failed during implicit psi second forward_mul"),
            CustomFamilyPsiSecondDesignLevel::Cross(axis_d, axis_e) => self
                .operator
                .forward_mul_second_cross(axis_d, axis_e, &u)
                .expect("radial scalar evaluation failed during implicit psi second forward_mul"),
        };
        out.slice(ndarray::s![self.row_range.clone()]).to_owned()
    }

    /// Applies the transposed second-derivative design to `v`, which covers
    /// only this action's window; rows outside the window are treated as zero.
    ///
    /// # Panics
    /// Panics when `v` does not have `nrows()` entries or the operator reports
    /// an error.
    pub(crate) fn transpose_mul(&self, v: ArrayView1<'_, f64>) -> Array1<f64> {
        assert_eq!(v.len(), self.nrows());
        // Pad with zeros only when the window is a strict subset of the rows.
        let expanded = if self.row_range.start == 0 && self.row_range.end == self.operator.n_data()
        {
            None
        } else {
            let mut expanded = Array1::<f64>::zeros(self.operator.n_data());
            expanded
                .slice_mut(ndarray::s![self.row_range.clone()])
                .assign(&v);
            Some(expanded)
        };
        let full_v = expanded.as_ref().map_or(v, |arr| arr.view());
        match self.level {
            CustomFamilyPsiSecondDesignLevel::Diag(axis) => self
                .operator
                .transpose_mul_second_diag(axis, &full_v)
                .expect("radial scalar evaluation failed during implicit psi second transpose_mul"),
            CustomFamilyPsiSecondDesignLevel::Cross(axis_d, axis_e) => self
                .operator
                .transpose_mul_second_cross(axis_d, axis_e, &full_v)
                .expect("radial scalar evaluation failed during implicit psi second transpose_mul"),
        }
    }

    /// Translates a window-relative row range into operator row indices.
    fn absolute_rows(&self, rows: Range<usize>) -> Range<usize> {
        (self.row_range.start + rows.start)..(self.row_range.start + rows.end)
    }

    /// Materializes the window-relative rows `rows` of the second-derivative
    /// design.
    ///
    /// # Errors
    /// Fails when `rows` is inverted or extends past `nrows()`, or when the
    /// operator reports an error.
    pub(crate) fn row_chunk(&self, rows: Range<usize>) -> Result<Array2<f64>, String> {
        if rows.start > rows.end {
            return Err(CustomFamilyError::DimensionMismatch {
                reason: format!(
                    "psi second-design row range {}..{} is inverted",
                    rows.start, rows.end
                ),
            }
            .into());
        }
        if rows.end > self.nrows() {
            return Err(CustomFamilyError::DimensionMismatch {
                reason: format!(
                    "psi second-design row range {}..{} exceeds available rows {}",
                    rows.start,
                    rows.end,
                    self.nrows()
                ),
            }
            .into());
        }
        match self.level {
            CustomFamilyPsiSecondDesignLevel::Diag(axis) => self
                .operator
                .row_chunk_second_diag(axis, self.absolute_rows(rows))
                .map_err(|e| e.to_string()),
            CustomFamilyPsiSecondDesignLevel::Cross(axis_d, axis_e) => self
                .operator
                .row_chunk_second_cross(axis_d, axis_e, self.absolute_rows(rows))
                .map_err(|e| e.to_string()),
        }
    }

    /// Materializes the single window-relative row `row`.
    ///
    /// # Errors
    /// Fails when `row` is not below `nrows()` or the operator reports an error.
    pub(crate) fn row_vector(&self, row: usize) -> Result<Array1<f64>, String> {
        // Checking first keeps `row + 1` from overflowing and guarantees the
        // chunk below has exactly one row.
        if row >= self.nrows() {
            return Err(CustomFamilyError::DimensionMismatch {
                reason: format!(
                    "psi second-design row {row} exceeds available rows {}",
                    self.nrows()
                ),
            }
            .into());
        }
        self.row_chunk(row..row + 1).map(|m| m.row(0).to_owned())
    }
}
/// Borrowed, copyable view of a psi-derivative linear map in one of its
/// supported representations.
#[derive(Clone, Copy)]
pub(crate) enum CustomFamilyPsiLinearMapRef<'a> {
/// Explicit dense matrix.
Dense(&'a Array2<f64>),
/// Implicit first-derivative action.
First(&'a CustomFamilyPsiDesignAction),
/// Implicit second-derivative action.
Second(&'a CustomFamilyPsiSecondDesignAction),
/// Identically-zero map of the given shape.
Zero { nrows: usize, ncols: usize },
}
impl CustomFamilyPsiLinearMapRef<'_> {
    /// Number of rows of the map.
    pub(crate) fn nrows(&self) -> usize {
        match self {
            Self::Dense(mat) => mat.nrows(),
            Self::First(action) => action.nrows(),
            Self::Second(action) => action.nrows(),
            Self::Zero { nrows, .. } => *nrows,
        }
    }

    /// Number of columns (coefficient length) of the map.
    pub(crate) fn ncols(&self) -> usize {
        match self {
            Self::Dense(mat) => mat.ncols(),
            Self::First(action) => action.p,
            Self::Second(action) => action.p,
            Self::Zero { ncols, .. } => *ncols,
        }
    }

    /// Applies the map to the coefficient vector `u`.
    pub(crate) fn forward_mul(&self, u: ArrayView1<'_, f64>) -> Array1<f64> {
        match self {
            Self::Dense(mat) => mat.dot(&u),
            Self::First(action) => action.forward_mul(u),
            Self::Second(action) => action.forward_mul(u),
            Self::Zero { nrows, .. } => Array1::<f64>::zeros(*nrows),
        }
    }

    /// Applies the transposed map to the row-space vector `v`.
    pub(crate) fn transpose_mul(&self, v: ArrayView1<'_, f64>) -> Array1<f64> {
        match self {
            Self::Dense(mat) => crate::faer_ndarray::fast_atv(mat, &v),
            Self::First(action) => action.transpose_mul(v),
            Self::Second(action) => action.transpose_mul(v),
            Self::Zero { ncols, .. } => Array1::<f64>::zeros(*ncols),
        }
    }

    /// Materializes row `row` of the map.
    ///
    /// # Errors
    /// Fails when `row` is not below `nrows()` or the underlying action fails.
    pub(crate) fn row_vector(&self, row: usize) -> Result<Array1<f64>, String> {
        if row >= self.nrows() {
            return Err(CustomFamilyError::DimensionMismatch {
                reason: format!(
                    "psi linear-map row {row} out of bounds for {} rows",
                    self.nrows()
                ),
            }
            .into());
        }
        Ok(match self {
            Self::Dense(mat) => mat.row(row).to_owned(),
            Self::First(action) => action.row_vector(row)?,
            Self::Second(action) => action.row_vector(row)?,
            Self::Zero { ncols, .. } => Array1::<f64>::zeros(*ncols),
        })
    }

    /// Materializes rows `rows` of the map.
    ///
    /// # Errors
    /// Fails when `rows` is inverted or extends past `nrows()`, or when the
    /// underlying action fails.
    pub(crate) fn row_chunk(&self, rows: Range<usize>) -> Result<Array2<f64>, String> {
        // Rejecting inverted ranges here protects the `Dense` slice from
        // panicking and the `Zero` row count from underflowing.
        if rows.start > rows.end || rows.end > self.nrows() {
            return Err(CustomFamilyError::DimensionMismatch {
                reason: format!(
                    "psi linear-map row range {}..{} out of bounds for {} rows",
                    rows.start,
                    rows.end,
                    self.nrows()
                ),
            }
            .into());
        }
        Ok(match self {
            Self::Dense(mat) => mat.slice(ndarray::s![rows, ..]).to_owned(),
            Self::First(action) => action.row_chunk(rows)?,
            Self::Second(action) => action.row_chunk(rows)?,
            Self::Zero { ncols, .. } => Array2::<f64>::zeros((rows.len(), *ncols)),
        })
    }
}
/// Owned psi-derivative design map in one of its supported representations.
#[derive(Clone)]
pub(crate) enum PsiDesignMap {
/// Identically-zero map of the given shape.
Zero {
nrows: usize,
ncols: usize,
},
/// Explicit dense matrix, shared by reference count.
Dense {
matrix: Arc<Array2<f64>>,
},
/// Implicit first-derivative action.
First {
action: CustomFamilyPsiDesignAction,
},
/// Implicit second-derivative action.
Second {
action: CustomFamilyPsiSecondDesignAction,
},
}
impl PsiDesignMap {
pub(crate) fn ncols(&self) -> usize {
match self {
Self::Zero { ncols, .. } => *ncols,
Self::Dense { matrix } => matrix.ncols(),
Self::First { action } => action.p,
Self::Second { action } => action.p,
}
}
pub(crate) fn forward_mul(&self, u: ArrayView1<'_, f64>) -> Result<Array1<f64>, String> {
match self {
Self::Zero { nrows, .. } => Ok(Array1::<f64>::zeros(*nrows)),
Self::Dense { matrix } => Ok(matrix.dot(&u)),
Self::First { action } => Ok(action.forward_mul(u)),
Self::Second { action } => Ok(action.forward_mul(u)),
}
}
pub(crate) fn row_chunk(&self, rows: Range<usize>) -> Result<Array2<f64>, String> {
let ncols = self.ncols();
match self {
Self::Zero { .. } => Ok(Array2::<f64>::zeros((rows.end - rows.start, ncols))),
Self::Dense { matrix } => Ok(matrix.slice(ndarray::s![rows, ..]).to_owned()),
Self::First { action } => action.row_chunk(rows),
Self::Second { action } => action.row_chunk(rows),
}
}
pub(crate) fn row_vector(&self, row: usize) -> Result<Array1<f64>, String> {
match self {
Self::Zero { ncols, .. } => Ok(Array1::<f64>::zeros(*ncols)),
Self::Dense { matrix } => Ok(matrix.row(row).to_owned()),
Self::First { action } => action.row_vector(row),
Self::Second { action } => action.row_vector(row),
}
}
/// Borrow this map as a `CustomFamilyPsiLinearMapRef`, handling every
/// variant. This is the zero-allocation replacement for the pattern
/// `first_psi_linear_map(action.as_ref(), dense.as_ref(), n, p)`.
pub(crate) fn as_linear_map_ref(&self) -> CustomFamilyPsiLinearMapRef<'_> {
match self {
Self::Zero { nrows, ncols } => CustomFamilyPsiLinearMapRef::Zero {
nrows: *nrows,
ncols: *ncols,
},
Self::Dense { matrix } => CustomFamilyPsiLinearMapRef::Dense(matrix.as_ref()),
Self::First { action } => CustomFamilyPsiLinearMapRef::First(action),
Self::Second { action } => CustomFamilyPsiLinearMapRef::Second(action),
}
}
/// Return a reference to the first-derivative operator action if this map
/// holds one. Useful for callers that need to pass ownership of the action
/// into downstream operator builders.
pub(crate) fn as_first_action(&self) -> Option<&CustomFamilyPsiDesignAction> {
match self {
Self::First { action } => Some(action),
_ => None,
}
}
/// Clone the first-derivative operator action if this map holds one.
pub(crate) fn cloned_first_action(&self) -> Option<CustomFamilyPsiDesignAction> {
self.as_first_action().cloned()
}
}
/// Returns `true` when every entry of `a` compares equal to `0.0`
/// (vacuously true for an empty array).
fn is_zero_array(a: &Array2<f64>) -> bool {
    !a.iter().any(|&value| value != 0.0)
}
pub(crate) fn weighted_crossprod_psi_maps(
left: CustomFamilyPsiLinearMapRef<'_>,
weights: ArrayView1<'_, f64>,
right: CustomFamilyPsiLinearMapRef<'_>,
) -> Result<Array2<f64>, String> {
if left.nrows() != weights.len() || right.nrows() != weights.len() {
return Err(CustomFamilyError::DimensionMismatch {
reason: format!(
"psi weighted crossprod row mismatch: left={}, weights={}, right={}",
left.nrows(),
weights.len(),
right.nrows()
),
}
.into());
}
let p_left = left.ncols();
let p_right = right.ncols();
if p_left == 0 || p_right == 0 {
return Ok(Array2::<f64>::zeros((p_left, p_right)));
}
// Zero fast path: either operand being the Zero variant makes the full product zero.
if matches!(left, CustomFamilyPsiLinearMapRef::Zero { .. })
|| matches!(right, CustomFamilyPsiLinearMapRef::Zero { .. })
{
return Ok(Array2::<f64>::zeros((p_left, p_right)));
}
let mut out = Array2::<f64>::zeros((p_left, p_right));
// Stream row chunks of both operands so the weighted intermediate is never
// materialized at full n x p_right size. Chunk size is governed by the
// resource policy's row_chunk_target_bytes.
let policy = ResourcePolicy::default_library();
let rows_per_chunk = crate::resource::rows_for_target_bytes(
policy.row_chunk_target_bytes,
p_left.saturating_add(p_right).max(1),
);
let n = weights.len();
for start in (0..n).step_by(rows_per_chunk) {
let end = (start + rows_per_chunk).min(n);
let rows = start..end;
let xl = left.row_chunk(rows.clone())?;
let mut xr = right.row_chunk(rows.clone())?;
for local in 0..xr.nrows() {
let w = weights[start + local];
if w != 1.0 {
for j in 0..p_right {
xr[[local, j]] *= w;
}
}
}
out += &fast_atb(&xl, &xr);
}
Ok(out)
}
/// Chooses the linear-map representation for a first psi-derivative design.
///
/// An implicit `action` takes priority. Otherwise `dense` is used only when its
/// shape is exactly `nrows x ncols`; in every other case the result is the zero
/// map of that shape.
pub(crate) fn first_psi_linear_map<'a>(
    action: Option<&'a CustomFamilyPsiDesignAction>,
    dense: Option<&'a Array2<f64>>,
    nrows: usize,
    ncols: usize,
) -> CustomFamilyPsiLinearMapRef<'a> {
    match (action, dense) {
        (Some(implicit), _) => CustomFamilyPsiLinearMapRef::First(implicit),
        (None, Some(matrix)) if matrix.nrows() == nrows && matrix.ncols() == ncols => {
            CustomFamilyPsiLinearMapRef::Dense(matrix)
        }
        _ => CustomFamilyPsiLinearMapRef::Zero { nrows, ncols },
    }
}
/// Chooses the linear-map representation for a second psi-derivative design.
///
/// An implicit `action` takes priority. Otherwise `dense` is used only when its
/// shape is exactly `nrows x ncols`; in every other case the result is the zero
/// map of that shape.
pub(crate) fn second_psi_linear_map<'a>(
    action: Option<&'a CustomFamilyPsiSecondDesignAction>,
    dense: Option<&'a Array2<f64>>,
    nrows: usize,
    ncols: usize,
) -> CustomFamilyPsiLinearMapRef<'a> {
    match (action, dense) {
        (Some(implicit), _) => CustomFamilyPsiLinearMapRef::Second(implicit),
        (None, Some(matrix)) if matrix.nrows() == nrows && matrix.ncols() == ncols => {
            CustomFamilyPsiLinearMapRef::Dense(matrix)
        }
        _ => CustomFamilyPsiLinearMapRef::Zero { nrows, ncols },
    }
}
/// One design channel of the joint psi operator: a design matrix acting on a
/// contiguous slice of the full coefficient vector.
pub(crate) struct CustomFamilyJointDesignChannel {
/// Slice of the full coefficient vector this channel reads and writes.
range: Range<usize>,
/// Design matrix applied to the coefficients in `range`.
design: DesignMatrix,
/// Optional psi-derivative of the design.
psi_derivative: Option<CustomFamilyPsiDesignAction>,
}
impl CustomFamilyJointDesignChannel {
    /// Creates a channel from a coefficient range, anything convertible into a
    /// `DesignMatrix`, and an optional psi-derivative action.
    pub(crate) fn new<D>(
        range: Range<usize>,
        design: D,
        psi_derivative: Option<CustomFamilyPsiDesignAction>,
    ) -> Self
    where
        D: Into<DesignMatrix>,
    {
        let design = design.into();
        Self {
            range,
            design,
            psi_derivative,
        }
    }

    /// Copies this channel's coefficients out of the full vector.
    fn coefficients(&self, full: &Array1<f64>) -> Array1<f64> {
        let window = full.slice(ndarray::s![self.range.clone()]);
        window.to_owned()
    }

    /// Applies the channel design to this channel's slice of `full`.
    fn apply(&self, full: &Array1<f64>) -> Array1<f64> {
        self.design
            .matrixvectormultiply(&self.coefficients(full))
    }

    /// Applies the transposed channel design to `values`.
    fn apply_transpose(&self, values: &Array1<f64>) -> Array1<f64> {
        self.design.transpose_vector_multiply(values)
    }
}
/// Weighted coupling between two channels of the joint psi operator.
pub(crate) struct CustomFamilyJointDesignPairContribution {
/// Index of the channel whose coefficient range receives the contribution.
left_channel: usize,
/// Index of the channel whose design is applied to the input vector.
right_channel: usize,
/// Row weights applied to the terms that involve a psi-derivative design.
weights: Array1<f64>,
/// Row weights applied to the term that uses both undifferentiated designs.
drift_weights: Array1<f64>,
}
impl CustomFamilyJointDesignPairContribution {
    /// Bundles a (left, right) channel index pair with its two row-weight
    /// vectors. No validation is performed here.
    pub(crate) fn new(
        left_channel: usize,
        right_channel: usize,
        weights: Array1<f64>,
        drift_weights: Array1<f64>,
    ) -> Self {
        let contribution = Self {
            left_channel,
            right_channel,
            weights,
            drift_weights,
        };
        contribution
    }
}
/// Matrix-free operator assembled from design channels and weighted channel
/// pairs, with an optional dense additive correction.
pub(crate) struct CustomFamilyJointPsiOperator {
/// Dimension of the (square) operator.
total_dim: usize,
/// Design channels referenced by index from `pair_contributions`.
channels: Vec<CustomFamilyJointDesignChannel>,
/// Weighted channel pairs whose contributions are summed.
pair_contributions: Vec<CustomFamilyJointDesignPairContribution>,
/// Optional dense correction for small cross-blocks (e.g. h/w parameters)
/// that don't warrant their own weighted-Gram channel.
dense_correction: Option<Array2<f64>>,
}
impl CustomFamilyJointPsiOperator {
    /// Creates an operator of dimension `total_dim` from its channels and pair
    /// contributions, with no dense correction attached.
    pub(crate) fn new(
        total_dim: usize,
        channels: Vec<CustomFamilyJointDesignChannel>,
        pair_contributions: Vec<CustomFamilyJointDesignPairContribution>,
    ) -> Self {
        let dense_correction = None;
        Self {
            total_dim,
            channels,
            pair_contributions,
            dense_correction,
        }
    }
}
impl HyperOperator for CustomFamilyJointPsiOperator {
fn dim(&self) -> usize {
self.total_dim
}
fn mul_vec(&self, v: &Array1<f64>) -> Array1<f64> {
assert_eq!(v.len(), self.total_dim);
let base_vals: Vec<Array1<f64>> = self
.channels
.iter()
.map(|channel| channel.apply(v))
.collect();
let deriv_vals: Vec<Option<Array1<f64>>> = self
.channels
.iter()
.map(|channel| {
channel
.psi_derivative
.as_ref()
.map(|deriv| deriv.forward_mul(v.slice(ndarray::s![channel.range.clone()])))
})
.collect();
let mut out = if let Some(ref corr) = self.dense_correction {
corr.dot(v)
} else {
Array1::<f64>::zeros(self.total_dim)
};
for pair in &self.pair_contributions {
let left = &self.channels[pair.left_channel];
let right_base = &base_vals[pair.right_channel];
let weighted_drift = &pair.drift_weights * right_base;
let mut contrib = left.apply_transpose(&weighted_drift);
if let Some(left_deriv) = left.psi_derivative.as_ref() {
let weighted_right = &pair.weights * right_base;
contrib += &left_deriv.transpose_mul(weighted_right.view());
}
if let Some(right_deriv) = deriv_vals[pair.right_channel].as_ref() {
let weighted_right = &pair.weights * right_deriv;
contrib += &left.apply_transpose(&weighted_right);
}
let mut out_slice = out.slice_mut(ndarray::s![left.range.clone()]);
out_slice += &contrib;
}
out
}
fn bilinear(&self, v: &Array1<f64>, u: &Array1<f64>) -> f64 {
assert_eq!(v.len(), self.total_dim);
assert_eq!(u.len(), self.total_dim);
let base_v: Vec<Array1<f64>> = self
.channels
.iter()
.map(|channel| channel.apply(v))
.collect();
let base_u: Vec<Array1<f64>> = self
.channels
.iter()
.map(|channel| channel.apply(u))
.collect();
let deriv_v: Vec<Option<Array1<f64>>> = self
.channels
.iter()
.map(|channel| {
channel
.psi_derivative
.as_ref()
.map(|deriv| deriv.forward_mul(v.slice(ndarray::s![channel.range.clone()])))
})
.collect();
let deriv_u: Vec<Option<Array1<f64>>> = self
.channels
.iter()
.map(|channel| {
channel
.psi_derivative
.as_ref()
.map(|deriv| deriv.forward_mul(u.slice(ndarray::s![channel.range.clone()])))
})
.collect();
let mut total = if let Some(ref corr) = self.dense_correction {
v.dot(&corr.dot(u))
} else {
0.0
};
for pair in &self.pair_contributions {
let left_base_u = &base_u[pair.left_channel];
let right_base_v = &base_v[pair.right_channel];
total += left_base_u.dot(&(&pair.drift_weights * right_base_v));
if let Some(left_deriv_u) = deriv_u[pair.left_channel].as_ref() {
total += left_deriv_u.dot(&(&pair.weights * right_base_v));
}
if let Some(right_deriv_v) = deriv_v[pair.right_channel].as_ref() {
total += left_base_u.dot(&(&pair.weights * right_deriv_v));
}
}
total
}
/// Materializes the operator as a dense `total_dim × total_dim` matrix.
///
/// Starts from a copy of the dense correction (or zeros when there is none)
/// and adds, one column at a time, the action of the pair contributions on
/// the corresponding canonical basis vector.
fn to_dense(&self) -> Array2<f64> {
    let dim = self.total_dim;
    let mut dense = match self.dense_correction.as_ref() {
        Some(corr) => corr.clone(),
        None => Array2::<f64>::zeros((dim, dim)),
    };
    let mut unit = Array1::<f64>::zeros(dim);
    for col_idx in 0..dim {
        unit[col_idx] = 1.0;
        // The dense correction already lives in `dense`, so only the pair
        // contributions are evaluated on the basis vector here.
        let channel_images: Vec<Array1<f64>> =
            self.channels.iter().map(|ch| ch.apply(&unit)).collect();
        let channel_deriv_images: Vec<Option<Array1<f64>>> = self
            .channels
            .iter()
            .map(|ch| {
                ch.psi_derivative
                    .as_ref()
                    .map(|deriv| deriv.forward_mul(unit.slice(ndarray::s![ch.range.clone()])))
            })
            .collect();

        let mut column = Array1::<f64>::zeros(dim);
        for pair in &self.pair_contributions {
            let left = &self.channels[pair.left_channel];
            let right_image = &channel_images[pair.right_channel];
            let drift_weighted = &pair.drift_weights * right_image;
            let mut piece = left.apply_transpose(&drift_weighted);
            if let Some(left_deriv) = &left.psi_derivative {
                let weighted = &pair.weights * right_image;
                piece += &left_deriv.transpose_mul(weighted.view());
            }
            if let Some(right_deriv_image) = &channel_deriv_images[pair.right_channel] {
                let weighted = &pair.weights * right_deriv_image;
                piece += &left.apply_transpose(&weighted);
            }
            // Each contribution lands in the rows owned by the left channel.
            column
                .slice_mut(ndarray::s![left.range.clone()])
                .scaled_add(1.0, &piece);
        }
        dense.column_mut(col_idx).scaled_add(1.0, &column);
        // Restore the all-zero state for the next basis vector.
        unit[col_idx] = 0.0;
    }
    dense
}
/// Reports whether the operator is implicit: there is no dense correction
/// and at least one channel carries a ψ-derivative that is itself implicit.
fn is_implicit(&self) -> bool {
    if self.dense_correction.is_some() {
        return false;
    }
    self.channels
        .iter()
        .filter_map(|channel| channel.psi_derivative.as_ref())
        .any(|deriv| deriv.is_implicit())
}
}
/// Returns the process-wide registry of shared dense arrays.
///
/// The registry is created on first use. Keys are
/// `(data pointer, nrows, ncols)` triples and values are weak handles, so the
/// registry itself never keeps an array alive.
fn shared_dense_design_cache() -> &'static Mutex<HashMap<(usize, usize, usize), Weak<Array2<f64>>>>
{
    type Registry = Mutex<HashMap<(usize, usize, usize), Weak<Array2<f64>>>>;
    static REGISTRY: OnceLock<Registry> = OnceLock::new();
    REGISTRY.get_or_init(|| Mutex::new(HashMap::new()))
}
/// Returns a reference-counted copy of `x`, reusing a previously shared copy
/// when one is still alive *and still equal to* `x`.
///
/// The registry is keyed by the `(data pointer, nrows, ncols)` of the caller's
/// array, but the cached value is an independent clone. Pointer identity alone
/// therefore does not prove the clone is current: `x` may have been mutated in
/// place, or a different array of the same shape may have been allocated at
/// the same address after the original was freed. A cache hit is only honoured
/// after an element-wise equality check; on mismatch the entry is replaced
/// with a fresh clone.
///
/// If the registry mutex is poisoned the sharing optimisation is skipped and a
/// private copy is returned.
pub(crate) fn shared_dense_arc(x: &Array2<f64>) -> Arc<Array2<f64>> {
    let key = (x.as_ptr() as usize, x.nrows(), x.ncols());
    let Ok(mut registry) = shared_dense_design_cache().lock() else {
        // Best effort: a poisoned registry only costs us the sharing.
        return Arc::new(x.clone());
    };
    if let Some(existing) = registry.get(&key).and_then(Weak::upgrade) {
        // Guard against stale entries (in-place mutation / address reuse).
        if *existing == *x {
            return existing;
        }
    }
    // Drop entries whose arrays have already been freed before inserting.
    registry.retain(|_, entry| entry.strong_count() > 0);
    let fresh = Arc::new(x.clone());
    registry.insert(key, Arc::downgrade(&fresh));
    fresh
}
/// Resolves the first-order ψ design derivative of a block into a
/// [`PsiDesignMap`] restricted to `row_range`.
///
/// Resolution order:
/// 1. If `deriv` carries an implicit operator whose `n_data()` / `p_out()`
///    equal `n` / `p`, an operator-backed `PsiDesignMap::First` is returned.
/// 2. Otherwise, if the dense `x_psi` has shape `(n, p)`, the storage policy
///    decides: `AnalyticOperatorRequired` accepts only an all-zero `x_psi`
///    (returned as `Zero`) and rejects anything else; the other modes return
///    a `Dense` map holding the requested rows.
/// 3. Otherwise, an `x_psi` with zero rows or zero columns is treated as a
///    zero sentinel.
///
/// # Errors
/// Returns a `DimensionMismatch` message when `row_range` is inverted
/// (`start > end`), when it extends past `n`, or when `x_psi` has a non-empty
/// shape different from `(n, p)`; returns `UnsupportedConfiguration` when a
/// non-zero dense fallback is required but the policy forbids it. Errors from
/// the operator-action constructor are propagated.
pub(crate) fn resolve_custom_family_x_psi_map(
    deriv: &CustomFamilyBlockPsiDerivative,
    n: usize,
    p: usize,
    row_range: Range<usize>,
    label: &str,
    policy: &ResourcePolicy,
) -> Result<PsiDesignMap, String> {
    // An inverted range would make `end - start` underflow below (panic in
    // debug builds, a wrapped huge row count in release builds).
    if row_range.start > row_range.end {
        return Err(CustomFamilyError::DimensionMismatch {
            reason: format!(
                "{label}: row range {}..{} is inverted (start exceeds end)",
                row_range.start, row_range.end
            ),
        }
        .into());
    }
    if row_range.end > n {
        return Err(CustomFamilyError::DimensionMismatch {
            reason: format!(
                "{label}: row range {}..{} exceeds total rows {n}",
                row_range.start, row_range.end
            ),
        }
        .into());
    }
    let block_rows = row_range.end - row_range.start;
    let covers_all_rows = row_range.start == 0 && row_range.end == n;
    let zero_map = || PsiDesignMap::Zero {
        nrows: block_rows,
        ncols: p,
    };

    // Prefer operator action when dimensions match.
    if let Some(op) = deriv.implicit_operator.as_ref()
        && op.n_data() == n
        && op.p_out() == p
    {
        return Ok(PsiDesignMap::First {
            action: CustomFamilyPsiDesignAction::from_first_derivative(
                deriv, n, p, row_range, label,
            )?,
        });
    }

    // Dense fallback guarded by policy.
    if deriv.x_psi.nrows() == n && deriv.x_psi.ncols() == p {
        match policy.derivative_storage_mode {
            DerivativeStorageMode::AnalyticOperatorRequired => {
                if is_zero_array(&deriv.x_psi) {
                    return Ok(zero_map());
                }
                return Err(CustomFamilyError::UnsupportedConfiguration {
                    reason: format!(
                        "{label}: dense x_psi fallback disabled by AnalyticOperatorRequired"
                    ),
                }
                .into());
            }
            DerivativeStorageMode::MaterializeIfSmall | DerivativeStorageMode::DiagnosticsOnly => {
                let matrix = if covers_all_rows {
                    Arc::new(deriv.x_psi.clone())
                } else {
                    Arc::new(
                        deriv
                            .x_psi
                            .slice(ndarray::s![row_range.clone(), ..])
                            .to_owned(),
                    )
                };
                return Ok(PsiDesignMap::Dense { matrix });
            }
        }
    }

    // Empty / zero sentinel.
    if deriv.x_psi.nrows() == 0 || deriv.x_psi.ncols() == 0 {
        return Ok(zero_map());
    }

    Err(CustomFamilyError::DimensionMismatch {
        reason: format!(
            "{label}: x_psi shape {:?} does not match ({n}, {p})",
            deriv.x_psi.dim()
        ),
    }
    .into())
}
/// Resolves the second-order ψ design derivative for the pair
/// (`deriv_i`, `deriv_j`) into a [`PsiDesignMap`] restricted to `row_range`.
///
/// Resolution order:
/// 1. If `deriv_i` carries an implicit operator whose `n_data()` / `p_out()`
///    equal `n` / `p`: derivatives that do not share a (present) implicit
///    group id resolve to `Zero`; otherwise the second-derivative action is
///    built, and a missing action also resolves to `Zero`.
/// 2. Otherwise, if `deriv_i.x_psi_psi` has an entry at `local_j`: an entry of
///    shape `(n, p)` is handled by the storage policy exactly like the
///    first-order dense fallback; an empty entry resolves to `Zero`; any other
///    shape is an error.
/// 3. With neither an operator nor a dense slot the result is `Zero`.
///
/// # Errors
/// Returns a `DimensionMismatch` message when `row_range` is inverted
/// (`start > end`), when it extends past `n`, or when the dense slot has a
/// non-empty shape different from `(n, p)`; returns `UnsupportedConfiguration`
/// when a non-zero dense fallback is required but the policy forbids it.
/// Errors from the operator-action constructor are propagated.
pub(crate) fn resolve_custom_family_x_psi_psi_map(
    deriv_i: &CustomFamilyBlockPsiDerivative,
    deriv_j: &CustomFamilyBlockPsiDerivative,
    local_j: usize,
    n: usize,
    p: usize,
    row_range: Range<usize>,
    label: &str,
    policy: &ResourcePolicy,
) -> Result<PsiDesignMap, String> {
    // An inverted range would make `end - start` underflow below (panic in
    // debug builds, a wrapped huge row count in release builds).
    if row_range.start > row_range.end {
        return Err(CustomFamilyError::DimensionMismatch {
            reason: format!(
                "{label}: row range {}..{} is inverted (start exceeds end)",
                row_range.start, row_range.end
            ),
        }
        .into());
    }
    if row_range.end > n {
        return Err(CustomFamilyError::DimensionMismatch {
            reason: format!(
                "{label}: row range {}..{} exceeds total rows {n}",
                row_range.start, row_range.end
            ),
        }
        .into());
    }
    let block_rows = row_range.end - row_range.start;
    let covers_all_rows = row_range.start == 0 && row_range.end == n;
    let zero_map = || PsiDesignMap::Zero {
        nrows: block_rows,
        ncols: p,
    };

    // Prefer operator action when dimensions match.
    if let Some(op) = deriv_i.implicit_operator.as_ref()
        && op.n_data() == n
        && op.p_out() == p
    {
        let same_group = deriv_i.implicit_group_id.is_some()
            && deriv_i.implicit_group_id == deriv_j.implicit_group_id;
        if !same_group {
            return Ok(zero_map());
        }
        match CustomFamilyPsiSecondDesignAction::from_second_derivative(
            deriv_i,
            deriv_j,
            n,
            p,
            row_range.clone(),
            label,
        )? {
            Some(action) => {
                return Ok(PsiDesignMap::Second { action });
            }
            None => {
                return Ok(zero_map());
            }
        }
    }

    // Dense fallback guarded by policy, reading from the per-second-derivative
    // slot `x_psi_psi[local_j]` if provided.
    if let Some(x_psi_psi) = deriv_i.x_psi_psi.as_ref()
        && let Some(x_ab) = x_psi_psi.get(local_j)
    {
        if x_ab.nrows() == n && x_ab.ncols() == p {
            match policy.derivative_storage_mode {
                DerivativeStorageMode::AnalyticOperatorRequired => {
                    if is_zero_array(x_ab) {
                        return Ok(zero_map());
                    }
                    return Err(CustomFamilyError::UnsupportedConfiguration {
                        reason: format!(
                            "{label}: dense x_psi_psi fallback disabled by AnalyticOperatorRequired"
                        ),
                    }
                    .into());
                }
                DerivativeStorageMode::MaterializeIfSmall
                | DerivativeStorageMode::DiagnosticsOnly => {
                    let matrix = if covers_all_rows {
                        Arc::new(x_ab.clone())
                    } else {
                        Arc::new(x_ab.slice(ndarray::s![row_range.clone(), ..]).to_owned())
                    };
                    return Ok(PsiDesignMap::Dense { matrix });
                }
            }
        }
        if x_ab.is_empty() {
            return Ok(zero_map());
        }
        return Err(CustomFamilyError::DimensionMismatch {
            reason: format!(
                "{label}: x_psi_psi shape {:?} does not match ({n}, {p})",
                x_ab.dim()
            ),
        }
        .into());
    }

    // No operator, no dense slot: treat as zero.
    Ok(zero_map())
}
/// First-order ψ terms returned per ψ coordinate by
/// `ExactNewtonJointPsiWorkspace::first_order_terms`.
///
/// NOTE(review): the field meanings below are inferred from the field names
/// and from the second-order counterparts' documentation — confirm against the
/// families that produce this struct.
#[derive(Clone)]
pub struct ExactNewtonJointPsiTerms {
/// Presumably the ψ-derivative of the objective at fixed β — confirm.
pub objective_psi: f64,
/// Presumably the ψ-derivative of the fixed-β score (one entry per
/// coefficient) — confirm.
pub score_psi: Array1<f64>,
/// Presumably the dense ψ-derivative of the coefficient Hessian — confirm.
pub hessian_psi: Array2<f64>,
/// Optional operator-backed form of the Hessian ψ-derivative. The manual
/// `Debug` impl prints it only as the marker `"<operator>"`.
pub hessian_psi_operator: Option<Arc<dyn HyperOperator>>,
}
impl std::fmt::Debug for ExactNewtonJointPsiTerms {
    /// Formats every field; the operator handle is reduced to a presence
    /// marker (`Some("<operator>")` or `None`) rather than being printed.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let operator_marker = self.hessian_psi_operator.as_ref().map(|_| "<operator>");
        let mut printer = f.debug_struct("ExactNewtonJointPsiTerms");
        printer.field("objective_psi", &self.objective_psi);
        printer.field("score_psi", &self.score_psi);
        printer.field("hessian_psi", &self.hessian_psi);
        printer.field("hessian_psi_operator", &operator_marker);
        printer.finish()
    }
}
impl ExactNewtonJointPsiTerms {
    /// Builds all-zero terms for a coefficient space of dimension `total`:
    /// zero objective, zero score of length `total`, zero `total × total`
    /// Hessian, and no operator.
    fn zeros(total: usize) -> Self {
        let score_psi = Array1::zeros(total);
        let hessian_psi = Array2::zeros((total, total));
        Self {
            objective_psi: 0.0,
            score_psi,
            hessian_psi,
            hessian_psi_operator: None,
        }
    }
}
/// Second-order ψ terms for one `(ψ_i, ψ_j)` pair, as returned by
/// `ExactNewtonJointPsiWorkspace::second_order_terms`.
///
/// NOTE(review): the per-field meanings are inferred from the field names and
/// from the notation used on `ExactNewtonJointPsiSecondOrderContracted`
/// (`V_{ψ_i ψ_j}`, `g_{ψ_i ψ_j}`, `D²_β H_L[ψ_i, ψ_j]`) — confirm.
pub struct ExactNewtonJointPsiSecondOrderTerms {
/// Presumably the objective's mixed second ψ-derivative `V_{ψ_i ψ_j}`.
pub objective_psi_psi: f64,
/// Presumably the score's mixed second ψ-derivative `g_{ψ_i ψ_j}`.
pub score_psi_psi: Array1<f64>,
/// Presumably the dense Hessian drift for the pair.
pub hessian_psi_psi: Array2<f64>,
/// Optional operator-backed form of the Hessian drift for the pair.
pub hessian_psi_psi_operator: Option<Box<dyn HyperOperator>>,
}
/// Direction-contracted second-order ψ terms for the profiled θ-HVP (#740).
///
/// The per-pair [`ExactNewtonJointPsiSecondOrderTerms`] are the `(ψ_i, ψ_j)`
/// entries of the joint hyper-Hessian; assembling the full outer Hessian from
/// them costs one O(n) family row pass per pair, i.e. `K²·n`. A matrix-free
/// profiled θ-HVP never needs the individual pairs — it needs, for one applied
/// outer direction with ψ-weights `α_ψ`, the `α`-contraction of those pairs
/// against the combined ψ-direction `ψ(α) = Σ_j α_j ψ_j`:
///
/// ```text
/// objective[i] = Σ_j α_j V_{ψ_i ψ_j}
/// score[i] = Σ_j α_j g_{ψ_i ψ_j} (a p-vector per output row i)
/// hessian[i] = Σ_j α_j D²_β H_L[ψ_i, ψ_j]
/// = D²_β H_L[ψ_i, ψ(α)] (bilinearity)
/// ```
///
/// All `psi_dim` output rows share the SAME contracted second leg `ψ(α)`, so a
/// family that streams its rows once over `ψ(α)` (carrying every fixed first
/// leg `ψ_i` as a batched factor column) produces every row in a SINGLE n-pass.
/// That is the cost the profiled θ-HVP turns into `K·n`-to-densify /
/// `m·n`-in-CG instead of the dense path's `K²·n`.
///
/// Indexing is over the flattened ψ coordinates in the same order as
/// [`ExactNewtonJointPsiWorkspace::second_order_terms`]; `hessian[i]` carries
/// the `D²_β H_L[ψ_i, ψ(α)]` drift as a [`DriftDerivResult`] (dense or
/// operator-backed) plus any block-local `S_{ψ_i ψ_j}` penalty motion folded by
/// the family, exactly mirroring the per-pair `hessian_psi_psi(_operator)`.
///
/// NOTE(review): nothing in this type enforces that `objective`, the rows of
/// `score`, and `hessian` all have `psi_dim` entries — presumably producers
/// guarantee it; confirm at the construction sites.
pub struct ExactNewtonJointPsiSecondOrderContracted {
/// `objective[i] = Σ_j α_j V_{ψ_i ψ_j}`, one scalar per ψ output row.
pub objective: Array1<f64>,
/// `score[i] = Σ_j α_j g_{ψ_i ψ_j}`, the `psi_dim × total` matrix whose
/// row `i` is the contracted fixed-β score derivative for output row `i`.
pub score: Array2<f64>,
/// `hessian[i] = D²_β H_L[ψ_i, ψ(α)]` for each ψ output row `i`.
pub hessian: Vec<DriftDerivResult>,
}
/// Representation a workspace prefers to hand back to callers that can consume
/// either the dense coefficient Hessian or the matrix-free HVP source.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum JointHessianSourcePreference {
/// Prefer the dense coefficient Hessian. This is what the default
/// `ExactNewtonJointHessianWorkspace::hessian_source_preference` returns.
Dense,
/// Prefer the matrix-free Hessian-vector-product source.
Operator,
}
/// What the consumer is going to *do* with the joint Hessian. This is the
/// intent half of #738's capability-vs-representation split: the call site
/// states what it needs, and the workspace picks the cheapest representation
/// that serves that need (rather than a single per-workspace preference being
/// applied uniformly regardless of how the result is consumed).
///
/// The distinction matters because the same workspace serves several
/// consumers with opposite ideal representations:
/// - the inner Newton/PCG solve only ever applies `H · v`, so a matrix-free
/// HVP (`Operator`) is ideal and a dense build is pure waste;
/// - the REML logdet term factorizes `H + S_λ` (Cholesky / eigendecomposition),
/// so it must hold a dense matrix anyway — handing it an `Operator` only
/// forces an immediate column-basis (or `dense_forced`) re-materialization,
/// so a workspace with a structural direct-dense build should answer `Dense`
/// here and skip the operator wrapper entirely.
///
/// Workspaces refine their representation choice per intent via
/// [`ExactNewtonJointHessianWorkspace::hessian_source_preference_for_intent`];
/// the default keeps the legacy single-preference behaviour so existing
/// workspaces are unchanged.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum MaterializationIntent {
/// Inner Newton / PCG solve — only applies `H · v`. Matrix-free is ideal.
InnerSolve,
/// REML/LAML logdet term — factorizes `H + S_λ`, needs a dense matrix.
LogdetFactorization,
/// Outer-Hessian / EFS evaluation — builds the joint hyper terms; today
/// these route through the same source as the gradient path.
OuterEvaluation,
/// Outer-gradient / IFT term assembly. The default
/// `hessian_source_preference_for_intent` maps this, like every other
/// variant, to the workspace's single `hessian_source_preference`.
OuterGradient,
}
/// Workspace interface through which the exact-Newton machinery obtains the
/// joint coefficient Hessian (dense or matrix-free) and its directional
/// derivatives.
///
/// Only [`Self::directional_derivative`] is required. Every other method has a
/// default that either reports "not available" (`Ok(None)` / `Ok(false)` /
/// `false`) or is derived from another method of this trait.
pub trait ExactNewtonJointHessianWorkspace: Send + Sync {
/// Pre-build any per-row jet caches the workspace will hand to the
/// outer-eval directional-derivative path. Called once when the
/// `compute_dh` / `compute_d2h` closures are wired up at top-level
/// rayon, *before* the outer ext-coordinate `par_iter` enters. The
/// alternative — letting the cache materialise lazily on first call
/// from inside the outer `par_iter` — collapses the build's own
/// `par_iter` to a single worker (the seven other workers are parked
/// on the cache's `OnceLock`). Default impl is a no-op for workspaces
/// with no per-row jet cache.
///
/// Deliberately not called from PIRLS-side workspaces (which never
/// invoke `directional_derivative_operator` and would pay the prime
/// cost without ever consuming the cache).
fn warm_up_outer_caches(&self) -> Result<(), String> {
Ok(())
}
/// Dense coefficient Hessian, when the workspace chooses to provide one.
/// The default returns `Ok(None)`. See [`Self::hessian_dense_forced`] for
/// the variant that bypasses any amortization gate applied here.
fn hessian_dense(&self) -> Result<Option<Array2<f64>>, String> {
Ok(None)
}
/// Preferred representation for callers that can consume either the dense
/// coefficient Hessian or the matrix-free HVP source.
fn hessian_source_preference(&self) -> JointHessianSourcePreference {
JointHessianSourcePreference::Dense
}
/// Intent-aware representation choice (#738). Given what the consumer is
/// about to do with the Hessian ([`MaterializationIntent`]), return the
/// representation the workspace prefers to hand back. The default keeps the
/// legacy intent-blind behaviour by delegating to
/// [`Self::hessian_source_preference`], so existing workspaces are
/// unchanged. Workspaces with a structural direct-dense build that also
/// expose a matrix-free HVP override this to answer `Operator` for
/// [`MaterializationIntent::InnerSolve`] (stream the HVP) and `Dense` for
/// [`MaterializationIntent::LogdetFactorization`] (the consumer factorizes,
/// so building the operator wrapper only to re-densify it is pure waste).
fn hessian_source_preference_for_intent(
&self,
intent: MaterializationIntent,
) -> JointHessianSourcePreference {
// Intent-agnostic default: every intent maps to the single legacy
// preference. Implementors that benefit from per-intent representation
// (e.g. CTN: dense for logdet, operator for inner solve) override this.
match intent {
MaterializationIntent::InnerSolve
| MaterializationIntent::LogdetFactorization
| MaterializationIntent::OuterEvaluation
| MaterializationIntent::OuterGradient => self.hessian_source_preference(),
}
}
/// Forced dense materialization that bypasses any amortization gate the
/// workspace applies to `hessian_dense`. Callers that genuinely need a
/// dense matrix (logdet, factorize-based QP solves) use this so they pay
/// the workspace's structural direct-dense build cost rather than the
/// caller-side column-basis HVP fallback. Returning `None` means the
/// workspace has no preferred direct-dense path and the caller should
/// fall back to column-basis HVP via `hessian_matvec` / `apply`.
fn hessian_dense_forced(&self) -> Result<Option<Array2<f64>>, String> {
self.hessian_dense()
}
/// Joint log-likelihood value, when the workspace can supply it. The
/// default returns `Ok(None)`.
fn joint_log_likelihood_evaluation(&self) -> Result<Option<f64>, String> {
Ok(None)
}
/// Joint gradient evaluation, when the workspace can supply it. The
/// default returns `Ok(None)`.
fn joint_gradient_evaluation(
&self,
) -> Result<Option<ExactNewtonJointGradientEvaluation>, String> {
Ok(None)
}
/// Whether `hessian_matvec` / `hessian_matvec_into` will return `Some`.
/// A cheap synchronisation-free flag consulted by
/// `exact_newton_joint_hessian_source_from_workspace` to decide whether
/// to construct a matrix-free `JointHessianSource::Operator` variant.
/// Returning `false` is equivalent to returning `Ok(None)` from
/// `hessian_matvec` but avoids allocating and running a full HVP sweep
/// against a zero vector just to discover unavailability.
/// Default is `false` matching the base-trait `hessian_matvec` returning
/// `Ok(None)`. Concrete impls that override `hessian_matvec` must also
/// override this to return `true`.
fn hessian_matvec_available(&self) -> bool {
false
}
/// Matrix-free Hessian-vector product `H · arr`, or `Ok(None)` when the
/// workspace exposes no matrix-free apply.
///
/// # Panics
/// The default implementation asserts that `arr` contains no NaN before
/// returning `Ok(None)`, so it panics on NaN input even though it computes
/// nothing.
fn hessian_matvec(&self, arr: &Array1<f64>) -> Result<Option<Array1<f64>>, String> {
assert!(arr.iter().all(|v| !v.is_nan()));
Ok(None)
}
/// Write-into variant of `hessian_matvec`. The default implementation
/// delegates to the legacy owned-return form and copies the result into
/// `out`, providing back-compat without per-impl work. Concrete impls in
/// the inner-Newton large-scale hot path (Bernoulli marginal-slope and
/// survival marginal-slope) override this to write directly into the
/// caller-owned buffer, eliminating per-PCG-iter `Array1` allocations.
///
/// Returns `Ok(true)` when `out` was written and `Ok(false)` when
/// `hessian_matvec` returned `None`. The default returns a
/// `DimensionMismatch` error message when the product's length differs
/// from `out.len()`.
fn hessian_matvec_into(&self, v: &Array1<f64>, out: &mut Array1<f64>) -> Result<bool, String> {
match self.hessian_matvec(v)? {
Some(result) => {
if result.len() != out.len() {
return Err(CustomFamilyError::DimensionMismatch {
reason: format!(
"hessian_matvec_into: result length {} != out length {}",
result.len(),
out.len()
),
}
.into());
}
out.assign(&result);
Ok(true)
}
None => Ok(false),
}
}
/// Batched multi-RHS Hessian apply: writes `H · V` into `out`, where `V`
/// and `out` are `(total, n_rhs)` with each column an independent
/// direction. Returns `Ok(true)` when the apply was performed and
/// `Ok(false)` when the workspace exposes no matrix-free apply (mirroring
/// `hessian_matvec_into`).
///
/// The default implementation applies `hessian_matvec_into` column by
/// column, so every existing workspace gets a correct batched apply for
/// free and the batched result is, column for column, **numerically
/// identical** to looping the single-vector HVP. Workspaces whose Hessian
/// is `Σ_i Jᵢᵀ Hᵢ Jᵢ` over a streamed/tiled per-row primary Hessian `Hᵢ`
/// (Bernoulli marginal-slope) override this to sweep each row tile **once**
/// and apply its `Hᵢ` to all `n_rhs` columns in that single pass — the
/// per-tile `Hᵢ` read and the design-row projection are then amortised
/// across every RHS instead of paid once per column. This is the
/// representation that makes dense reconstruction of a matrix-free operator
/// (`H = H · [e_0 | … | e_{p-1}]`) one tile sweep wide instead of `p`.
///
/// The default returns a `DimensionMismatch` error message when `v_cols`
/// and `out` differ in shape. When a column reports `Ok(false)` the default
/// returns `Ok(false)` immediately; columns already written to `out` are
/// left in place.
fn hessian_apply_mat(
&self,
v_cols: &Array2<f64>,
out: &mut Array2<f64>,
) -> Result<bool, String> {
if v_cols.nrows() != out.nrows() || v_cols.ncols() != out.ncols() {
return Err(CustomFamilyError::DimensionMismatch {
reason: format!(
"hessian_apply_mat: v_cols {}x{} != out {}x{}",
v_cols.nrows(),
v_cols.ncols(),
out.nrows(),
out.ncols()
),
}
.into());
}
let total = v_cols.nrows();
// Column staging buffers, reused across all right-hand sides.
let mut col_in = Array1::<f64>::zeros(total);
let mut col_out = Array1::<f64>::zeros(total);
for col in 0..v_cols.ncols() {
col_in.assign(&v_cols.column(col));
if !self.hessian_matvec_into(&col_in, &mut col_out)? {
return Ok(false);
}
out.column_mut(col).assign(&col_out);
}
Ok(true)
}
/// Diagonal of the coefficient Hessian, when the workspace can supply it.
/// The default returns `Ok(None)`.
fn hessian_diagonal(&self) -> Result<Option<Array1<f64>>, String> {
Ok(None)
}
/// Exact row-local contractions for
/// `trace(F^T · D_beta H[d_j] · F)` over many coefficient directions.
///
/// Workspaces that own the current row cache can implement this to avoid
/// rebuilding row contexts or materializing each `D_beta H[d_j]` as a
/// coefficient-space operator when the caller only needs its projected
/// trace against the fixed logdet factor `F`.
///
/// # Panics
/// The default implementation asserts that `factor` and `directions` have
/// the same number of rows, then returns `Ok(None)`.
fn projected_directional_derivative_traces(
&self,
factor: &Array2<f64>,
directions: &Array2<f64>,
) -> Result<Option<Array1<f64>>, String> {
assert_eq!(
factor.nrows(),
directions.nrows(),
"projected directional derivative traces require shared coefficient dimension"
);
Ok(None)
}
/// Dense directional derivative of the Hessian along the flattened
/// coefficient direction `d_beta_flat` (presumably the `D_beta H[d]`
/// referred to above — confirm), or `Ok(None)` when unavailable. This is
/// the only required method of the trait.
fn directional_derivative(
&self,
d_beta_flat: &Array1<f64>,
) -> Result<Option<Array2<f64>>, String>;
/// Operator form of [`Self::directional_derivative`]. The default wraps
/// the dense result in a `DenseMatrixHyperOperator`.
fn directional_derivative_operator(
&self,
d_beta_flat: &Array1<f64>,
) -> Result<Option<Arc<dyn HyperOperator>>, String> {
Ok(self.directional_derivative(d_beta_flat)?.map(|matrix| {
Arc::new(crate::solver::estimate::reml::unified::DenseMatrixHyperOperator { matrix })
as Arc<dyn HyperOperator>
}))
}
/// Batched form of [`Self::directional_derivative_operator`]. The default
/// evaluates the directions one by one, in order, and stops at the first
/// error.
fn directional_derivative_operators(
&self,
d_beta_flats: &[Array1<f64>],
) -> Result<Vec<Option<Arc<dyn HyperOperator>>>, String> {
d_beta_flats
.iter()
.map(|d_beta_flat| self.directional_derivative_operator(d_beta_flat))
.collect()
}
/// Dense second directional derivative of the Hessian along the two
/// coefficient directions, or `Ok(None)` when unavailable.
///
/// # Panics
/// The default implementation asserts that neither argument contains NaN
/// before returning `Ok(None)`.
fn second_directional_derivative(
&self,
arr: &Array1<f64>,
arr2: &Array1<f64>,
) -> Result<Option<Array2<f64>>, String> {
assert!(arr.iter().all(|v| !v.is_nan()));
assert!(arr2.iter().all(|v| !v.is_nan()));
Ok(None)
}
/// Operator form of [`Self::second_directional_derivative`]. The default
/// wraps the dense result in a `DenseMatrixHyperOperator`.
fn second_directional_derivative_operator(
&self,
d_beta_u: &Array1<f64>,
d_beta_v: &Array1<f64>,
) -> Result<Option<Arc<dyn HyperOperator>>, String> {
Ok(self
.second_directional_derivative(d_beta_u, d_beta_v)?
.map(|matrix| {
Arc::new(
crate::solver::estimate::reml::unified::DenseMatrixHyperOperator { matrix },
) as Arc<dyn HyperOperator>
}))
}
/// Batched form of [`Self::second_directional_derivative_operator`]. The
/// default evaluates the pairs one by one, in order, and stops at the first
/// error.
fn second_directional_derivative_operators(
&self,
d_beta_pairs: &[(Array1<f64>, Array1<f64>)],
) -> Result<Vec<Option<Arc<dyn HyperOperator>>>, String> {
d_beta_pairs
.iter()
.map(|(u, v)| self.second_directional_derivative_operator(u, v))
.collect()
}
}
/// Workspace interface supplying ψ-derivative terms of the exact-Newton joint
/// objective: first-order terms per ψ coordinate, second-order terms per ψ
/// pair (optionally direction-contracted), and ψ/β mixed Hessian drifts.
///
/// [`Self::second_order_terms`] and [`Self::hessian_directional_derivative`]
/// are required; the remaining methods default to `Ok(None)`.
pub trait ExactNewtonJointPsiWorkspace: Send + Sync {
/// First-order ψ terms for the ψ coordinate `idx`, or `Ok(None)` when the
/// workspace does not provide them (the default).
///
/// # Panics
/// The default implementation asserts `idx < usize::MAX`, so it panics only
/// for `idx == usize::MAX`.
fn first_order_terms(&self, idx: usize) -> Result<Option<ExactNewtonJointPsiTerms>, String> {
assert!(idx < usize::MAX);
Ok(None)
}
/// First-order ψ terms for every ψ coordinate at once, or `Ok(None)` when
/// the workspace has no batched path (the default).
fn first_order_terms_all(&self) -> Result<Option<Vec<ExactNewtonJointPsiTerms>>, String> {
Ok(None)
}
/// Second-order ψ terms for the pair `(psi_i, psi_j)`, or `Ok(None)` when
/// unavailable.
fn second_order_terms(
&self,
psi_i: usize,
psi_j: usize,
) -> Result<Option<ExactNewtonJointPsiSecondOrderTerms>, String>;
/// Direction-contracted second-order ψ terms for the profiled θ-HVP (#740).
///
/// Given the ψ-block weights `alpha_psi` (length `psi_dim`, the ψ slice of
/// one applied outer direction α), return the `α`-contraction of every
/// `(ψ_i, ψ_j)` second-order term against the combined ψ-direction
/// `ψ(α) = Σ_j alpha_psi[j] · ψ_j`, as
/// [`ExactNewtonJointPsiSecondOrderContracted`]. A family that can stream
/// its rows once over `ψ(α)` overrides this so the profiled outer-Hessian
/// operator applies one combined-direction n-pass per matvec instead of the
/// dense path's `K²` per-pair [`Self::second_order_terms`] passes.
///
/// Default returns `None`: the profiled θ-HVP operator is then not built and
/// the evaluator keeps the exact per-pair assembly (dense
/// `compute_outer_hessian` / `build_outer_hessian_operator`). Overriding
/// this method is purely a representation/cost choice — it must produce the
/// exact same contraction the per-pair terms would, which the
/// `profiled_theta_hvp_outer_hessian_fd` finite-difference cross-check
/// guards.
fn second_order_terms_contracted(
&self,
alpha_psi: &[f64],
) -> Result<Option<ExactNewtonJointPsiSecondOrderContracted>, String> {
assert!(alpha_psi.len() < usize::MAX);
Ok(None)
}
/// Drift of the Hessian for ψ coordinate `psi_index` along the flattened
/// coefficient direction `d_beta_flat`, as a [`DriftDerivResult`], or
/// `Ok(None)` when unavailable.
fn hessian_directional_derivative(
&self,
psi_index: usize,
d_beta_flat: &Array1<f64>,
) -> Result<Option<DriftDerivResult>, String>;
}
/// Fixed-size, index-addressed memo table whose slots are filled lazily by
/// `get_or_try_init` and tracked in a recency queue.
pub(crate) struct ExactNewtonJointPsiDirectCache<T> {
/// One mutex-guarded slot per index. The outer `Option` is `None` until the
/// slot has been filled (and again after eviction); the inner
/// `Option<Arc<T>>` is the memoised initializer result, which may itself be
/// `None`.
entries: Vec<Mutex<Option<Option<Arc<T>>>>>,
/// Recency queue of touched indices: least recently touched at the front,
/// most recently touched at the back.
lru: Mutex<std::collections::VecDeque<usize>>,
/// Maximum number of indices kept in `lru` before the front is evicted.
/// `new` sets it to the number of slots.
limit: usize,
}
impl<T> ExactNewtonJointPsiDirectCache<T> {
    /// Creates a cache with `len` empty slots and an eviction limit of `len`.
    pub(crate) fn new(len: usize) -> Self {
        let mut entries = Vec::with_capacity(len);
        for _ in 0..len {
            entries.push(Mutex::new(None));
        }
        Self {
            entries,
            lru: Mutex::new(std::collections::VecDeque::new()),
            limit: len,
        }
    }

    /// Marks `index` as most recently used and clears the slots of any indices
    /// that fall off the front of the recency queue once it exceeds `limit`.
    /// The index being touched is never cleared.
    fn touch_lru(&self, index: usize) -> Result<(), String> {
        let mut recency = self
            .lru
            .lock()
            .map_err(|_| "joint psi direct cache lru poisoned".to_string())?;
        recency.retain(|&existing| existing != index);
        recency.push_back(index);
        while recency.len() > self.limit {
            match recency.pop_front() {
                None => break,
                Some(victim) if victim == index => continue,
                Some(victim) => {
                    if let Some(slot) = self.entries.get(victim) {
                        let mut slot_guard = slot
                            .lock()
                            .map_err(|_| "joint psi direct cache poisoned".to_string())?;
                        *slot_guard = None;
                    }
                }
            }
        }
        Ok(())
    }

    /// Returns the memoised value for `index`, running `init` to fill the slot
    /// when it is empty.
    ///
    /// `init` runs without the slot lock held, so concurrent callers may each
    /// run it; the first result stored wins and is what every caller receives.
    /// An `init` result of `None` is memoised as well.
    ///
    /// # Errors
    /// Returns a `DimensionMismatch` message for an out-of-range `index`, a
    /// poison message when a mutex is poisoned, and propagates errors from
    /// `init`.
    pub(crate) fn get_or_try_init<F>(&self, index: usize, init: F) -> Result<Option<Arc<T>>, String>
    where
        F: FnOnce() -> Result<Option<T>, String>,
    {
        let entry = match self.entries.get(index) {
            Some(entry) => entry,
            None => {
                return Err(CustomFamilyError::DimensionMismatch {
                    reason: format!(
                        "psi cache index {index} out of bounds for size {}",
                        self.entries.len()
                    ),
                }
                .into());
            }
        };

        // Fast path: the slot lock is released at the end of this scope, so
        // the recency update below never runs with the slot lock held.
        let hit = {
            let slot_guard = entry
                .lock()
                .map_err(|_| "joint psi direct cache poisoned".to_string())?;
            (*slot_guard).clone()
        };
        if let Some(cached) = hit {
            self.touch_lru(index)?;
            return Ok(cached);
        }

        // Slow path: compute outside the lock, then publish unless another
        // caller filled the slot in the meantime.
        let computed = init()?.map(Arc::new);
        let stored = {
            let mut slot_guard = entry
                .lock()
                .map_err(|_| "joint psi direct cache poisoned".to_string())?;
            slot_guard.get_or_insert_with(|| computed.clone()).clone()
        };
        self.touch_lru(index)?;
        Ok(stored)
    }
}
/// Opaque warm-start payload for custom-family fits. It wraps a
/// `ConstrainedWarmStart` and exposes only the accessors defined on this type.
#[derive(Clone)]
pub struct CustomFamilyWarmStart {
/// The wrapped warm start (per-block β, ρ, active sets, cached inner state).
inner: ConstrainedWarmStart,
}
impl CustomFamilyWarmStart {
    /// Whether the wrapped warm start passes `screened_outer_warm_start` for
    /// the given `rho`.
    pub(crate) fn compatible_with_rho(&self, rho: &Array1<f64>) -> bool {
        let screened = screened_outer_warm_start(Some(&self.inner), rho);
        screened.is_some()
    }

    /// Number of coefficients stored for block `block_idx`, or `None` when no
    /// such block exists.
    pub(crate) fn block_beta_len(&self, block_idx: usize) -> Option<usize> {
        let beta = self.inner.block_beta.get(block_idx)?;
        Some(beta.len())
    }

    /// Finds the coefficient with the largest finite absolute value inside
    /// `range` of block `block_idx`.
    ///
    /// The range end is clamped to the block length. Returns the coefficient's
    /// index within the block together with its absolute value, or `None` when
    /// the block is missing, the clamped range is empty, or no entry in it is
    /// finite. Among equal maxima the last one is returned.
    pub(crate) fn block_beta_abs_argmax_in_range(
        &self,
        block_idx: usize,
        range: std::ops::Range<usize>,
    ) -> Option<(usize, f64)> {
        let beta = self.inner.block_beta.get(block_idx)?;
        let start = range.start;
        let stop = range.end.min(beta.len());
        if start >= stop {
            return None;
        }
        let mut best: Option<(usize, f64)> = None;
        for (offset, &value) in beta.slice(s![start..stop]).iter().enumerate() {
            let magnitude = value.abs();
            if !magnitude.is_finite() {
                continue;
            }
            // `>=` keeps the last of several equal maxima.
            let replace = match best {
                None => true,
                Some((_, current)) => magnitude >= current,
            };
            if replace {
                best = Some((start + offset, magnitude));
            }
        }
        best
    }

    /// Build a warm-start payload from a flat cached β and the per-block
    /// coefficient widths. The returned warm-start carries an empty `rho`
    /// and no active sets or cached inner state; only the per-block β slices
    /// are populated.
    ///
    /// # Errors
    /// Fails when `beta.len()` differs from the sum of `block_col_counts`, or
    /// when the cached β is rejected by `bail_if_cached_beta_non_finite`.
    pub fn from_cached_beta(
        block_col_counts: &[usize],
        beta: &Array1<f64>,
    ) -> Result<Self, EstimationError> {
        let expected: usize = block_col_counts.iter().copied().sum();
        if beta.len() != expected {
            crate::bail_invalid_estim!(
                "cached inner beta has length {}, but spatial-joint blocks require length {}",
                beta.len(),
                expected
            );
        }
        crate::families::marginal_slope_shared::bail_if_cached_beta_non_finite(beta)?;

        // Carve the flat vector into consecutive per-block slices.
        let mut cursor = 0usize;
        let block_beta: Vec<Array1<f64>> = block_col_counts
            .iter()
            .map(|&width| {
                let begin = cursor;
                cursor = begin + width;
                beta.slice(s![begin..cursor]).to_owned()
            })
            .collect();

        let inner = ConstrainedWarmStart {
            rho: Array1::zeros(0),
            block_beta,
            active_sets: vec![None; block_col_counts.len()],
            cached_inner: None,
        };
        Ok(CustomFamilyWarmStart { inner })
    }
}
/// Mutable state carried across outer-objective evaluations.
struct CustomOuterState {
/// Warm start currently in use; overwritten by `reset` and
/// `seed_cached_beta`.
warm_cache: Option<ConstrainedWarmStart>,
/// Snapshot that `reset` restores into `warm_cache`.
reset_warm_cache: Option<ConstrainedWarmStart>,
/// Last recorded error message, if any; cleared by `seed_cached_beta`.
last_error: Option<String>,
/// Starts as `None`; presumably filled with the gradient norm of the first
/// outer evaluation — confirm at the write site.
initial_gradient_norm: Option<f64>,
}
impl CustomOuterState {
    /// Creates the state with `warm_start` as both the active warm start and
    /// the reset snapshot; no error and no gradient norm are recorded yet.
    fn new(warm_start: Option<ConstrainedWarmStart>) -> Self {
        let reset_warm_cache = warm_start;
        let warm_cache = reset_warm_cache.clone();
        Self {
            warm_cache,
            reset_warm_cache,
            last_error: None,
            initial_gradient_norm: None,
        }
    }

    /// Restores the active warm start from the reset snapshot. Other fields
    /// are left untouched.
    fn reset(&mut self) {
        self.warm_cache = self.reset_warm_cache.as_ref().cloned();
    }

    /// Replaces both the active warm start and the reset snapshot with one
    /// built from a cached flat β, and clears the last error.
    ///
    /// # Errors
    /// Propagates the error from `constrained_warm_start_from_cached_beta`; the
    /// state is not modified in that case.
    fn seed_cached_beta(
        &mut self,
        rho_dim: usize,
        specs: &[ParameterBlockSpec],
        beta: &Array1<f64>,
    ) -> Result<(), EstimationError> {
        let seeded = constrained_warm_start_from_cached_beta(rho_dim, specs, beta)?;
        self.warm_cache = Some(seeded.clone());
        self.reset_warm_cache = Some(seeded);
        self.last_error = None;
        Ok(())
    }
}
/// Result of one joint hyper-parameter evaluation for a custom family.
pub struct CustomFamilyJointHyperResult {
/// Value of the outer objective at the evaluated hyper-parameters.
pub objective: f64,
/// Gradient of the outer objective; see `inner_converged` for when it may be
/// trusted.
pub gradient: Array1<f64>,
/// Outer Hessian in whatever form the evaluation produced; see
/// `inner_converged` for when it may be trusted.
pub outer_hessian: crate::solver::outer_strategy::HessianResult,
/// Warm-start payload captured from this evaluation.
pub warm_start: CustomFamilyWarmStart,
/// `false` when the inner blockwise/Newton solve hit its divergence
/// early-exit or its max-cycle cap. Envelope-theorem outer gradients
/// and analytic outer Hessians are valid only at a stationary β̂ —
/// callers that consume `gradient`/`outer_hessian` MUST gate on this
/// flag and treat non-converged evaluations as inexact (e.g. let ARC
/// back off the trust region) rather than feeding pathological
/// derivatives into the outer optimizer.
pub inner_converged: bool,
}
/// Result of one joint hyper-parameter EFS evaluation for a custom family.
pub struct CustomFamilyJointHyperEfsResult {
/// The EFS evaluation payload.
pub efs_eval: crate::solver::outer_strategy::EfsEval,
/// Warm-start payload captured from this evaluation.
pub warm_start: CustomFamilyWarmStart,
/// See [`CustomFamilyJointHyperResult::inner_converged`]. EFS gradients
/// also assume a stationary inner solve.
pub inner_converged: bool,
}
/// Internal counterpart of `CustomFamilyJointHyperResult`: the same fields,
/// but carrying the raw `ConstrainedWarmStart` instead of the public wrapper.
/// Converted by `outer_eval_result_to_joint_hyper_result`.
struct OuterObjectiveEvalResult {
/// Value of the outer objective.
objective: f64,
/// Gradient of the outer objective.
gradient: Array1<f64>,
/// Outer Hessian in whatever form the evaluation produced.
outer_hessian: crate::solver::outer_strategy::HessianResult,
/// Raw warm start captured from this evaluation.
warm_start: ConstrainedWarmStart,
/// Whether the inner solve converged; forwarded unchanged to the public
/// result.
inner_converged: bool,
}
fn outer_eval_result_to_joint_hyper_result(
result: OuterObjectiveEvalResult,
) -> CustomFamilyJointHyperResult {
CustomFamilyJointHyperResult {
objective: result.objective,
gradient: result.gradient,
outer_hessian: result.outer_hessian,
warm_start: CustomFamilyWarmStart {
inner: result.warm_start,
},
inner_converged: result.inner_converged,
}
}
/// Outer-Hessian operator backed by an owned dense matrix.
struct OwnedDenseOuterHessianOperator {
/// The dense matrix. `dim()` reports its row count, and products are
/// validated against its column count.
matrix: Array2<f64>,
}
impl crate::solver::outer_strategy::OuterHessianOperator for OwnedDenseOuterHessianOperator {
    /// Operator dimension: the number of rows of the stored matrix.
    fn dim(&self) -> usize {
        self.matrix.nrows()
    }

    /// Returns `matrix · v` as a freshly allocated vector.
    ///
    /// # Errors
    /// Returns a `DimensionMismatch` message when `v.len()` differs from the
    /// matrix column count.
    fn matvec(&self, v: &Array1<f64>) -> Result<Array1<f64>, String> {
        let expected = self.matrix.ncols();
        if v.len() == expected {
            return Ok(self.matrix.dot(v));
        }
        Err(CustomFamilyError::DimensionMismatch {
            reason: format!(
                "batched dense outer Hessian matvec length mismatch: got {}, expected {}",
                v.len(),
                expected
            ),
        }
        .into())
    }

    /// Zero-alloc override: writes `matrix · v` into `out` one row-dot at a
    /// time instead of allocating the product.
    ///
    /// # Errors
    /// Returns a `DimensionMismatch` message when `v.len()` differs from the
    /// column count or `out.len()` differs from the row count. The input is
    /// checked first.
    fn apply_into(&self, v: &Array1<f64>, out: &mut Array1<f64>) -> Result<(), String> {
        let (n_rows, n_cols) = (self.matrix.nrows(), self.matrix.ncols());
        if v.len() != n_cols {
            return Err(CustomFamilyError::DimensionMismatch {
                reason: format!(
                    "batched dense outer Hessian apply_into input length mismatch: got {}, expected {}",
                    v.len(),
                    n_cols
                ),
            }
            .into());
        }
        if out.len() != n_rows {
            return Err(CustomFamilyError::DimensionMismatch {
                reason: format!(
                    "batched dense outer Hessian apply_into output length mismatch: got {}, expected {}",
                    out.len(),
                    n_rows
                ),
            }
            .into());
        }
        for (cell, row) in out.iter_mut().zip(self.matrix.rows()) {
            *cell = row.dot(v);
        }
        Ok(())
    }

    /// The matrix is already dense, so materializing it is always cheap.
    fn is_cheap_to_materialize(&self) -> bool {
        true
    }
}
/// Adapter exposing an operator defined over "physical" coordinates as an
/// operator over "outer" coordinates, using an index map between the two.
struct LabeledOuterHessianOperator {
/// The wrapped operator, acting on vectors of length
/// `physical_to_outer.len()`.
base: Arc<dyn crate::solver::outer_strategy::OuterHessianOperator>,
/// For each physical index, the outer index it is tied to. `None` entries
/// receive a zero input and their outputs are discarded; several physical
/// indices may share one outer index, in which case their outputs are
/// summed.
physical_to_outer: Vec<Option<usize>>,
/// Dimension reported by `dim()`; taken from `layout.initial_rho.len()`.
outer_dim: usize,
/// Scratch buffers reused across `apply_into` calls to avoid
/// per-call allocation of the permuted input and output vectors.
/// `(physical_in, physical_out)`, each of length `physical_to_outer.len()`.
scratch: std::sync::Mutex<(ndarray::Array1<f64>, ndarray::Array1<f64>)>,
}
impl LabeledOuterHessianOperator {
    /// Wraps `base` with the index map from `layout`, sizing the outer
    /// dimension from `layout.initial_rho` and pre-allocating two zeroed
    /// scratch vectors of physical length.
    fn new(
        base: Arc<dyn crate::solver::outer_strategy::OuterHessianOperator>,
        layout: &PenaltyLabelLayout,
    ) -> Self {
        let physical_to_outer = layout.physical_to_outer.clone();
        let physical_len = layout.physical_to_outer.len();
        let scratch = std::sync::Mutex::new((
            ndarray::Array1::zeros(physical_len),
            ndarray::Array1::zeros(physical_len),
        ));
        Self {
            base,
            physical_to_outer,
            outer_dim: layout.initial_rho.len(),
            scratch,
        }
    }
}
impl crate::solver::outer_strategy::OuterHessianOperator for LabeledOuterHessianOperator {
/// Dimension of the outer coordinate space this adapter operates in.
fn dim(&self) -> usize {
self.outer_dim
}
/// Applies the wrapped operator in outer coordinates.
///
/// `v` is scattered into physical coordinates (unmapped physical entries get
/// `0.0`), the base operator is applied, and the physical result is gathered
/// back by summing every physical entry into its outer index.
///
/// # Errors
/// Returns a message when `v.len()` differs from the outer dimension or when
/// the base operator returns a vector of unexpected length; base-operator
/// errors are propagated.
fn matvec(&self, v: &Array1<f64>) -> Result<Array1<f64>, String> {
    if v.len() != self.outer_dim {
        return Err(format!(
            "labeled outer Hessian input length mismatch: got {}, expected {}",
            v.len(),
            self.outer_dim
        ));
    }
    let n_physical = self.physical_to_outer.len();

    // Scatter: outer -> physical.
    let mut physical = Array1::<f64>::zeros(n_physical);
    for (dst, slot) in physical.iter_mut().zip(self.physical_to_outer.iter()) {
        *dst = match *slot {
            Some(outer_idx) => v[outer_idx],
            None => 0.0,
        };
    }

    let physical_out = self.base.matvec(&physical)?;
    if physical_out.len() != n_physical {
        return Err(format!(
            "labeled outer Hessian physical matvec length mismatch: got {}, expected {}",
            physical_out.len(),
            n_physical
        ));
    }

    // Gather: physical -> outer, accumulating shared outer indices.
    let mut out = Array1::<f64>::zeros(self.outer_dim);
    for (slot, &value) in self.physical_to_outer.iter().zip(physical_out.iter()) {
        if let Some(outer_idx) = *slot {
            out[outer_idx] += value;
        }
    }
    Ok(out)
}
/// Zero-alloc override of `matvec`: performs the same scatter / apply /
/// gather, but stages the physical input and output in the shared scratch
/// buffers and writes the result into `out`.
///
/// The scratch mutex is held for the duration of the call.
///
/// # Errors
/// Returns a message when `v` or `out` does not have the outer dimension,
/// when the scratch mutex is poisoned, or when the physical output buffer
/// does not have the physical length after the base apply; base-operator
/// errors are propagated.
fn apply_into(
    &self,
    v: &ndarray::Array1<f64>,
    out: &mut ndarray::Array1<f64>,
) -> Result<(), String> {
    if v.len() != self.outer_dim {
        return Err(format!(
            "labeled outer Hessian apply_into input length mismatch: got {}, expected {}",
            v.len(),
            self.outer_dim
        ));
    }
    if out.len() != self.outer_dim {
        return Err(format!(
            "labeled outer Hessian apply_into output length mismatch: got {}, expected {}",
            out.len(),
            self.outer_dim
        ));
    }
    let mut scratch = self
        .scratch
        .lock()
        .map_err(|_| "labeled outer Hessian scratch lock poisoned".to_string())?;
    let (physical_in, physical_out) = &mut *scratch;

    // Scatter: outer -> physical.
    for (physical_idx, slot) in self.physical_to_outer.iter().enumerate() {
        physical_in[physical_idx] = match *slot {
            Some(outer_idx) => v[outer_idx],
            None => 0.0,
        };
    }

    self.base.apply_into(physical_in, physical_out)?;
    let n_physical = self.physical_to_outer.len();
    if physical_out.len() != n_physical {
        return Err(format!(
            "labeled outer Hessian physical apply_into length mismatch: got {}, expected {}",
            physical_out.len(),
            n_physical
        ));
    }

    // Gather: physical -> outer, accumulating shared outer indices.
    out.fill(0.0);
    for (slot, &value) in self.physical_to_outer.iter().zip(physical_out.iter()) {
        if let Some(outer_idx) = *slot {
            out[outer_idx] += value;
        }
    }
    Ok(())
}
fn mul_mat(&self, factor: ndarray::ArrayView2<'_, f64>) -> Result<Array2<f64>, String> {
if factor.nrows() != self.outer_dim {
return Err(format!(
"labeled outer Hessian factor row mismatch: got {}, expected {}",
factor.nrows(),
self.outer_dim
));
}
let mut physical_factor =
Array2::<f64>::zeros((self.physical_to_outer.len(), factor.ncols()));
for (physical_idx, outer_idx) in self.physical_to_outer.iter().enumerate() {
if let Some(outer_idx) = *outer_idx {
physical_factor
.row_mut(physical_idx)
.assign(&factor.row(outer_idx));
}
}
let physical_out = self.base.mul_mat(physical_factor.view())?;
if physical_out.nrows() != self.physical_to_outer.len() {
return Err(format!(
"labeled outer Hessian physical output row mismatch: got {}, expected {}",
physical_out.nrows(),
self.physical_to_outer.len()
));
}
let mut out = Array2::<f64>::zeros((self.outer_dim, factor.ncols()));
for (physical_idx, outer_idx) in self.physical_to_outer.iter().enumerate() {
if let Some(outer_idx) = *outer_idx {
let physical_row = physical_out.row(physical_idx);
out.row_mut(outer_idx).scaled_add(1.0, &physical_row);
}
}
Ok(out)
}
fn is_cheap_to_materialize(&self) -> bool {
self.base.is_cheap_to_materialize()
}
fn materialization_capability(
&self,
) -> crate::solver::outer_strategy::OuterHessianMaterialization {
self.base.materialization_capability()
}
}
/// Ask the family for batched outer-Hessian terms and return them as an
/// operator, if one is available.
///
/// Returns `Ok(None)` when `eval_mode` is not `ValueGradientHessian`, when the
/// family supplies no batched terms, or when the terms report the Hessian as
/// unavailable. A dense analytic Hessian is wrapped in
/// `OwnedDenseOuterHessianOperator`; an operator result is passed through.
fn custom_family_batched_outer_hessian_operator<F: CustomFamily>(
    family: &F,
    states: &[ParameterBlockState],
    specs: &[ParameterBlockSpec],
    derivative_blocks: &[Vec<CustomFamilyBlockPsiDerivative>],
    rho: &Array1<f64>,
    workspace: Option<Arc<dyn ExactNewtonJointHessianWorkspace>>,
    eval_mode: EvalMode,
) -> Result<Option<Arc<dyn crate::solver::outer_strategy::OuterHessianOperator>>, String> {
    if eval_mode != EvalMode::ValueGradientHessian {
        return Ok(None);
    }
    let terms = match family.batched_outer_hessian_terms(
        states,
        specs,
        derivative_blocks,
        rho,
        workspace,
    )? {
        Some(terms) => terms,
        None => return Ok(None),
    };
    let operator = match terms.outer_hessian {
        crate::solver::outer_strategy::HessianResult::Unavailable => None,
        crate::solver::outer_strategy::HessianResult::Operator(operator) => Some(operator),
        crate::solver::outer_strategy::HessianResult::Analytic(matrix) => {
            let dense: Arc<dyn crate::solver::outer_strategy::OuterHessianOperator> =
                Arc::new(OwnedDenseOuterHessianOperator { matrix });
            Some(dense)
        }
    };
    Ok(operator)
}
/// Package an EFS evaluation, the inner warm start and the inner convergence
/// flag into a `CustomFamilyJointHyperEfsResult`, wrapping the warm start in
/// `CustomFamilyWarmStart`.
fn outer_efs_result_to_joint_hyper_efs_result(
    efs_eval: crate::solver::outer_strategy::EfsEval,
    warm_start: ConstrainedWarmStart,
    inner_converged: bool,
) -> CustomFamilyJointHyperEfsResult {
    let warm_start = CustomFamilyWarmStart { inner: warm_start };
    CustomFamilyJointHyperEfsResult {
        efs_eval,
        warm_start,
        inner_converged,
    }
}
// Unified exact joint hyper-calculus over theta = [rho, psi].
//
// The correct outer problem is not “a rho objective plus a separate psi
// objective”. It is one profiled/Laplace surface over one flattened hypervector
//
// theta = [rho, psi],
//
// one flattened joint coefficient vector
//
// beta = [beta_1; ...; beta_B],
//
// and one joint exact mode system
//
// F(beta, theta) := V_beta(beta, theta) = 0,
// H(beta, theta) := V_beta_beta(beta, theta).
//
// For every hypercoordinate theta_i we need the fixed-beta objects
//
// V_i = partial_{theta_i} V,
// g_i = partial_{theta_i} F,
// H_i = partial_{theta_i} H,
//
// and for every pair (i, j)
//
// V_ij, g_ij, H_ij,
//
// together with the beta-curvature contractions
//
// D_beta H[u],
// D_beta^2 H[u, v],
// T_i[u] := D_beta H_i[u].
//
// The exact profiled mode response and total Hessian drifts are then
//
// beta_i = -H^{-1} g_i,
// beta_ij = -H^{-1}(g_ij + H_i beta_j + H_j beta_i + D_beta H[beta_i] beta_j),
//
// dot H_i
// = H_i + D_beta H[beta_i],
//
// ddot H_ij
// = H_ij
// + T_i[beta_j]
// + T_j[beta_i]
// + D_beta H[beta_ij]
// + D_beta^2 H[beta_i, beta_j].
//
// Hence the exact joint profiled/Laplace derivatives are
//
// J_i
// = V_i + 0.5 tr(H^{-1} dot H_i) - 0.5 partial_i log|S(theta)|_+,
//
// J_ij
// = (V_ij - g_i^T H^{-1} g_j)
// + 0.5 [ tr(H^{-1} ddot H_ij)
// - tr(H^{-1} dot H_j H^{-1} dot H_i) ]
// - 0.5 partial^2_{ij} log|S(theta)|_+.
//
// In this unified view rho and psi are the same outer calculus. They differ
// only in where their fixed-beta derivative objects come from:
//
// - rho coordinates often contribute only through the penalty surface,
// but the generic assembler intentionally treats the penalty as S(theta),
// not S(rho), so mixed rho/psi penalty terms are allowed whenever realized
// component penalties move with psi:
// V_i = D_i + 0.5 beta^T S_i beta
// g_i = D_beta_i + S_i beta
// H_i = D_beta_beta_i + S_i
// V_ij = D_ij + 0.5 beta^T S_ij beta
// g_ij = D_beta_ij + S_ij beta
// H_ij = D_beta_beta_ij + S_ij.
//
// - psi coordinates come from the family-specific joint exact psi hooks, while
// the generic assembler still owns any realized-penalty motion through
// S_i / S_ij:
// objective_psi <-> V_i
// score_psi <-> g_i
// hessian_psi <-> H_i
// objective_psi_psi <-> V_ij
// score_psi_psi <-> g_ij
// hessian_psi_psi <-> H_ij
// D_beta H_psi[u] <-> T_i[u].
//
// For coupled families this means any block-local psi path is wrong. Even when
// g_i is sparse or penalty-local, beta_i is defined by the full joint solve
//
// beta_i = -H^{-1} g_i,
//
// so every exact outer derivative must be assembled in this joint flattened
// space.
/// Structured error type for the custom-family fitting code.
///
/// Values convert to and from `String` (see the `From` impls that follow this
/// type), which lets them cross the `Result<_, String>` boundaries used by the
/// helpers in this file. Variants that carry only a `reason` display as that
/// reason verbatim.
#[derive(Debug, Clone, Error)]
pub enum CustomFamilyError {
/// Input was rejected; `context` names where and `reason` says why. This is
/// also the variant a plain `String` is converted into.
#[error("custom-family invalid input in {context}: {reason}")]
InvalidInput {
context: &'static str,
reason: String,
},
/// An optimization step failed; `context` names where and `reason` says why.
#[error("custom-family optimization error in {context}: {reason}")]
Optimization {
context: &'static str,
reason: String,
},
/// Lengths or shapes of related inputs disagree.
#[error("{reason}")]
DimensionMismatch { reason: String },
/// A numerical computation failed; details are in `reason`.
#[error("{reason}")]
NumericalFailure { reason: String },
/// A required relationship between inputs does not hold (in this file:
/// duplicate block names, unpaired stacked design/offset, non-finite or
/// inconsistent log-precisions).
#[error("{reason}")]
ConstraintViolation { reason: String },
/// The requested configuration is not supported (in this file: fitting with
/// no parameter blocks).
#[error("{reason}")]
UnsupportedConfiguration { reason: String },
/// A basis decomposition failed; details are in `reason`.
#[error("{reason}")]
BasisDecompositionFailed { reason: String },
/// Pre-fit cross-block identifiability audit refused the fit. The
/// joint design across `ParameterBlockSpec`s carries a rank
/// deficiency that the post-`joint_null_rotation` absorption did
/// not resolve: two or more blocks contribute the same direction,
/// or a structural >2-way alias was detected without per-pair
/// attribution. The full `IdentifiabilityAudit` is held so
/// consumers (logs, structured-error sinks, the seed driver's
/// classifier) can extract the alias pairs and the summary string
/// without reparsing.
#[error("identifiability audit refused the fit: {}", audit.summary)]
IdentifiabilityFailure {
audit: crate::solver::identifiability_audit::IdentifiabilityAudit,
},
/// MAP estimate uniqueness condition `ker(J^T W J) ∩ ker(S) = {0}` is
/// violated. A null direction of `J^T W J` carries zero penalty
/// curvature, so the posterior is flat along that direction and the
/// MAP is non-unique. The structured [`MapUniquenessError`] names the
/// dominant block so the caller can add the missing penalty or remove
/// the unpenalised direction.
#[error("MAP estimate non-unique: {}", error)]
MapUniquenessFailure {
error: crate::solver::identifiability_audit::MapUniquenessError,
},
}
impl From<String> for CustomFamilyError {
fn from(value: String) -> Self {
Self::InvalidInput {
context: "custom-family string boundary",
reason: value,
}
}
}
impl From<CustomFamilyError> for String {
    /// Render the error through its `Display` implementation.
    fn from(value: CustomFamilyError) -> Self {
        ToString::to_string(&value)
    }
}
/// Fit-entry validation of the parameter block specs.
///
/// `fit_custom_family` needs at least one block — an empty model has nothing
/// to estimate — so an empty slice is rejected here as a *fit-level
/// precondition*. Everything about the specs' own consistency is delegated to
/// `validate_blockspec_consistency`, whose per-block penalty counts are
/// returned unchanged.
pub(crate) fn validate_blockspecs(specs: &[ParameterBlockSpec]) -> Result<Vec<usize>, String> {
    if !specs.is_empty() {
        return validate_blockspec_consistency(specs);
    }
    let reason = "fit_custom_family requires at least one parameter block".to_string();
    Err(String::from(CustomFamilyError::UnsupportedConfiguration {
        reason,
    }))
}
/// Check that a slice of parameter block specs is *internally consistent*
/// without imposing the fit-level "at least one block" precondition.
///
/// Checks run in this order: block names are unique across the slice; then,
/// block by block, the offset length equals the design row count,
/// `stacked_design` / `stacked_offset` are both present or both absent (and,
/// when present, agree in rows and match the design's column count),
/// `initial_beta` (if any) has one entry per design column,
/// `initial_log_lambdas` has one entry per penalty, and every penalty is
/// square with the design's column count. The first failure is returned.
///
/// On success the number of penalties of each block is returned in block
/// order. An empty slice is vacuously consistent and yields an empty vector.
/// The non-empty fit precondition lives in [`validate_blockspecs`]; pure
/// operator-materialization hooks (e.g. `batched_outer_hessian_terms`) must
/// use this check instead, so they can be probed with an empty,
/// self-consistent argument set without tripping a precondition that does not
/// apply to them.
pub(crate) fn validate_blockspec_consistency(
    specs: &[ParameterBlockSpec],
) -> Result<Vec<usize>, String> {
    let dimension_mismatch =
        |reason: String| -> String { CustomFamilyError::DimensionMismatch { reason }.into() };
    let constraint_violation =
        |reason: String| -> String { CustomFamilyError::ConstraintViolation { reason }.into() };

    // Pass 1: names must be unique across all blocks.
    let mut index_by_name = BTreeMap::<String, usize>::new();
    for (b, spec) in specs.iter().enumerate() {
        if let Some(prev) = index_by_name.insert(spec.name.clone(), b) {
            return Err(constraint_violation(format!(
                "duplicate parameter block name '{}' at indices {prev} and {b}: block names must be unique so coefficient labels resolved by name are unambiguous",
                spec.name
            )));
        }
    }

    // Pass 2: per-block dimension checks; collecting stops at the first error.
    specs
        .iter()
        .enumerate()
        .map(|(b, spec)| -> Result<usize, String> {
            let n = spec.design.nrows();
            let p = spec.design.ncols();

            let offset_len = spec.offset.len();
            if offset_len != n {
                return Err(dimension_mismatch(format!(
                    "block {b} offset length mismatch: got {offset_len}, expected {n}"
                )));
            }

            // The stacked design and offset form a matched pair: either both
            // are supplied and agree, or neither is.
            match (&spec.stacked_design, &spec.stacked_offset) {
                (None, None) => {}
                (Some(sd), Some(so)) => {
                    let (stacked_rows, stacked_offset_len) = (sd.nrows(), so.len());
                    if stacked_rows != stacked_offset_len {
                        return Err(dimension_mismatch(format!(
                            "block {b} stacked_design/stacked_offset row mismatch: stacked_design.nrows()={stacked_rows}, stacked_offset.len()={stacked_offset_len}"
                        )));
                    }
                    let stacked_cols = sd.ncols();
                    if stacked_cols != p {
                        return Err(dimension_mismatch(format!(
                            "block {b} stacked_design column count {stacked_cols} disagrees with design column count {p}"
                        )));
                    }
                }
                _ => {
                    return Err(constraint_violation(format!(
                        "block {b} stacked_design and stacked_offset must be Some together or both None"
                    )));
                }
            }

            if let Some(beta0) = &spec.initial_beta {
                if beta0.len() != p {
                    return Err(dimension_mismatch(format!(
                        "block {b} initial_beta length mismatch: got {}, expected {p}",
                        beta0.len()
                    )));
                }
            }

            let penalty_count = spec.penalties.len();
            let lambda_count = spec.initial_log_lambdas.len();
            if lambda_count != penalty_count {
                return Err(dimension_mismatch(format!(
                    "block {b} initial_log_lambdas length {lambda_count} does not match penalties {penalty_count}"
                )));
            }

            for (k, penalty) in spec.penalties.iter().enumerate() {
                let (r, c) = penalty.shape();
                if !(r == p && c == p) {
                    return Err(dimension_mismatch(format!(
                        "block {b} penalty {k} must be {p}x{p}, got {r}x{c}"
                    )));
                }
            }

            Ok(spec.penalties.len())
        })
        .collect()
}
/// Run `f` against the design matrix and offset that apply to one block.
///
/// When the family reports static geometry, `f` receives the spec's solver
/// design and offset. When geometry is dynamic, the family's `block_geometry`
/// is evaluated at the current `block_states` and its output is checked
/// against the spec before `f` is called: the row count and offset length
/// must equal the solver design's row count, and the column count must equal
/// the spec design's column count.
fn with_block_geometry<F: CustomFamily + ?Sized, T>(
    family: &F,
    block_states: &[ParameterBlockState],
    spec: &ParameterBlockSpec,
    block_idx: usize,
    f: impl FnOnce(&DesignMatrix, &Array1<f64>) -> Result<T, String>,
) -> Result<T, String> {
    if !family.block_geometry_is_dynamic() {
        return f(spec.solver_design(), spec.solver_offset());
    }

    let (x_dyn, off_dyn) = family.block_geometry(block_states, spec)?;
    let expected_rows = spec.solver_design().nrows();
    let mismatch = |what: &str, got: usize, expected: usize| -> String {
        CustomFamilyError::DimensionMismatch {
            reason: format!(
                "block {block_idx} dynamic {what} mismatch: got {got}, expected {expected}"
            ),
        }
        .into()
    };

    if x_dyn.nrows() != expected_rows {
        return Err(mismatch("design row", x_dyn.nrows(), expected_rows));
    }
    if x_dyn.ncols() != spec.design.ncols() {
        return Err(mismatch("design col", x_dyn.ncols(), spec.design.ncols()));
    }
    if off_dyn.len() != expected_rows {
        return Err(mismatch("offset length", off_dyn.len(), expected_rows));
    }
    f(&x_dyn, &off_dyn)
}
/// Concatenate every block's `initial_log_lambdas` into one flat vector, in
/// block order.
fn flatten_log_lambdas(specs: &[ParameterBlockSpec]) -> Array1<f64> {
    let total: usize = specs
        .iter()
        .map(|spec| spec.initial_log_lambdas.len())
        .sum();
    let mut flat = Array1::<f64>::zeros(total);
    let mut start = 0usize;
    for spec in specs {
        let end = start + spec.initial_log_lambdas.len();
        // Blocks without penalties contribute nothing and are skipped.
        if end > start {
            flat.slice_mut(ndarray::s![start..end])
                .assign(&spec.initial_log_lambdas);
        }
        start = end;
    }
    flat
}
/// Mapping between physical penalties (one per penalty matrix, in block order)
/// and the outer log-precision coordinates that are actually optimized.
///
/// Built by `penalty_label_layout`: penalties sharing a precision label share
/// one outer coordinate, and penalties with a fixed log-lambda have none.
#[derive(Clone, Debug)]
struct PenaltyLabelLayout {
/// Number of penalties per block, as handed to `penalty_label_layout`; used
/// to split an expanded physical vector back into per-block pieces.
penalty_counts: Vec<usize>,
/// One entry per physical penalty: `Some(i)` is an index into `initial_rho`,
/// `None` marks a penalty whose log-lambda is fixed.
physical_to_outer: Vec<Option<usize>>,
/// Parallel to `physical_to_outer`: `Some(value)` for a fixed penalty,
/// `None` for a penalty driven by an outer coordinate.
fixed_log_lambdas: Vec<Option<f64>>,
/// Starting value of each outer coordinate, taken from the first penalty
/// seen with that label.
initial_rho: Array1<f64>,
}
impl PenaltyLabelLayout {
    /// Number of physical penalties covered by the layout.
    fn physical_count(&self) -> usize {
        self.physical_to_outer.len()
    }

    /// Whether the outer coordinate vector differs in length from the
    /// physical one, i.e. whether an expansion between the two is needed.
    fn has_tied_coordinates(&self) -> bool {
        self.physical_count() != self.initial_rho.len()
    }
}
/// Build the physical-to-outer layout of all penalties in `specs`.
///
/// Penalties are visited in block order. A penalty with a fixed log-lambda
/// gets no outer coordinate and records its (finite) fixed value. Any other
/// penalty is keyed by its precision label, or by a synthetic per-penalty
/// label when it has none; the first penalty with a given label creates an
/// outer coordinate seeded with its initial log-lambda, and later penalties
/// with that label reuse it. Finite initial values that disagree by more than
/// `1e-10` under one label are rejected.
fn penalty_label_layout(
    specs: &[ParameterBlockSpec],
    penalty_counts: Vec<usize>,
) -> Result<PenaltyLabelLayout, String> {
    let mut outer_by_label = BTreeMap::<String, usize>::new();
    let mut physical_to_outer = Vec::<Option<usize>>::new();
    let mut fixed_log_lambdas = Vec::<Option<f64>>::new();
    let mut initial = Vec::<f64>::new();

    for (block_idx, spec) in specs.iter().enumerate() {
        for (penalty_idx, penalty) in spec.penalties.iter().enumerate() {
            if let Some(fixed) = penalty.fixed_log_lambda() {
                if !fixed.is_finite() {
                    return Err(String::from(CustomFamilyError::ConstraintViolation {
                        reason: format!(
                            "block {block_idx} penalty {penalty_idx} fixed log-precision is non-finite: {fixed}"
                        ),
                    }));
                }
                physical_to_outer.push(None);
                fixed_log_lambdas.push(Some(fixed));
                continue;
            }

            let label = match penalty.precision_label() {
                Some(name) => name.to_owned(),
                None => format!("__block_{block_idx}_penalty_{penalty_idx}"),
            };
            let rho0 = spec.initial_log_lambdas[penalty_idx];
            let outer = match outer_by_label.get(&label).copied() {
                Some(existing) => {
                    let first = initial[existing];
                    let both_finite = first.is_finite() && rho0.is_finite();
                    if both_finite && (first - rho0).abs() > 1e-10 {
                        return Err(String::from(CustomFamilyError::ConstraintViolation {
                            reason: format!(
                                "precision label '{label}' has inconsistent initial log-precisions: {first} and {rho0}"
                            ),
                        }));
                    }
                    existing
                }
                None => {
                    let fresh = initial.len();
                    outer_by_label.insert(label, fresh);
                    initial.push(rho0);
                    fresh
                }
            };
            physical_to_outer.push(Some(outer));
            fixed_log_lambdas.push(None);
        }
    }

    Ok(PenaltyLabelLayout {
        penalty_counts,
        physical_to_outer,
        fixed_log_lambdas,
        initial_rho: Array1::from_vec(initial),
    })
}
/// Expand an outer coordinate vector into one log-lambda per physical penalty.
///
/// Slots mapped to an outer coordinate copy `rho` at that coordinate; the
/// remaining slots take the fixed value stored in the layout. Fails when `rho`
/// does not have one entry per outer coordinate, or when an unmapped slot has
/// no fixed value recorded.
fn expand_labeled_log_lambdas(
    rho: &Array1<f64>,
    layout: &PenaltyLabelLayout,
) -> Result<Array1<f64>, String> {
    let expected = layout.initial_rho.len();
    if rho.len() != expected {
        return Err(String::from(CustomFamilyError::DimensionMismatch {
            reason: format!(
                "log-lambda label coordinate mismatch: got {}, expected {}",
                rho.len(),
                expected
            ),
        }));
    }
    let mut expanded = Array1::<f64>::zeros(layout.physical_count());
    for (physical, outer) in layout.physical_to_outer.iter().enumerate() {
        let value = if let Some(outer) = *outer {
            rho[outer]
        } else if let Some(fixed) = layout.fixed_log_lambdas[physical] {
            fixed
        } else {
            return Err(String::from(CustomFamilyError::ConstraintViolation {
                reason: format!("fixed penalty layout missing value at physical slot {physical}"),
            }));
        };
        expanded[physical] = value;
    }
    Ok(expanded)
}
/// Expand outer coordinates to physical log-lambdas and split them into one
/// vector per block according to the layout's penalty counts.
fn split_labeled_log_lambdas(
    rho: &Array1<f64>,
    layout: &PenaltyLabelLayout,
) -> Result<Vec<Array1<f64>>, String> {
    expand_labeled_log_lambdas(rho, layout)
        .and_then(|physical| split_log_lambdas(&physical, &layout.penalty_counts))
}
/// Pull a physical-coordinate gradient back to outer coordinates by summing
/// the entries of all physical slots that share an outer coordinate. Slots
/// without an outer coordinate are dropped.
fn aggregate_labeled_gradient(
    gradient: &Array1<f64>,
    layout: &PenaltyLabelLayout,
) -> Result<Array1<f64>, String> {
    let expected = layout.physical_count();
    if gradient.len() != expected {
        return Err(String::from(CustomFamilyError::DimensionMismatch {
            reason: format!(
                "physical gradient length mismatch: got {}, expected {}",
                gradient.len(),
                expected
            ),
        }));
    }
    let mut pulled_back = Array1::<f64>::zeros(layout.initial_rho.len());
    for (physical, target) in layout.physical_to_outer.iter().enumerate() {
        let Some(outer) = *target else { continue };
        pulled_back[outer] += gradient[physical];
    }
    Ok(pulled_back)
}
/// Pull a physical-coordinate Hessian back to outer coordinates: entry
/// `(i, j)` is added to the outer entry addressed by the outer coordinates of
/// slots `i` and `j`. Rows and columns of slots without an outer coordinate
/// are dropped.
fn aggregate_labeled_hessian(
    hessian: &Array2<f64>,
    layout: &PenaltyLabelLayout,
) -> Result<Array2<f64>, String> {
    let physical = layout.physical_count();
    if !(hessian.nrows() == physical && hessian.ncols() == physical) {
        return Err(String::from(CustomFamilyError::DimensionMismatch {
            reason: format!(
                "physical Hessian shape mismatch: got {}x{}, expected {}x{}",
                hessian.nrows(),
                hessian.ncols(),
                physical,
                physical
            ),
        }));
    }
    let outer_len = layout.initial_rho.len();
    let mut pulled_back = Array2::<f64>::zeros((outer_len, outer_len));
    // (physical slot, outer coordinate) for every mapped slot, in slot order so
    // the accumulation order matches a plain row-major double loop.
    let mapped: Vec<(usize, usize)> = layout
        .physical_to_outer
        .iter()
        .enumerate()
        .filter_map(|(slot, target)| target.map(|outer| (slot, outer)))
        .collect();
    for &(i, oi) in &mapped {
        for &(j, oj) in &mapped {
            pulled_back[[oi, oj]] += hessian[[i, j]];
        }
    }
    Ok(pulled_back)
}
/// Evaluate the rho prior through the shared
/// [`rho_prior_eval`](crate::solver::estimate::reml::rho_prior_eval) engine
/// under the `HardError` invalid-prior policy.
///
/// Returns `(cost, gradient, hessian)` as produced by the engine. The prior
/// math is shared with the REML/LAML runtime; a malformed prior is reported
/// as the matching structured [`CustomFamilyError`] variant (rendered to a
/// string) instead of being folded into the objective.
fn rho_prior_cost_gradient_hessian(
    prior: &crate::types::RhoPrior,
    rho: &Array1<f64>,
) -> Result<(f64, Array1<f64>, Option<Array2<f64>>), String> {
    use crate::solver::estimate::reml::rho_prior_eval::{InvalidPriorPolicy, RhoPriorError};
    let evaluated = crate::solver::estimate::reml::rho_prior_eval::evaluate(
        prior,
        rho,
        InvalidPriorPolicy::HardError,
    );
    let eval = evaluated.map_err(|err| -> String {
        match err {
            RhoPriorError::DimensionMismatch { reason } => {
                CustomFamilyError::DimensionMismatch { reason }.into()
            }
            RhoPriorError::ConstraintViolation { reason } => {
                CustomFamilyError::ConstraintViolation { reason }.into()
            }
        }
    })?;
    Ok((eval.cost, eval.gradient, eval.hessian))
}
/// Add the rho prior, evaluated in labeled (outer) coordinates, to an outer
/// evaluation that has already been pulled back to those coordinates.
///
/// The cost is always added; the gradient is added unless `eval_mode` is
/// `ValueOnly`; the prior Hessian, when the prior provides one, is added only
/// in `ValueGradientHessian` mode. A flat prior leaves `result` untouched.
fn add_labeled_rho_prior_to_outer_eval(
    mut result: OuterObjectiveEvalResult,
    rho: &Array1<f64>,
    rho_prior: &crate::types::RhoPrior,
    eval_mode: EvalMode,
) -> Result<OuterObjectiveEvalResult, String> {
    // With tied physical penalties the likelihood/LAML part is evaluated in
    // expanded physical coordinates and then pulled back to the user-facing
    // labels. The prior is defined on the labeled precision itself, so it
    // enters exactly once, after the pullback:
    //
    //   V_label(rho) = V_base(E rho) + pi(rho),
    //   ∇V_label     = E' ∇V_base(E rho) + ∇pi(rho),
    //   ∇²V_label    = E' ∇²V_base(E rho) E + ∇²pi(rho),
    //
    // with E mapping each physical penalty piece to its outer label — the same
    // change of variables used for overlapping/nested group penalties. The
    // prior is not repeated per physical child component.
    if let crate::types::RhoPrior::Flat = rho_prior {
        return Ok(result);
    }
    let (prior_cost, prior_gradient, prior_hessian) =
        rho_prior_cost_gradient_hessian(rho_prior, rho)?;
    result.objective += prior_cost;

    let wants_gradient = eval_mode != EvalMode::ValueOnly;
    if wants_gradient {
        let (got, expected) = (prior_gradient.len(), result.gradient.len());
        if expected != got {
            return Err(String::from(CustomFamilyError::DimensionMismatch {
                reason: format!("rho prior gradient length mismatch: got {got}, expected {expected}"),
            }));
        }
        result.gradient += &prior_gradient;
    }

    if eval_mode == EvalMode::ValueGradientHessian {
        if let Some(dense) = prior_hessian {
            result.outer_hessian.add_rho_block_dense(&dense)?;
        }
    }
    Ok(result)
}
/// Produce a copy of the warm start whose `rho` is the expanded physical
/// vector, for use by code that works in physical coordinates.
///
/// Returns `None` when the layout needs no expansion (outer and physical
/// vectors have the same length) or when there is no warm start.
fn physical_warm_start_for_labeled(
    warm_start: Option<&ConstrainedWarmStart>,
    physical_rho: &Array1<f64>,
    layout: &PenaltyLabelLayout,
) -> Option<ConstrainedWarmStart> {
    match warm_start {
        Some(seed) if layout.has_tied_coordinates() => {
            let mut physical_seed = seed.clone();
            physical_seed.rho = physical_rho.clone();
            Some(physical_seed)
        }
        _ => None,
    }
}
/// Convert an outer evaluation computed in physical coordinates into labeled
/// (outer) coordinates and add the rho prior.
///
/// The gradient is aggregated per label (or replaced by zeros of outer length
/// in `ValueOnly` mode). In `ValueGradientHessian` mode a dense Hessian is
/// aggregated, an operator Hessian is wrapped in
/// `LabeledOuterHessianOperator`, and an unavailable Hessian stays
/// unavailable. The warm start's `rho` is reset to the labeled `rho`.
fn pullback_labeled_outer_eval(
    mut result: OuterObjectiveEvalResult,
    rho: &Array1<f64>,
    layout: &PenaltyLabelLayout,
    rho_prior: &crate::types::RhoPrior,
    eval_mode: EvalMode,
) -> Result<OuterObjectiveEvalResult, String> {
    result.gradient = if eval_mode != EvalMode::ValueOnly {
        aggregate_labeled_gradient(&result.gradient, layout)?
    } else {
        Array1::<f64>::zeros(layout.initial_rho.len())
    };

    if eval_mode == EvalMode::ValueGradientHessian {
        let pulled_back = match result.outer_hessian {
            crate::solver::outer_strategy::HessianResult::Unavailable => {
                crate::solver::outer_strategy::HessianResult::Unavailable
            }
            crate::solver::outer_strategy::HessianResult::Operator(operator) => {
                let labeled = LabeledOuterHessianOperator::new(operator, layout);
                crate::solver::outer_strategy::HessianResult::Operator(Arc::new(labeled))
            }
            crate::solver::outer_strategy::HessianResult::Analytic(hessian) => {
                let labeled = aggregate_labeled_hessian(&hessian, layout)?;
                crate::solver::outer_strategy::HessianResult::Analytic(labeled)
            }
        };
        result.outer_hessian = pulled_back;
    }

    result.warm_start.rho = rho.clone();
    add_labeled_rho_prior_to_outer_eval(result, rho, rho_prior, eval_mode)
}
/// Evaluate the outer objective (and, per `eval_mode`, its derivatives) in
/// labeled coordinates.
///
/// `rho` is expanded to physical log-lambdas, the internal evaluator is run
/// there with a flat prior, and the result is pulled back to labeled
/// coordinates, where `rho_prior` is added. The inner solve is seeded with
/// the physical copy of the warm start when one is produced, otherwise with
/// the caller's warm start as given.
fn outerobjectivegradienthessian_labeled<F: CustomFamily + Clone + Send + Sync + 'static>(
    family: &F,
    specs: &[ParameterBlockSpec],
    options: &BlockwiseFitOptions,
    layout: &PenaltyLabelLayout,
    rho: &Array1<f64>,
    warm_start: Option<&ConstrainedWarmStart>,
    rho_prior: &crate::types::RhoPrior,
    eval_mode: EvalMode,
) -> Result<OuterObjectiveEvalResult, String> {
    let physical_rho = expand_labeled_log_lambdas(rho, layout)?;
    let physical_seed = physical_warm_start_for_labeled(warm_start, &physical_rho, layout);
    let seed = match physical_seed.as_ref() {
        Some(physical) => Some(physical),
        None => warm_start,
    };
    let physical_eval = outerobjectivegradienthessian_internal(
        family,
        specs,
        options,
        &layout.penalty_counts,
        &physical_rho,
        seed,
        crate::types::RhoPrior::Flat,
        eval_mode,
    )?;
    pullback_labeled_outer_eval(physical_eval, rho, layout, rho_prior, eval_mode)
}
/// Cheap score used to rank candidate seeds in labeled coordinates.
///
/// Runs one inner blockwise fit at the expanded physical log-lambdas with
/// `seed_screening` switched on, refreshes the block linear predictors, and
/// returns the inner penalized objective plus the prior cost, a warm start
/// carrying the labeled `rho`, and the inner convergence flag.
fn custom_family_seed_screening_proxy_labeled<F: CustomFamily + Clone + Send + Sync + 'static>(
    family: &F,
    specs: &[ParameterBlockSpec],
    options: &BlockwiseFitOptions,
    layout: &PenaltyLabelLayout,
    rho: &Array1<f64>,
    warm_start: Option<&ConstrainedWarmStart>,
    rho_prior: &crate::types::RhoPrior,
) -> Result<(f64, ConstrainedWarmStart, bool), String> {
    let physical_rho = expand_labeled_log_lambdas(rho, layout)?;
    let per_block = split_log_lambdas(&physical_rho, &layout.penalty_counts)?;
    let physical_seed = physical_warm_start_for_labeled(warm_start, &physical_rho, layout);
    let inner_seed = physical_seed.as_ref().or(warm_start);

    // Screening only RANKS candidate seeds by penalized objective; it is capped
    // and never yields the final fit. Flagging the inner solve as a screening
    // solve lets it skip the O(p · per-axis-Hdot) full Jeffreys/Firth curvature
    // loop and keep only the cheap value-only Jeffreys term in the score
    // (gam#729/#808). For a K-block coupled family (Dirichlet/multinomial) each
    // per-axis directional derivative is O(K²·n·p), so paying the full term for
    // every cascade candidate over the joint width is the wrong cost class and
    // made the coupled fit non-completing in screening alone. The real fit,
    // once a seed is selected, runs with `seed_screening = false`, so the
    // load-bearing Firth curvature is fully present where it matters.
    let mut screening_options = options.clone();
    screening_options.seed_screening = true;

    let mut inner = inner_blockwise_fit(
        family,
        specs,
        &per_block,
        &screening_options,
        inner_seed,
    )?;
    refresh_all_block_etas(family, specs, &mut inner.block_states)?;

    let (prior_cost, _, _) = rho_prior_cost_gradient_hessian(rho_prior, rho)?;
    let fit_score = inner_penalized_objective(
        &inner,
        include_exact_newton_logdet_h(family, options),
        include_exact_newton_logdet_s(family, options),
        "custom-family labeled seed-screening proxy",
    )?;
    let score = fit_score + prior_cost;

    let warm = ConstrainedWarmStart {
        rho: rho.clone(),
        block_beta: inner
            .block_states
            .iter()
            .map(|state| state.beta.clone())
            .collect(),
        active_sets: inner.active_sets.clone(),
        cached_inner: Some(cached_inner_mode_from_result(&inner)),
    };
    Ok((score, warm, inner.converged))
}
/// Cut a flat log-lambda vector into consecutive owned pieces, one per block,
/// with lengths given by `penalty_counts`.
///
/// Fails when `flat` does not have exactly `sum(penalty_counts)` entries.
fn split_log_lambdas(
    flat: &Array1<f64>,
    penalty_counts: &[usize],
) -> Result<Vec<Array1<f64>>, String> {
    let expected: usize = penalty_counts.iter().sum();
    if flat.len() != expected {
        return Err(String::from(CustomFamilyError::DimensionMismatch {
            reason: format!(
                "log-lambda length mismatch: got {}, expected {expected}",
                flat.len()
            ),
        }));
    }
    let mut start = 0usize;
    let pieces = penalty_counts
        .iter()
        .map(|&count| {
            let end = start + count;
            let piece = flat.slice(ndarray::s![start..end]).to_owned();
            start = end;
            piece
        })
        .collect();
    Ok(pieces)
}
/// Build the initial per-block states (coefficients and linear predictor).
///
/// Each block starts from its `initial_beta`, or from zeros when none is
/// given, and its `eta` is computed from the block geometry evaluated against
/// the states built so far. Every coefficient vector is then passed through
/// the family's `post_update_block_beta` hook.
fn buildblock_states<F: CustomFamily + Clone + Send + Sync + 'static>(
    family: &F,
    specs: &[ParameterBlockSpec],
) -> Result<Vec<ParameterBlockState>, String> {
    let mut states = Vec::with_capacity(specs.len());
    for (b, spec) in specs.iter().enumerate() {
        let beta = match &spec.initial_beta {
            Some(beta0) => beta0.clone(),
            None => Array1::<f64>::zeros(spec.design.ncols()),
        };
        let eta = with_block_geometry(family, &states, spec, b, |x, off| {
            Ok(x.matrixvectormultiply(&beta) + off)
        })?;
        states.push(ParameterBlockState { beta, eta });
    }

    // Second pass: project every β through `post_update_block_beta` so that
    // "every `states[b].beta` in `inner_blockwise_fit` is feasible" holds from
    // the very first evaluation — the same projection the warm-start seed path
    // applies on entry. The projection is deferred to a separate pass because
    // some family overrides (e.g.
    // `SurvivalMarginalSlopeFamily::post_update_block_beta`) read
    // `block_states[block_idx]` while projecting, and that entry only exists
    // once the first pass has pushed all states.
    //
    // Without it, a caller that passes `initial_beta = Some(infeasible)` — or
    // leaves it `None` for a family whose zero vector violates its bounds —
    // would feed an infeasible β into `exact_newton_joint_hessian` / `evaluate`
    // before the first line-search trial, silently corrupting the fit or
    // tripping `max_feasible_step_size` guards on iteration 1.
    for (b, spec) in specs.iter().enumerate() {
        let unprojected = states[b].beta.clone();
        let projected = family.post_update_block_beta(&states, b, spec, unprojected)?;
        states[b].beta.assign(&projected);
    }

    // η is intentionally not recomputed here: the caller
    // (`inner_blockwise_fit`) runs `refresh_all_block_etas` right after this
    // returns, so η is rebuilt against the projected β before any family
    // evaluation.
    Ok(states)
}
/// Recompute `eta` for every block from its current `beta`.
///
/// With static geometry each block's `eta` is its solver design times `beta`
/// plus its solver offset; these products are computed in parallel and then
/// stored. With dynamic geometry the blocks are refreshed one at a time, in
/// order, through `refresh_single_block_eta`.
fn refresh_all_block_etas<F: CustomFamily + Clone + Send + Sync + 'static>(
    family: &F,
    specs: &[ParameterBlockSpec],
    states: &mut [ParameterBlockState],
) -> Result<(), String> {
    use rayon::iter::{IntoParallelIterator, ParallelIterator};

    let block_count = specs.len();
    if !family.block_geometry_is_dynamic() {
        let etas: Vec<Array1<f64>> = (0..block_count)
            .into_par_iter()
            .map(|b| {
                let spec = &specs[b];
                let fitted = spec.solver_design().matrixvectormultiply(&states[b].beta);
                fitted + spec.solver_offset()
            })
            .collect();
        // `etas` has one entry per spec, and each of those indices was already
        // used to read `states[b]` above.
        for (b, eta) in etas.into_iter().enumerate() {
            states[b].eta = eta;
        }
        return Ok(());
    }

    for b in 0..block_count {
        refresh_single_block_eta(family, specs, states, b)?;
    }
    Ok(())
}
/// Recompute `eta` for one block as design times `beta` plus offset, using
/// the block geometry supplied by `with_block_geometry` at the current states.
fn refresh_single_block_eta<F: CustomFamily + Clone + Send + Sync + 'static>(
    family: &F,
    specs: &[ParameterBlockSpec],
    states: &mut [ParameterBlockState],
    block_idx: usize,
) -> Result<(), String> {
    let spec = &specs[block_idx];
    // Cloned so the closure does not hold a borrow of `states`.
    let beta = states[block_idx].beta.clone();
    let refreshed = with_block_geometry(family, states, spec, block_idx, |x, off| {
        let fitted = x.matrixvectormultiply(&beta);
        Ok(fitted + off)
    })?;
    states[block_idx].eta = refreshed;
    Ok(())
}
/// Inner-iteration budget: `base_cycles` reduced by whichever of the two
/// optional caps in `options` are set to a positive value, and never below 1.
///
/// A stored cap of `0` means "no cap" for both sources. For the outer cap this
/// is the `SEED_SCREENING_UNCAPPED` sentinel: the outer bridges store it for
/// the line-search COST probe so the deciding cost is the true
/// converged-inner envelope objective the analytic gradient differentiates
/// (gam#787/#808). Treating that `0` as a real cap would collapse the probe to
/// a single inner cycle, guaranteeing a non-converged inner solve and a
/// spurious `∞` cost — the frozen-|g| outer stall the uncap was meant to
/// remove.
#[inline]
fn capped_inner_max_cycles(options: &BlockwiseFitOptions, base_cycles: usize) -> usize {
    let screening_cap = options
        .screening_max_inner_iterations
        .as_ref()
        .map(|limit| limit.load(Ordering::Relaxed));
    let outer_cap = options
        .outer_inner_max_iterations
        .as_ref()
        .map(|limit| limit.load(Ordering::Relaxed));

    [screening_cap, outer_cap]
        .into_iter()
        .flatten()
        // `0` is the "uncapped" sentinel, not a limit.
        .filter(|&limit| limit > 0)
        .fold(base_cycles, |cap, limit| cap.min(limit))
        .max(1)
}
/// Assemble the pieces of a weighted normal-equation system for design `x`.
///
/// Returns the matrix produced by `x.xt_diag_x_signed_op` for the weights `w`
/// and, when `y_star` is supplied, the vector produced by
/// `x.compute_xtwy(w, y_star)`; the second component is `None` otherwise.
///
/// # Errors
///
/// Returns a `DimensionMismatch` message when `w` or `y_star` does not have
/// one entry per design row; the message reports both sizes. Errors from the
/// two design-matrix operations are propagated.
fn weighted_normal_equations(
    x: &DesignMatrix,
    w: &Array1<f64>,
    y_star: Option<&Array1<f64>>,
) -> Result<(Array2<f64>, Option<Array1<f64>>), String> {
    let n = x.nrows();
    if w.len() != n {
        return Err(CustomFamilyError::DimensionMismatch {
            reason: format!(
                "weighted normal-equation dimension mismatch: weights length {}, design rows {n}",
                w.len()
            ),
        }
        .into());
    }
    if let Some(y) = y_star {
        if y.len() != n {
            return Err(CustomFamilyError::DimensionMismatch {
                reason: format!(
                    "weighted RHS dimension mismatch: response length {}, design rows {n}",
                    y.len()
                ),
            }
            .into());
        }
    }
    let xtwx = x.xt_diag_x_signed_op(SignedWeightsView::from_array(w))?;
    // Only form the right-hand side when a response was supplied.
    let xtwy = y_star.map(|y| x.compute_xtwy(w, y)).transpose()?;
    Ok((xtwx, xtwy))
}
/// Diagonal shift that makes the penalized joint Hessian Cholesky-factorable
/// (i.e. positive definite at the solver floor), or `None` when the matrix is
/// already PD and needs no shift. The shift is a conservative bound, not
/// necessarily the smallest one that works.
///
/// PERF (gam#729/#826): the stabilizing shift is recomputed every inner Newton
/// cycle. For a coupled K-block family (Dirichlet/multinomial) the joint Hessian
/// is structurally near-singular along the cross-block gauge / sum-to-zero null
/// space, so a shift fires on (almost) every cycle. The previous implementation
/// ran a full dense self-adjoint eigendecomposition (`O(p³)`, all eigenpairs)
/// just to read `min_eval` — the dominant per-cycle cost on the coupled inner
/// solve. We only need a PD CERTIFICATE plus a lifting ridge: a plain Cholesky
/// succeeds in one shot on a well-conditioned cycle (no shift), and on the
/// near-singular cycles a single `O(p²)` Gershgorin lower bound on `λ_min`
/// gives a shift that lifts the matrix to the floor — no eigendecomposition and
/// no repeated Cholesky retries. The resulting shift makes `H_pen + δI` PD,
/// which is exactly what the downstream solve requires.
fn exact_newton_stabilizing_shift(lhs_dense: &Array2<f64>, ridge_floor: f64) -> Option<f64> {
let floor = effective_solverridge(ridge_floor);
// Fast path: already PD at zero shift ⇒ no stabilization needed. One Cholesky
// (O(p³/3)), the common case on a well-conditioned cycle.
if lhs_dense.cholesky(Side::Lower).is_ok() {
return None;
}
// Near-singular / indefinite. We need a positive diagonal shift `δ` that makes
// `H + δI` PD. A full eigendecomposition (the previous implementation) reads
// the exact `λ_min` but costs `O(p³)` for ALL eigenpairs EVERY inner cycle;
// for a coupled K-block family the shift fires almost every cycle, so that
// dominated the inner solve (gam#729/#826). A Cholesky-escalation search is
// even worse on a hard-near-singular block (many `O(p³)` Cholesky retries).
//
// Use the Gershgorin lower bound on `λ_min` instead — a single `O(p²)` pass,
// no iteration: every eigenvalue lies in some disc
// `[H_ii − R_i, H_ii + R_i]` with `R_i = Σ_{j≠i} |H_ij|`, so
// `λ_min ≥ min_i (H_ii − R_i) =: g`. Shifting by `δ = floor − g` (when `g`
// is at/below the floor) guarantees `λ_min(H + δI) = λ_min + δ ≥ floor > 0`,
// i.e. `H + δI` is PD. The bound is conservative (δ may be larger than the
// exact eigh shift), but it is self-vanishing in the well-conditioned regime
// (handled by the Cholesky fast path above) and the downstream solve only
// requires PD, not the tightest possible shift — and the trust region governs
// step size regardless. `O(p²)` per cycle instead of `O(p³)`.
let p = lhs_dense.nrows();
let mut gershgorin_min = f64::INFINITY;
for i in 0..p {
let diag = lhs_dense[[i, i]];
let mut radius = 0.0_f64;
for j in 0..p {
if j != i {
radius += lhs_dense[[i, j]].abs();
}
}
gershgorin_min = gershgorin_min.min(diag - radius);
}
if !gershgorin_min.is_finite() {
let diag_max = (0..p)
.map(|d| lhs_dense[[d, d]].abs())
.fold(0.0_f64, f64::max);
return Some(floor.max(diag_max * 1e-6).max(1e-6));
}
if gershgorin_min >= floor {
// Gershgorin certifies PD-at-floor but the no-shift Cholesky failed
// (round-off on a barely-PD matrix): a floor-sized shift suffices.
return Some(floor);
}
Some(floor - gershgorin_min)
}
/// Add a stabilizing ridge to the diagonal of `lhs_dense` in place.
///
/// Nothing happens when `use_exact_newton_strict_spd(family)` is true, or when
/// `exact_newton_stabilizing_shift` reports that no shift is needed. Otherwise
/// the reported shift is added to every diagonal entry.
fn stabilize_exact_newton_lhs_in_place<F: CustomFamily + ?Sized>(
    family: &F,
    lhs_dense: &mut Array2<f64>,
    ridge_floor: f64,
) {
    if use_exact_newton_strict_spd(family) {
        return;
    }
    let Some(shift) = exact_newton_stabilizing_shift(lhs_dense, ridge_floor) else {
        return;
    };
    lhs_dense.diag_mut().mapv_inplace(|entry| entry + shift);
}
/// Re-express constraints on `β` as constraints on a step `δ` taken from `beta`.
///
/// The returned system keeps the same matrix `A` and uses the right-hand side
/// `b − A·beta`.
///
/// # Errors
/// A stringified `CustomFamilyError::ConstraintViolation` when `A`'s column
/// count differs from `beta.len()` or its row count differs from `b.len()`.
fn shift_linear_constraints_to_delta(
    constraints: &LinearInequalityConstraints,
    beta: &Array1<f64>,
) -> Result<LinearInequalityConstraints, String> {
    let LinearInequalityConstraints { a, b } = constraints;
    let shapes_agree = a.ncols() == beta.len() && a.nrows() == b.len();
    if !shapes_agree {
        return Err(CustomFamilyError::ConstraintViolation {
            reason: "linear constraints: shape mismatch".to_string(),
        }
        .into());
    }
    let shifted_rhs = b - &a.dot(beta);
    Ok(LinearInequalityConstraints {
        a: a.clone(),
        b: shifted_rhs,
    })
}
/// Ask the family for the linear inequality constraints of every block.
///
/// The result has one entry per element of `specs`, in order; an entry is
/// `None` when the family reports no constraints for that block. The first
/// error returned by `block_linear_constraints` aborts the collection.
fn collect_block_linear_constraints<F: CustomFamily + ?Sized>(
    family: &F,
    states: &[ParameterBlockState],
    specs: &[ParameterBlockSpec],
) -> Result<Vec<Option<LinearInequalityConstraints>>, String> {
    specs
        .iter()
        .enumerate()
        .map(
            |(block_idx, spec)| -> Result<Option<LinearInequalityConstraints>, String> {
                Ok(family.block_linear_constraints(states, block_idx, spec)?)
            },
        )
        .collect()
}
/// Refuse a post-update hook that materially alters a constrained block's
/// coefficients after the Newton/QP solve.
///
/// * `block_idx`, `spec` – identify the block in error messages.
/// * `raw_beta` – coefficients as produced by the solve.
/// * `updated_beta` – coefficients after the post-update hook ran.
/// * `constraints` – the block's constraints; `None` means the block is
///   unconstrained and the check is skipped.
///
/// The change is measured as `max_i |raw_i − updated_i|` and compared against
/// `1e-10 · (1 + scale)`, where `scale` is the largest finite magnitude in
/// either vector. Entries that compare equal (or are bit-identical) count as
/// unchanged. Any other pair whose difference is NaN or infinite is treated as
/// a material change: `f64::max` silently discards NaN, and an infinite entry
/// would otherwise inflate the tolerance to infinity, so without this a hook
/// that poisons β with NaN/±inf would pass unnoticed.
///
/// # Errors
/// * `DimensionMismatch` when the two vectors differ in length or do not match
///   the constraint matrix width.
/// * `ConstraintViolation` when the hook changed the coefficients beyond the
///   tolerance.
fn reject_constrained_post_update_repair(
    block_idx: usize,
    spec: &ParameterBlockSpec,
    raw_beta: &Array1<f64>,
    updated_beta: &Array1<f64>,
    constraints: Option<&LinearInequalityConstraints>,
) -> Result<(), String> {
    let Some(constraints) = constraints else {
        return Ok(());
    };
    if raw_beta.len() != updated_beta.len() {
        return Err(CustomFamilyError::DimensionMismatch {
            reason: format!(
                "post-update beta length changed for constrained block '{}' (idx {block_idx}): raw={}, updated={}",
                spec.name,
                raw_beta.len(),
                updated_beta.len(),
            ),
        }
        .into());
    }
    if raw_beta.len() != constraints.a.ncols() {
        return Err(CustomFamilyError::DimensionMismatch {
            reason: format!(
                "post-update constrained block '{}' (idx {block_idx}) width mismatch: beta={}, constraints={}",
                spec.name,
                raw_beta.len(),
                constraints.a.ncols(),
            ),
        }
        .into());
    }
    // NaN-sticky maximum: once a NaN difference is seen it is kept, instead of
    // being dropped the way `f64::max` would drop it.
    let max_change = raw_beta
        .iter()
        .zip(updated_beta.iter())
        .map(|(&raw, &updated)| {
            if raw == updated || raw.to_bits() == updated.to_bits() {
                0.0
            } else {
                (raw - updated).abs()
            }
        })
        .fold(0.0_f64, |worst, change| {
            if change.is_nan() || change > worst {
                change
            } else {
                worst
            }
        });
    // Only finite magnitudes set the scale, so a single ±inf entry cannot blow
    // the tolerance up to infinity and mask every change.
    let finite_scale = |beta: &Array1<f64>| {
        beta.iter()
            .map(|v| v.abs())
            .filter(|v| v.is_finite())
            .fold(0.0_f64, f64::max)
    };
    let tol = 1e-10 * (1.0 + finite_scale(raw_beta).max(finite_scale(updated_beta)));
    if max_change.is_nan() || max_change > tol {
        return Err(CustomFamilyError::ConstraintViolation {
            reason: format!(
                "post-update hook materially changed constrained block '{}' (idx {block_idx}): \
max |β_post - β_qp|={max_change:.3e} > tol={tol:.3e}; \
constraints must be represented analytically in block_linear_constraints, not repaired after the Newton/QP solve",
                spec.name,
            ),
        }
        .into());
    }
    Ok(())
}
/// Stack per-block inequality systems into one joint system over the
/// concatenated coefficient vector.
///
/// * `block_constraints` – one optional system per block.
/// * `ranges` – `(start, end)` column range of each block inside the joint β.
/// * `total_p` – width of the joint system.
///
/// Each block's `A` is copied into its own row band and its own column range;
/// all other entries stay zero. Rows appear in block order. Returns `Ok(None)`
/// when no block contributes any row.
///
/// # Errors
/// A stringified `CustomFamilyError::DimensionMismatch` when the slice lengths
/// differ, or when a block's `A`/`b` shapes disagree with each other or with
/// the block width.
fn assemble_joint_linear_constraints(
    block_constraints: &[Option<LinearInequalityConstraints>],
    ranges: &[(usize, usize)],
    total_p: usize,
) -> Result<Option<LinearInequalityConstraints>, String> {
    if block_constraints.len() != ranges.len() {
        return Err(CustomFamilyError::DimensionMismatch {
            reason: format!(
                "joint linear constraint assembly mismatch: {} blocks but {} ranges",
                block_constraints.len(),
                ranges.len()
            ),
        }
        .into());
    }
    let total_rows: usize = block_constraints
        .iter()
        .flatten()
        .map(|block| block.a.nrows())
        .sum();
    if total_rows == 0 {
        return Ok(None);
    }
    let mut joint_a = Array2::<f64>::zeros((total_rows, total_p));
    let mut joint_b = Array1::<f64>::zeros(total_rows);
    let mut next_row = 0usize;
    for (block_idx, (slot, &(start, end))) in
        block_constraints.iter().zip(ranges.iter()).enumerate()
    {
        let Some(block) = slot.as_ref() else {
            continue;
        };
        let width = end - start;
        let block_rows = block.a.nrows();
        if block.a.ncols() != width || block_rows != block.b.len() {
            return Err(CustomFamilyError::DimensionMismatch {
                reason: format!(
                    "joint linear constraint assembly mismatch for block {block_idx}: A is {}x{}, b is {}, block width is {}",
                    block_rows,
                    block.a.ncols(),
                    block.b.len(),
                    width
                ),
            }
            .into());
        }
        let row_band = next_row..next_row + block_rows;
        joint_a
            .slice_mut(s![row_band.clone(), start..end])
            .assign(&block.a);
        joint_b.slice_mut(s![row_band.clone()]).assign(&block.b);
        next_row = row_band.end;
    }
    Ok(Some(LinearInequalityConstraints {
        a: joint_a,
        b: joint_b,
    }))
}
/// Translate per-block active row indices into row indices of the stacked
/// joint constraint system.
///
/// Each block's rows are offset by the total row count of the blocks before
/// it. Local indices that fall outside the block's row count are dropped.
/// Returns `None` when the two slices differ in length or when no active row
/// survives.
fn flatten_joint_active_set(
    block_active_sets: &[Option<Vec<usize>>],
    block_constraints: &[Option<LinearInequalityConstraints>],
) -> Option<Vec<usize>> {
    if block_active_sets.len() != block_constraints.len() {
        return None;
    }
    let mut joint = Vec::new();
    let mut row_base = 0usize;
    for (active_opt, constraints_opt) in block_active_sets.iter().zip(block_constraints.iter()) {
        let block_rows = match constraints_opt {
            Some(block) => block.a.nrows(),
            None => 0,
        };
        for &local in active_opt.iter().flatten() {
            if local < block_rows {
                joint.push(row_base + local);
            }
        }
        row_base += block_rows;
    }
    (!joint.is_empty()).then_some(joint)
}
/// Split joint active row indices back into per-block local indices.
///
/// Inverse of the row stacking used for the joint system: block `k` owns the
/// joint rows `[offset_k, offset_k + rows_k)`. Every block receives `None`
/// when it has no constraint rows or none of its rows are active, otherwise a
/// sorted, de-duplicated list of local row indices.
fn scatter_joint_active_set(
    joint_active: &[usize],
    block_constraints: &[Option<LinearInequalityConstraints>],
) -> Vec<Option<Vec<usize>>> {
    let mut row_base = 0usize;
    block_constraints
        .iter()
        .map(|constraints_opt| {
            let block_rows = constraints_opt
                .as_ref()
                .map_or(0, |block| block.a.nrows());
            // Joint rows owned by this block; empty when the block has no rows.
            let window = row_base..row_base + block_rows;
            row_base = window.end;
            let mut local: Vec<usize> = joint_active
                .iter()
                .copied()
                .filter(|idx| window.contains(idx))
                .map(|idx| idx - window.start)
                .collect();
            if local.is_empty() {
                return None;
            }
            local.sort_unstable();
            local.dedup();
            Some(local)
        })
        .collect()
}
/// Gather the currently **active rows** of every block's inequality matrix
/// into one `(k_active × total_p)` matrix for the unified evaluator's
/// constraint-aware kernel.
///
/// Arguments:
/// * `block_constraints` – each block's full dense inequality system (as
///   produced by `collect_block_linear_constraints`).
/// * `block_active_sets` – each block's active local row indices.
/// * `ranges` – each block's `(start, end)` column range in the joint β.
/// * `total_p` – width of the joint β.
///
/// Active rows are emitted in block order, then in the order listed in the
/// block's active set; each row is placed in its block's column range with
/// zeros elsewhere.
///
/// Returns `None` when no block has an active row (the caller can skip the
/// constraint-aware kernel), and also when the inputs are inconsistent:
/// mismatched slice lengths, a constraint width different from the block
/// width, or an active index beyond the block's row count.
fn assemble_active_constraint_block(
    block_constraints: &[Option<LinearInequalityConstraints>],
    block_active_sets: &[Option<Vec<usize>>],
    ranges: &[(usize, usize)],
    total_p: usize,
) -> Option<crate::solver::estimate::reml::unified::ActiveLinearConstraintBlock> {
    let n_blocks = ranges.len();
    if block_constraints.len() != n_blocks || block_active_sets.len() != n_blocks {
        return None;
    }
    // Pass 1: validate and remember which blocks contribute rows.
    let mut selected: Vec<(usize, &[usize], &LinearInequalityConstraints)> = Vec::new();
    for block in 0..n_blocks {
        let (Some(constraints), Some(active)) = (
            block_constraints[block].as_ref(),
            block_active_sets[block].as_ref(),
        ) else {
            continue;
        };
        if active.is_empty() {
            continue;
        }
        let (start, end) = ranges[block];
        if constraints.a.ncols() != end - start {
            return None;
        }
        if active.iter().any(|&row| row >= constraints.a.nrows()) {
            return None;
        }
        selected.push((block, active.as_slice(), constraints));
    }
    let total_active: usize = selected.iter().map(|(_, active, _)| active.len()).sum();
    if total_active == 0 {
        return None;
    }
    // Pass 2: copy each active row into its block's column range.
    let mut a = ndarray::Array2::<f64>::zeros((total_active, total_p));
    let mut out_row = 0usize;
    for (block, active, constraints) in selected {
        let (start, _) = ranges[block];
        for &local_row in active {
            for (offset, &value) in constraints.a.row(local_row).iter().enumerate() {
                a[[out_row, start + offset]] = value;
            }
            out_row += 1;
        }
    }
    Some(crate::solver::estimate::reml::unified::ActiveLinearConstraintBlock { a })
}
/// Per-coefficient lower bounds extracted from a linear inequality system in
/// which every row touches exactly one coefficient (see
/// `extract_simple_lower_bounds`), plus the maps between constraint rows and
/// coefficients.
struct SimpleLowerBounds {
/// Tightest lower bound per coefficient; `-inf` where no row bounds it.
lower_bounds: Array1<f64>,
/// For each constraint row, the index of the single coefficient it bounds.
row_to_coeff: Vec<usize>,
/// For each coefficient, the row that supplied its tightest bound, if any.
coeff_to_row: Vec<Option<usize>>,
}
/// Try to read a linear inequality system as plain per-coefficient lower
/// bounds.
///
/// A row qualifies when exactly one coefficient has magnitude above `1e-12`
/// and that coefficient is positive; it then contributes the bound
/// `b[row] / coefficient` for that coefficient. When several rows bound the
/// same coefficient the largest bound wins, and the first row attaining it is
/// recorded in `coeff_to_row`.
///
/// Returns `Ok(None)` as soon as any row does not qualify (no non-zero entry,
/// more than one, or a non-positive one).
///
/// # Errors
/// A stringified `CustomFamilyError::ConstraintViolation` when `A` does not
/// have `p` columns or its row count differs from `b.len()`.
fn extract_simple_lower_bounds(
    constraints: &LinearInequalityConstraints,
    p: usize,
) -> Result<Option<SimpleLowerBounds>, String> {
    let n_rows = constraints.a.nrows();
    if constraints.a.ncols() != p || n_rows != constraints.b.len() {
        return Err(CustomFamilyError::ConstraintViolation {
            reason: "linear constraints: shape mismatch".to_string(),
        }
        .into());
    }
    let mut lower_bounds = Array1::from_elem(p, f64::NEG_INFINITY);
    let mut coeff_to_row: Vec<Option<usize>> = vec![None; p];
    let mut row_to_coeff = Vec::with_capacity(n_rows);
    for (row, coefficients) in constraints.a.outer_iter().enumerate() {
        // Entries at or below 1e-12 in magnitude count as zero. The negated
        // `<=` keeps NaN entries in the support, exactly like a skip-if-small test.
        let mut support = coefficients
            .iter()
            .copied()
            .enumerate()
            .filter(|&(_, value)| !(value.abs() <= 1e-12));
        let Some((col, coeff_value)) = support.next() else {
            return Ok(None);
        };
        if support.next().is_some() || coeff_value <= 0.0 {
            return Ok(None);
        }
        let bound = constraints.b[row] / coeff_value;
        if bound > lower_bounds[col] {
            lower_bounds[col] = bound;
            coeff_to_row[col] = Some(row);
        }
        row_to_coeff.push(col);
    }
    Ok(Some(SimpleLowerBounds {
        lower_bounds,
        row_to_coeff,
        coeff_to_row,
    }))
}
fn lower_bound_active_rows_to_coeffs(
bounds: &SimpleLowerBounds,
active_rows: Option<&[usize]>,
) -> Vec<usize> {
let Some(active_rows) = active_rows else {
return Vec::new();
};
let mut active_coeffs = active_rows
.iter()
.copied()
.filter_map(|row| bounds.row_to_coeff.get(row).copied())
.collect::<Vec<_>>();
active_coeffs.sort_unstable();
active_coeffs.dedup();
active_coeffs
}
fn lower_bound_active_coeffs_to_rows(
bounds: &SimpleLowerBounds,
active_coeffs: &[usize],
) -> Vec<usize> {
let mut active_rows = active_coeffs
.iter()
.copied()
.filter_map(|coeff| bounds.coeff_to_row.get(coeff).and_then(|row| *row))
.collect::<Vec<_>>();
active_rows.sort_unstable();
active_rows.dedup();
active_rows
}
/// List the coefficients of `beta` that sit on (or numerically at) their
/// finite lower bound.
///
/// A coefficient is active when `beta[i] <= lower[i] + tol` with
/// `tol = 1e-6 · max(|beta[i]|, |lower[i]|, 1) + 1e-10`. Coefficients whose
/// bound is not finite are never active. Indices are returned in ascending
/// order.
fn lower_bound_active_coeffs_from_solution(
    bounds: &SimpleLowerBounds,
    beta: &Array1<f64>,
) -> Vec<usize> {
    beta.iter()
        .enumerate()
        .filter(|&(coeff, &value)| {
            let lower = bounds.lower_bounds[coeff];
            if !lower.is_finite() {
                return false;
            }
            let scale = value.abs().max(lower.abs()).max(1.0);
            let tol = 1e-6 * scale + 1e-10;
            value <= lower + tol
        })
        .map(|(coeff, _)| coeff)
        .collect()
}
/// Clamp `beta` in place so that no entry lies below its finite lower bound.
///
/// Entries whose bound is not finite are left untouched.
fn project_to_lower_bounds(beta: &mut Array1<f64>, lower_bounds: &Array1<f64>) {
    for (i, value) in beta.iter_mut().enumerate() {
        let lower = lower_bounds[i];
        if lower.is_finite() && *value < lower {
            *value = lower;
        }
    }
}
fn solve_quadratic_with_simple_lower_bounds(
lhs: &Array2<f64>,
rhs: &Array1<f64>,
beta_start: &Array1<f64>,
bounds: &SimpleLowerBounds,
active_rows: Option<&[usize]>,
) -> Result<(Array1<f64>, Vec<usize>), String> {
let gradient = lhs.dot(beta_start) - rhs;
let mut delta = Array1::zeros(beta_start.len());
let mut active_coeffs = lower_bound_active_rows_to_coeffs(bounds, active_rows);
solve_newton_directionwith_lower_bounds(
lhs,
&gradient,
beta_start,
&bounds.lower_bounds,
&mut delta,
Some(&mut active_coeffs),
)
.map_err(|e| format!("lower-bound Newton solve failed: {e}"))?;
let mut beta_new = beta_start + δ
project_to_lower_bounds(&mut beta_new, &bounds.lower_bounds);
active_coeffs = lower_bound_active_coeffs_from_solution(bounds, &beta_new);
let active = lower_bound_active_coeffs_to_rows(bounds, &active_coeffs);
Ok((beta_new, active))
}
/// Canonicalize an active set: sorted ascending, duplicates removed, and
/// `None` in place of an empty list.
fn normalize_active_set(mut active_set: Vec<usize>) -> Option<Vec<usize>> {
    if active_set.is_empty() {
        return None;
    }
    active_set.sort_unstable();
    active_set.dedup();
    Some(active_set)
}
fn normalize_active_sets(active_sets: Vec<Option<Vec<usize>>>) -> Vec<Option<Vec<usize>>> {
active_sets
.into_iter()
.map(|active_set| active_set.and_then(normalize_active_set))
.collect()
}
/// Everything a `ParameterBlockUpdater` needs to compute the update of one
/// parameter block. All fields are borrowed from the caller.
struct BlockUpdateContext<'a> {
/// The family being fitted; consulted for block geometry and solver mode.
family: &'a dyn CustomFamily,
/// Current state of every block; `states[block_idx].beta` is the current
/// coefficient vector of the block being updated.
states: &'a [ParameterBlockState],
/// Specification (design, name, ...) of the block being updated.
spec: &'a ParameterBlockSpec,
/// Index of the block being updated within `states`.
block_idx: usize,
/// Penalty matrix added to the block's left-hand side by the updaters.
s_lambda: &'a Array2<f64>,
/// Fit options; the updaters read `minweight`, `ridge_floor` and `ridge_policy`.
options: &'a BlockwiseFitOptions,
/// Linear inequality constraints on this block's coefficients, if any.
linear_constraints: Option<&'a LinearInequalityConstraints>,
/// Active constraint rows handed to the constrained solvers
/// (presumably a warm start from a previous solve — confirm with the caller).
cached_active_set: Option<&'a [usize]>,
}
/// Outcome of one block update computed by a `ParameterBlockUpdater`.
struct BlockUpdateResult {
/// New coefficient vector for the block, exactly as returned by the solve.
beta_new_raw: Array1<f64>,
/// Active constraint rows at the new coefficients (sorted, de-duplicated);
/// `None` for unconstrained solves or when no row is active.
active_set: Option<Vec<usize>>,
}
/// Return a copy of the working weights in which non-positive weights become
/// exactly zero and positive weights are raised to at least `minweight`.
///
/// The element-wise map runs through ndarray's parallel `Zip`.
#[inline]
fn floor_positiveworking_weights(working_weights: &Array1<f64>, minweight: f64) -> Array1<f64> {
    let floor_one = move |weight: f64| -> f64 {
        if weight <= 0.0 {
            0.0
        } else {
            weight.max(minweight)
        }
    };
    let mut floored = Array1::<f64>::zeros(working_weights.len());
    ndarray::Zip::from(&mut floored)
        .and(working_weights)
        .par_for_each(|slot, &weight| *slot = floor_one(weight));
    floored
}
/// Strategy for computing the coefficient update of a single parameter block.
/// One implementation exists per `BlockWorkingSet` variant (see
/// `BlockWorkingSet::updater`).
trait ParameterBlockUpdater {
/// Compute the block's new coefficients (and active constraint rows, if
/// constrained) from the context `ctx`. Errors are reported as strings.
fn compute_update_step(
&self,
ctx: &BlockUpdateContext<'_>,
) -> Result<BlockUpdateResult, String>;
}
/// Updater for `BlockWorkingSet::Diagonal`: a penalized weighted least-squares
/// update driven by a working response and per-observation working weights.
struct DiagonalBlockUpdater<'a> {
/// Working response, one entry per design row.
working_response: &'a Array1<f64>,
/// Working weights, one entry per design row.
working_weights: &'a Array1<f64>,
}
impl ParameterBlockUpdater for DiagonalBlockUpdater<'_> {
/// Penalized weighted least-squares update for one block.
///
/// The working response and weights must have one entry per design row.
/// Positive weights are floored at `ctx.options.minweight`; non-positive
/// weights become zero.
///
/// * With linear constraints: the current coefficients must be feasible
///   (tolerance `1e-8`); the system `(XᵀWX + S_λ, XᵀW(z − offset))` is then
///   solved either with the simple lower-bound solver (when every constraint
///   row is a single-coefficient lower bound) or with the general linear
///   inequality QP solver.
/// * Without constraints: the design's `solve_systemwith_policy` solves the
///   penalized system directly, using the configured ridge floor and policy.
///
/// Errors are returned as strings tagged with the block index and name.
fn compute_update_step(
&self,
ctx: &BlockUpdateContext<'_>,
) -> Result<BlockUpdateResult, String> {
// Both working vectors must line up with the design rows.
if self.working_response.len() != ctx.spec.design.nrows()
|| self.working_weights.len() != ctx.spec.design.nrows()
{
return Err(CustomFamilyError::DimensionMismatch {
reason: format!(
"family diagonal working-set size mismatch on block {} ({})",
ctx.block_idx, ctx.spec.name
),
}
.into());
}
// Zero-weight observations are semantically excluded and must stay inactive.
let w_clamped = floor_positiveworking_weights(self.working_weights, ctx.options.minweight);
if let Some(constraints) = ctx.linear_constraints {
// The constrained solvers start from the current iterate, so it must be feasible.
check_linear_feasibility(&ctx.states[ctx.block_idx].beta, constraints, 1e-8).map_err(
|e| {
format!(
"block {} ({}) constrained diagonal solve: {e}",
ctx.block_idx, ctx.spec.name
)
},
)?;
with_block_geometry(ctx.family, ctx.states, ctx.spec, ctx.block_idx, |x, off| {
// Offset-adjusted working response: y* = z − offset.
let mut y_star = self.working_response.clone();
y_star -= off;
let (mut lhs, rhs_opt) = weighted_normal_equations(x, &w_clamped, Some(&y_star))?;
let rhs = rhs_opt.ok_or_else(|| {
"missing weighted RHS in constrained diagonal solve".to_string()
})?;
// Penalized left-hand side.
lhs += ctx.s_lambda;
// Prefer the specialised lower-bound solver when the constraints allow it.
let lower_bounds = extract_simple_lower_bounds(constraints, lhs.ncols())?;
let (beta_constrained, active_set) = if let Some(bounds) = lower_bounds.as_ref() {
solve_quadratic_with_simple_lower_bounds(
&lhs,
&rhs,
&ctx.states[ctx.block_idx].beta,
bounds,
ctx.cached_active_set,
)
} else {
solve_quadratic_with_linear_constraints(
&lhs,
&rhs,
&ctx.states[ctx.block_idx].beta,
constraints,
ctx.cached_active_set,
)
.map_err(|e| e.to_string())
}
.map_err(|e| {
format!(
"block {} ({}) constrained diagonal solve failed: {e}",
ctx.block_idx, ctx.spec.name
)
})?;
Ok(BlockUpdateResult {
beta_new_raw: beta_constrained,
active_set: normalize_active_set(active_set),
})
})
} else {
with_block_geometry(ctx.family, ctx.states, ctx.spec, ctx.block_idx, |x, off| {
// Fuse offset subtraction into the weighted RHS: wy[i] = w[i] * (z[i] - off[i]).
// This avoids an O(n) working_response clone.
let n = self.working_response.len();
let wy = Array1::from_shape_fn(n, |i| {
(self.working_response[i] - off[i]) * w_clamped[i].max(0.0)
});
let xtwy = x.transpose_vector_multiply(&wy);
// The underlying solver error is replaced by a fixed message.
let beta = x
.solve_systemwith_policy(
&w_clamped,
&xtwy,
Some(ctx.s_lambda),
ctx.options.ridge_floor,
ctx.options.ridge_policy,
)
.map_err(|_| "block solve failed after ridge retries".to_string())?;
Ok(BlockUpdateResult {
beta_new_raw: beta,
active_set: None,
})
})
}
}
}
/// Updater for `BlockWorkingSet::ExactNewton`: a Newton step built from the
/// family's exact block gradient and Hessian.
struct ExactNewtonBlockUpdater<'a> {
/// Block gradient; the Newton right-hand side is `gradient − S_λ·β`.
gradient: &'a Array1<f64>,
/// Block Hessian; the Newton left-hand side is `hessian + S_λ`.
hessian: &'a SymmetricMatrix,
}
impl ParameterBlockUpdater for ExactNewtonBlockUpdater<'_> {
fn compute_update_step(
&self,
ctx: &BlockUpdateContext<'_>,
) -> Result<BlockUpdateResult, String> {
let p = ctx.spec.design.ncols();
if self.gradient.len() != p {
return Err(CustomFamilyError::DimensionMismatch {
reason: format!(
"block {} exact-newton gradient length mismatch: got {}, expected {p}",
ctx.block_idx,
self.gradient.len()
),
}
.into());
}
if self.hessian.nrows() != p || self.hessian.ncols() != p {
return Err(CustomFamilyError::DimensionMismatch {
reason: format!(
"block {} exact-newton Hessian shape mismatch: got {}x{}, expected {}x{}",
ctx.block_idx,
self.hessian.nrows(),
self.hessian.ncols(),
p,
p
),
}
.into());
}
let lhs = self.hessian.add_dense(ctx.s_lambda)?;
// Solve in delta-space for both constrained and unconstrained blocks.
// That keeps the linear system consistent even when we add a
// numerical ridge to stabilize an indefinite exact-Newton Hessian.
let rhs_step = self.gradient - &ctx.s_lambda.dot(&ctx.states[ctx.block_idx].beta);
let mut lhs_dense = lhs.to_dense();
stabilize_exact_newton_lhs_in_place(ctx.family, &mut lhs_dense, ctx.options.ridge_floor);
if let Some(constraints) = ctx.linear_constraints {
check_linear_feasibility(&ctx.states[ctx.block_idx].beta, constraints, 1e-8).map_err(
|e| {
format!(
"block {} ({}) constrained exact-newton solve: {e}",
ctx.block_idx, ctx.spec.name
)
},
)?;
let lower_bounds = extract_simple_lower_bounds(constraints, p).map_err(|e| {
format!(
"block {} ({}) constrained exact-newton solve: {e}",
ctx.block_idx, ctx.spec.name
)
})?;
let (beta_new_raw, active_set) = if let Some(bounds) = lower_bounds.as_ref() {
let rhs_beta = &lhs_dense.dot(&ctx.states[ctx.block_idx].beta) + &rhs_step;
solve_quadratic_with_simple_lower_bounds(
&lhs_dense,
&rhs_beta,
&ctx.states[ctx.block_idx].beta,
bounds,
ctx.cached_active_set,
)
} else {
let delta_constraints =
shift_linear_constraints_to_delta(constraints, &ctx.states[ctx.block_idx].beta)
.map_err(|e| {
format!(
"block {} ({}) constrained exact-newton solve: {e}",
ctx.block_idx, ctx.spec.name
)
})?;
let delta_start = Array1::zeros(p);
let (delta, active_set) = solve_quadratic_with_linear_constraints(
&lhs_dense,
&rhs_step,
&delta_start,
&delta_constraints,
ctx.cached_active_set,
)
.map_err(|e| e.to_string())?;
Ok((&ctx.states[ctx.block_idx].beta + &delta, active_set))
}
.map_err(|e| {
format!(
"block {} ({}) constrained exact-newton solve failed: {e}",
ctx.block_idx, ctx.spec.name
)
})?;
Ok(BlockUpdateResult {
beta_new_raw,
active_set: normalize_active_set(active_set),
})
} else {
// Solve for the Newton step, not the next beta directly.
//
// For the penalized negative objective
//
// Q(beta) = -log L(beta) + 0.5 beta^T S beta,
//
// the exact block gradient and Hessian are
//
// grad_Q = S beta - gradient,
// hess_Q = hessian + S.
//
// The Newton step must therefore satisfy
//
// hess_Q * delta = -grad_Q = gradient - S beta.
//
// This form stays correct even when the linear solver adds a
// numerical ridge to the left-hand side to stabilize an indefinite
// or nearly singular block. Solving directly for `beta_new` with a
// ridged matrix would require an extra `ridge * beta` term on the
// right-hand side; without it the step is distorted, which can trap
// exact-Newton block updates on nonconvex blocks such as survival
// `log_sigma`.
let delta = if use_exact_newton_strict_spd(ctx.family) {
// Strict-mode Newton step uses the LM δ-ridge continuation:
// a single near-zero eigenvalue from numerical noise in
// H_β should not bounce the entire seed evaluation. The
// bare strict_solve_spd contract is preserved (still used
// by other paths and the existing test
// `pseudo_laplace_path_skips_eigendecomposition_avoiding_nan_crash`);
// here we pay an O(p³) extra Cholesky attempt when needed
// to keep adaptive optimization moving.
let (step, lm_stats) =
strict_solve_spd_with_lm_continuation(&lhs_dense, &rhs_step)?;
if lm_stats.escalations > 0 {
log::debug!(
"[strict-spd-lm] block={} ({}): δ-ridge continuation succeeded \
after {} escalation(s) at δ={:.3e}",
ctx.block_idx,
ctx.spec.name,
lm_stats.escalations,
lm_stats.delta_used,
);
}
step
} else {
// Non-strict (RidgedQuadraticReml) families share the strict
// path's LM δ-ridge continuation. For a nonconvex block whose
// likelihood Hessian H_β is INDEFINITE away from the optimum —
// e.g. the squared-coefficient SCOP transformation-normal tensor
// over a smooth covariate, where the I(y)⊗b(x) columns are
// strongly collinear — the previous `solve_spd_systemwith_policy`
// (ridge-retry + pinv-positive-part) returns a valid but
// poorly-scaled descent step that crawls and hits the inner
// cycle cap. The eigenvalue-floored LM continuation produces a
// well-scaled Newton step on exactly those indefinite /
// ill-conditioned systems. It is a STRICT SUPERSET of the plain
// solve: when H_β + S is SPD and well-conditioned it reduces to
// the same Cholesky step (zero escalations), only escalating the
// floor when the system is genuinely indefinite — so
// well-behaved families see no behaviour change. Internal to the
// solve; β is recovered in the raw basis, so dimensionality /
// identifiability are untouched.
let step = match strict_solve_spd_with_lm_continuation(&lhs_dense, &rhs_step) {
Ok((step, lm_stats)) => {
if lm_stats.escalations > 0 {
log::debug!(
"[joint-Newton/lm] block={} ({}): non-strict δ-ridge continuation \
succeeded after {} escalation(s) at δ={:.3e}",
ctx.block_idx,
ctx.spec.name,
lm_stats.escalations,
lm_stats.delta_used,
);
}
step
}
// Final guard: only if the LM continuation itself fails to
// produce a finite step do we fall back to the diagonal-
// scaled steepest-descent direction (always finite when the
// gradient is finite).
Err(_) => (0..lhs_dense.nrows())
.map(|i| {
let d = lhs_dense[[i, i]].abs().max(1e-8);
rhs_step[i] / d
})
.collect(),
};
step
};
let beta = &ctx.states[ctx.block_idx].beta + δ
Ok(BlockUpdateResult {
beta_new_raw: beta,
active_set: None,
})
}
}
}
impl BlockWorkingSet {
    /// Build the updater that matches this working set's variant, borrowing
    /// the variant's data for the lifetime of `self`.
    fn updater(&self) -> Box<dyn ParameterBlockUpdater + '_> {
        match self {
            Self::ExactNewton { gradient, hessian } => {
                Box::new(ExactNewtonBlockUpdater { gradient, hessian })
            }
            Self::Diagonal {
                working_response: response,
                working_weights: weights,
            } => Box::new(DiagonalBlockUpdater {
                working_response: response,
                working_weights: weights,
            }),
        }
    }
}
/// Verify that `beta` satisfies `A·beta ≥ b` row by row, up to `tol`.
///
/// The violation of a row is `max(0, b_i − (A·beta)_i)`. The check fails when
/// the largest violation exceeds `tol`; the error names the first row that
/// attains it.
///
/// # Errors
/// A stringified `CustomFamilyError::ConstraintViolation` for a shape
/// mismatch between `A`, `b` and `beta`, or for an infeasible iterate.
fn check_linear_feasibility(
    beta: &Array1<f64>,
    constraints: &LinearInequalityConstraints,
    tol: f64,
) -> Result<(), String> {
    let shapes_agree =
        constraints.a.ncols() == beta.len() && constraints.a.nrows() == constraints.b.len();
    if !shapes_agree {
        return Err(CustomFamilyError::ConstraintViolation {
            reason: "linear constraints: shape mismatch".to_string(),
        }
        .into());
    }
    let residual = constraints.a.dot(beta) - &constraints.b;
    // Track the largest violation and the first row where it occurs.
    let (worst_idx, worst) = residual.iter().enumerate().fold(
        (0usize, 0.0_f64),
        |(best_idx, best), (row, &slack)| {
            let violation = (-slack).max(0.0);
            if violation > best {
                (row, violation)
            } else {
                (best_idx, best)
            }
        },
    );
    if worst > tol {
        return Err(CustomFamilyError::ConstraintViolation {
            reason: format!(
                "infeasible iterate: max(Aβ-b violation)={worst:.3e} at constraint row {worst_idx}"
            ),
        }
        .into());
    }
    Ok(())
}
/// Ridge actually used by the solver: the requested floor, but never below
/// `1e-15`. A NaN request also yields `1e-15`.
#[inline]
fn effective_solverridge(ridge_floor: f64) -> f64 {
    const MIN_SOLVER_RIDGE: f64 = 1e-15;
    if ridge_floor > MIN_SOLVER_RIDGE {
        ridge_floor
    } else {
        MIN_SOLVER_RIDGE
    }
}
/// Quadratic penalty of one block: `½·βᵀ S_λ β`, plus `½·ridge·βᵀβ` when the
/// ridge policy includes the ridge in the quadratic penalty.
fn block_quadratic_penalty(
    beta: &Array1<f64>,
    s_lambda: &Array2<f64>,
    ridge: f64,
    ridge_policy: RidgePolicy,
) -> f64 {
    let smoothing_term = 0.5 * beta.dot(&s_lambda.dot(beta));
    if !ridge_policy.include_quadratic_penalty {
        return smoothing_term;
    }
    smoothing_term + 0.5 * ridge * beta.dot(beta)
}
/// Apply one block's penalized Hessian to `direction`.
///
/// The likelihood part is `hessian·direction` for an exact-Newton working set
/// and `Xᵀ(w ⊙ (X·direction))` (with the block's solver design `X`) for a
/// diagonal working set. `S_λ·direction` is then added, and
/// `ridge·direction` as well when the policy includes the ridge in the
/// quadratic penalty and `ridge > 0`.
fn block_penalized_hessian_vector(
    spec: &ParameterBlockSpec,
    work: &BlockWorkingSet,
    s_lambda: &Array2<f64>,
    direction: &Array1<f64>,
    ridge: f64,
    ridge_policy: RidgePolicy,
) -> Array1<f64> {
    let likelihood_part = match work {
        BlockWorkingSet::Diagonal {
            working_weights, ..
        } => {
            let design = spec.solver_design();
            let fitted_direction = design.matrixvectormultiply(direction);
            let weighted = &fitted_direction * working_weights;
            design.transpose_vector_multiply(&weighted)
        }
        BlockWorkingSet::ExactNewton { hessian, .. } => hessian.dot(direction),
    };
    let mut product = likelihood_part + &s_lambda.dot(direction);
    let ridge_applies = ridge_policy.include_quadratic_penalty && ridge > 0.0;
    if ridge_applies {
        product.scaled_add(ridge, direction);
    }
    product
}
/// Extract the main diagonal of a symmetric matrix as an owned vector.
///
/// Dense storage copies the diagonal directly. Sparse storage walks every
/// column's stored entries and accumulates those whose row index equals the
/// column index, so duplicate diagonal entries are summed and absent ones
/// read as zero.
fn symmetric_matrix_diagonal(matrix: &SymmetricMatrix) -> Array1<f64> {
    let sparse = match matrix {
        SymmetricMatrix::Dense(dense) => return dense.diag().to_owned(),
        SymmetricMatrix::Sparse(sparse) => sparse,
    };
    let (symbolic, values) = sparse.parts();
    let col_ptr = symbolic.col_ptr();
    let row_idx = symbolic.row_idx();
    let mut diagonal = Array1::<f64>::zeros(sparse.ncols());
    for (col, slot) in diagonal.iter_mut().enumerate() {
        for idx in col_ptr[col]..col_ptr[col + 1] {
            if row_idx[idx] == col {
                *slot += values[idx];
            }
        }
    }
    diagonal
}
/// Diagonal of one block's penalized metric.
///
/// Starts from the Hessian diagonal (exact-Newton working set) or the
/// weighted Gram diagonal of the block design (diagonal working set), adds
/// the diagonal of `S_λ`, adds `ridge` when the policy includes it in the
/// quadratic penalty and `ridge > 0`, and finally passes every entry through
/// `positive_joint_diagonal_entry`.
///
/// # Errors
/// A shape-mismatch message when `S_λ` is not square or its size differs from
/// the diagonal length, or any error from `diag_gram`.
fn block_penalized_metric_diagonal(
    spec: &ParameterBlockSpec,
    work: &BlockWorkingSet,
    s_lambda: &Array2<f64>,
    ridge: f64,
    ridge_policy: RidgePolicy,
) -> Result<Array1<f64>, String> {
    let mut metric = match work {
        BlockWorkingSet::Diagonal {
            working_weights, ..
        } => spec.design.diag_gram(working_weights)?,
        BlockWorkingSet::ExactNewton { hessian, .. } => symmetric_matrix_diagonal(hessian),
    };
    let (s_rows, s_cols) = s_lambda.dim();
    if metric.len() != s_rows || s_rows != s_cols {
        return Err(format!(
            "block penalized metric diagonal shape mismatch: diag={}, S={}x{}",
            metric.len(),
            s_rows,
            s_cols
        ));
    }
    let ridge_applies = ridge_policy.include_quadratic_penalty && ridge > 0.0;
    for (entry, &penalty) in metric.iter_mut().zip(s_lambda.diag().iter()) {
        let mut value = *entry + penalty;
        if ridge_applies {
            value += ridge;
        }
        *entry = positive_joint_diagonal_entry(value);
    }
    Ok(metric)
}
/// Norm of `direction` in the block's diagonal penalized metric, as computed
/// by `joint_trust_region_metric_step_norm`.
///
/// # Errors
/// Any error from `block_penalized_metric_diagonal`, or a length-mismatch
/// message when the metric diagonal and `direction` differ in length.
fn block_penalized_metric_norm(
    spec: &ParameterBlockSpec,
    work: &BlockWorkingSet,
    s_lambda: &Array2<f64>,
    direction: &Array1<f64>,
    ridge: f64,
    ridge_policy: RidgePolicy,
) -> Result<f64, String> {
    let metric = block_penalized_metric_diagonal(spec, work, s_lambda, ridge, ridge_policy)?;
    let (direction_len, metric_len) = (direction.len(), metric.len());
    if metric_len == direction_len {
        Ok(joint_trust_region_metric_step_norm(direction, &metric))
    } else {
        Err(format!(
            "block penalized metric direction length mismatch: direction={}, diag={}",
            direction_len, metric_len
        ))
    }
}
/// Shrink a block step so its penalized-metric norm does not exceed `radius`.
///
/// Returns the (possibly scaled) step together with its metric norm. The step
/// is scaled by `radius / norm` only when the norm is finite, `radius` is
/// positive and the norm exceeds it; in that case the reported norm is
/// `radius`. Otherwise the step is returned unchanged with its computed norm.
fn truncate_block_step_to_metric_radius(
    spec: &ParameterBlockSpec,
    work: &BlockWorkingSet,
    s_lambda: &Array2<f64>,
    delta: Array1<f64>,
    radius: f64,
    ridge: f64,
    ridge_policy: RidgePolicy,
) -> Result<(Array1<f64>, f64), String> {
    let norm = block_penalized_metric_norm(spec, work, s_lambda, &delta, ridge, ridge_policy)?;
    let exceeds_radius = norm.is_finite() && norm > radius && radius > 0.0;
    if !exceeds_radius {
        return Ok((delta, norm));
    }
    let shrink = radius / norm;
    Ok((&delta * shrink, radius))
}
/// Minimum number of paired (state, penalty) blocks before the total quadratic
/// penalty is evaluated in parallel.
const TOTAL_QUADRATIC_PENALTY_PAR_MIN_BLOCKS: usize = 4;
// Avoid Rayon overhead for a few tiny blocks; this approximates the dense
// mat-vec work in βᵀSβ before splitting independent block penalties.
// The work estimate is the sum over blocks of (coefficient count × penalty rows).
const TOTAL_QUADRATIC_PENALTY_PAR_MIN_DENSE_WORK: usize = 16_384;
/// Decide whether the per-block quadratic penalties are worth evaluating in
/// parallel.
///
/// Parallel evaluation is chosen only when there are at least
/// `TOTAL_QUADRATIC_PENALTY_PAR_MIN_BLOCKS` paired blocks, Rayon has more
/// than one thread, and the accumulated dense work estimate
/// `Σ min(|β|, cols(S))·rows(S)` reaches
/// `TOTAL_QUADRATIC_PENALTY_PAR_MIN_DENSE_WORK`. The scan stops at the first
/// block that pushes the estimate over the threshold.
fn total_quadratic_penalty_parallel_worthwhile(
    states: &[ParameterBlockState],
    s_lambdas: &[Array2<f64>],
) -> bool {
    let paired_blocks = states.len().min(s_lambdas.len());
    if paired_blocks < TOTAL_QUADRATIC_PENALTY_PAR_MIN_BLOCKS
        || rayon::current_num_threads() <= 1
    {
        return false;
    }
    let mut dense_work = 0usize;
    for (state, s_lambda) in states.iter().zip(s_lambdas.iter()) {
        let cols = state.beta.len().min(s_lambda.ncols());
        dense_work = dense_work.saturating_add(cols.saturating_mul(s_lambda.nrows()));
        if dense_work >= TOTAL_QUADRATIC_PENALTY_PAR_MIN_DENSE_WORK {
            return true;
        }
    }
    false
}
/// Total quadratic penalty over all blocks, plus the optional joint
/// full-width penalty.
///
/// The per-block part sums `block_quadratic_penalty` over the paired
/// `(state, S_λ)` entries, in parallel when
/// `total_quadratic_penalty_parallel_worthwhile` says so. The joint part is
/// `bundle.quadratic(β_flat)` and is included only when both a non-empty
/// bundle and the block specs are supplied.
fn total_quadratic_penalty(
    states: &[ParameterBlockState],
    s_lambdas: &[Array2<f64>],
    ridge: f64,
    ridge_policy: RidgePolicy,
    joint_full_width: Option<&crate::families::joint_penalty::JointPenaltyBundle>,
    specs: Option<&[ParameterBlockSpec]>,
) -> f64 {
    let block_term = |state: &ParameterBlockState, s_lambda: &Array2<f64>| -> f64 {
        block_quadratic_penalty(&state.beta, s_lambda, ridge, ridge_policy)
    };
    let blockwise_total: f64 = if total_quadratic_penalty_parallel_worthwhile(states, s_lambdas) {
        use rayon::iter::{IndexedParallelIterator, IntoParallelRefIterator, ParallelIterator};
        states
            .par_iter()
            .zip(s_lambdas.par_iter())
            .map(|(state, s_lambda)| block_term(state, s_lambda))
            .reduce(|| 0.0, |left, right| left + right)
    } else {
        states
            .iter()
            .zip(s_lambdas.iter())
            .map(|(state, s_lambda)| block_term(state, s_lambda))
            .sum()
    };
    // The joint term needs both the bundle and the specs (to flatten β).
    let joint_total = joint_full_width
        .zip(specs)
        .filter(|(bundle, _)| !bundle.is_empty())
        .map_or(0.0, |(bundle, specs)| {
            let beta_flat = flatten_state_betas(states, specs);
            bundle.quadratic(beta_flat.view())
        });
    blockwise_total + joint_total
}
/// Scan a Hessian for its first non-finite entry and report it with the
/// canonical "smooth-regularized logdet" error message.
///
/// Every site that refuses to factor or iterate on a non-finite Hessian — the
/// logdet computation and the inner-fit entry where exact-Newton block
/// Hessians arrive from the family — uses this one phrasing, so callers and
/// tests can treat the failure as a single event wherever it is caught: a
/// non-finite entry violates the contract on the family's analytic second
/// derivative.
///
/// `block`, when given, is appended to the message as the offending block
/// index. Returns `Ok(())` when every entry is finite.
fn smooth_regularized_logdet_hessian_finite_check(
    matrix: &Array2<f64>,
    block: Option<usize>,
) -> Result<(), String> {
    for ((row, col), &value) in matrix.indexed_iter() {
        if value.is_finite() {
            continue;
        }
        let block_context = block
            .map(|b| format!(" for block {b}"))
            .unwrap_or_default();
        return Err(CustomFamilyError::NumericalFailure {
            reason: format!(
                "smooth-regularized logdet Hessian contains non-finite entry at ({row}, {col}): {value}{block_context}"
            ),
        }
        .into());
    }
    Ok(())
}
/// Check that every exact-Newton block working set of a family evaluation
/// carries a finite Hessian.
///
/// Fails on the first non-finite entry with the canonical smooth-regularized
/// logdet message, including the offending block index. Diagonal working sets
/// are skipped. Dense Hessians are scanned entry by entry; sparse Hessians
/// are scanned over their stored values, column by column.
///
/// Exact-Newton Hessians are the family's analytic second derivative of the
/// log-likelihood, so a non-finite entry means that derivative is invalid.
/// Rejecting it at the family-evaluation boundary keeps the inner solver from
/// iterating on a poisoned Hessian and "converging" by accident, or from
/// having the bad entries masked by a downstream eigendecomposition fallback.
fn validate_block_hessians_finite(eval: &FamilyEvaluation) -> Result<(), String> {
    for (b, working_set) in eval.blockworking_sets.iter().enumerate() {
        match working_set {
            BlockWorkingSet::Diagonal { .. } => {}
            BlockWorkingSet::ExactNewton {
                hessian: SymmetricMatrix::Dense(matrix),
                ..
            } => smooth_regularized_logdet_hessian_finite_check(matrix, Some(b))?,
            BlockWorkingSet::ExactNewton {
                hessian: SymmetricMatrix::Sparse(matrix),
                ..
            } => {
                let (symbolic, values) = matrix.parts();
                let col_ptr = symbolic.col_ptr();
                let row_idx = symbolic.row_idx();
                for col in 0..matrix.ncols() {
                    for idx in col_ptr[col]..col_ptr[col + 1] {
                        let row = row_idx[idx];
                        let value = values[idx];
                        if value.is_finite() {
                            continue;
                        }
                        return Err(CustomFamilyError::NumericalFailure {
                            reason: format!(
                                "smooth-regularized logdet Hessian contains non-finite entry at ({row}, {col}): {value} for block {b}"
                            ),
                        }
                        .into());
                    }
                }
            }
        }
    }
    Ok(())
}
fn stable_logdet_with_ridge_policy(
matrix: &Array2<f64>,
ridge_floor: f64,
ridge_policy: RidgePolicy,
) -> Result<f64, String> {
let mut a = matrix.clone();
symmetrize_dense_in_place(&mut a);
let p = a.nrows();
let ridge = if ridge_policy.include_penalty_logdet {
effective_solverridge(ridge_floor)
} else {
0.0
};
for i in 0..p {
a[[i, i]] += ridge;
}
match resolved_ridge_determinant_mode(ridge_policy, p) {
RidgeDeterminantMode::Full => {
let chol = a.cholesky(Side::Lower).map_err(|_| {
"cholesky failed while computing full ridge-aware logdet".to_string()
})?;
Ok(2.0 * chol.diag().mapv(f64::ln).sum())
}
RidgeDeterminantMode::Auto => Err(
"internal: resolved_ridge_determinant_mode must resolve Auto to a concrete mode"
.to_string(),
),
RidgeDeterminantMode::PositivePart => {
smooth_regularized_logdet_hessian_finite_check(&a, None)?;
// Smooth-regularized logdet objective, aligned with the gradient
// operator (`DenseSpectralOperator` in `Smooth` mode):
//
// log |A|_reg = Σ_j log r_ε(σ_j), r_ε(σ) = ½(σ + √(σ² + 4ε²))
//
// Every eigenvalue contributes; none are silently dropped. The
// regularizer r_ε is C∞, strictly positive for all real σ, and
// numerically agrees with plain log σ when σ ≫ ε. Negative
// eigenvalues contribute ≈ log(ε²/|σ|) (quadratic damping) so
// indefinite Hessians produce a finite, differentiable cost
// rather than a discontinuous positive-part pseudo-determinant.
//
// This matches exactly what the downstream
// `trace_logdet_gradient = Σ φ'(σ) u^T (dH/dρ) u` computes as the
// analytic gradient — eliminating the cost/gradient mismatch
// that previously broke BFGS line search on indefinite outer
// Hessians.
//
match crate::faer_ndarray::FaerEigh::eigh(&a, Side::Lower) {
Ok((evals, _)) => {
let eval_vec: Vec<f64> = evals
.as_slice()
.map(|sl| sl.to_vec())
.unwrap_or_else(|| evals.iter().copied().collect());
let eps = spectral_epsilon(&eval_vec)
.max(ridge.max(CUSTOM_FAMILY_CONDITION_RELATIVE_FLOOR));
let n_negative = eval_vec.iter().filter(|&&ev| ev < -eps).count();
if n_negative > 0 {
// Diagnostic only: indefiniteness is now handled
// correctly by the smooth regularizer, not ignored.
log::debug!(
"[SmoothRegularizedLogdet] Hessian has {n_negative} \
eigenvalue(s) below -eps={eps:.2e}; r_ε damps them \
smoothly instead of dropping them."
);
}
let logdet: f64 = eval_vec
.iter()
.map(|&sigma| spectral_regularize(sigma, eps).ln())
.sum();
Ok(logdet)
}
Err(eigh_err) => Err(CustomFamilyError::BasisDecompositionFailed {
reason: format!(
"smooth-regularized logdet eigendecomposition failed: {eigh_err}"
),
}
.into()),
}
}
}
}
/// Attempt a Cholesky factorization under a geometrically growing diagonal ridge.
///
/// Attempt `k` (counted from zero) factorizes a fresh copy of `matrix` whose
/// diagonal has been raised by `initial_boost * growth^k`. A successful
/// factorization is handed to `on_success` together with the attempt index
/// and the boost that was applied. The first `Some(r)` it returns ends the
/// search with `Some((r, boost, attempt))`. A failed factorization or a `None`
/// from `on_success` both lead to the next, larger ridge. After
/// `max_attempts` attempts without acceptance the result is `None`.
///
/// No separate ridge-free probe is performed: the first attempt always uses
/// `initial_boost`, which is only ridge-free when the caller passes `0.0`.
/// Callers wanting an unridged probe must arrange for that themselves.
fn try_cholesky_with_escalating_ridge<R>(
    matrix: &Array2<f64>,
    initial_boost: f64,
    max_attempts: usize,
    growth: f64,
    mut on_success: impl FnMut(&crate::faer_ndarray::FaerCholeskyFactor, usize, f64) -> Option<R>,
) -> Option<(R, f64, usize)> {
    let dim = matrix.nrows();
    let mut ridge = initial_boost;
    for attempt in 0..max_attempts {
        let mut boosted = matrix.clone();
        if ridge != 0.0 {
            for i in 0..dim {
                boosted[[i, i]] += ridge;
            }
        }
        if let Ok(factor) = boosted.cholesky(Side::Lower) {
            if let Some(accepted) = on_success(&factor, attempt, ridge) {
                return Some((accepted, ridge, attempt));
            }
        }
        ridge *= growth;
    }
    None
}
/// Cholesky-based rescue path for the penalty pseudo-logdet, used after the
/// eigendecomposition has failed.
///
/// A penalty matrix is a weighted sum of PSD penalties and therefore PSD, so
/// its ridged form is expected to be SPD. The factorization is retried through
/// `try_cholesky_with_escalating_ridge`, starting from
/// `1e-8 * max(1, max|diag|)` and growing tenfold for up to `MAX_ATTEMPTS`
/// attempts. An attempt is accepted only when its log-determinant is finite;
/// acceptance is logged at `warn` level.
///
/// `existing_ridge`, `block`, `p` and `eigh_err` are used only to build the
/// log and error messages.
///
/// # Errors
/// Returns `BasisDecompositionFailed` when no attempt yields a finite
/// log-determinant.
fn penalty_logdet_cholesky_fallback(
    s_ridged: &Array2<f64>,
    existing_ridge: f64,
    block: usize,
    p: usize,
    eigh_err: &str,
) -> Result<f64, String> {
    const MAX_ATTEMPTS: usize = 6;
    // Largest absolute diagonal entry, floored at 1.0.
    let diag_scale = s_ridged
        .diag()
        .iter()
        .fold(1.0_f64, |largest, &entry| largest.max(entry.abs()));
    let initial_boost = diag_scale * 1e-8;
    let outcome = try_cholesky_with_escalating_ridge(
        s_ridged,
        initial_boost,
        MAX_ATTEMPTS,
        10.0,
        |chol, attempt, boost| {
            let logdet = 2.0 * chol.diag().mapv(f64::ln).sum();
            if !logdet.is_finite() {
                return None;
            }
            log::warn!(
                "[PenaltyLogdetFallback] eigendecomposition failed for block {block} \
                 ({eigh_err}); using Cholesky with boosted ridge={:.2e} \
                 (attempt {}/{MAX_ATTEMPTS}, existing_ridge={:.2e}, p={p})",
                boost + existing_ridge,
                attempt + 1,
                existing_ridge,
            );
            Some(logdet)
        },
    );
    match outcome {
        Some((logdet, _, _)) => Ok(logdet),
        None => {
            // Kept in line with the historical message: the reported ridge is
            // the one a hypothetical (MAX_ATTEMPTS+1)-th attempt would have
            // used, i.e. initial_boost * 10^MAX_ATTEMPTS.
            let final_boost = initial_boost * 10.0_f64.powi(MAX_ATTEMPTS as i32);
            Err(CustomFamilyError::BasisDecompositionFailed {
                reason: format!(
                    "penalty logdet eigendecomposition failed for block {block} ({eigh_err}) and \
                     Cholesky fallback also failed after {MAX_ATTEMPTS} attempts \
                     (final ridge={:.2e}, p={p})",
                    final_boost + existing_ridge,
                ),
            }
            .into())
        }
    }
}
/// Resolve the determinant mode requested by `ridge_policy` to a concrete one.
///
/// `RidgeDeterminantMode::Auto` maps to `RidgeDeterminantMode::Full`; every
/// other mode is returned unchanged.
///
/// # Panics
/// Panics if `dim + 1` would overflow `usize`.
fn resolved_ridge_determinant_mode(ridge_policy: RidgePolicy, dim: usize) -> RidgeDeterminantMode {
    assert!(
        dim.checked_add(1).is_some(),
        "ridge determinant dimension overflow"
    );
    let requested = ridge_policy.determinant_mode;
    if matches!(requested, RidgeDeterminantMode::Auto) {
        RidgeDeterminantMode::Full
    } else {
        requested
    }
}
/// Invert a symmetrized copy of `matrix` via Cholesky, retrying with a ridge.
///
/// The schedule is: one unridged attempt, then — only when `max_retry > 0` —
/// up to `max_retry` attempts with ridge `baseridge * 10^k` for
/// `k = 0..max_retry`. Each inverse is obtained by solving against the
/// identity and is symmetrized before being returned.
///
/// # Errors
/// Returns `BasisDecompositionFailed` when every attempt fails to factorize.
fn inverse_spdwith_retry(
    matrix: &Array2<f64>,
    baseridge: f64,
    max_retry: usize,
) -> Result<Array2<f64>, String> {
    let mut sym = matrix.clone();
    symmetrize_dense_in_place(&mut sym);
    let dim = sym.nrows();
    // Any successful factorization is accepted; the callback never rejects.
    let invert_via_chol = |chol: &crate::faer_ndarray::FaerCholeskyFactor, _: usize, _: f64| {
        let mut inverse = Array2::<f64>::eye(dim);
        chol.solve_mat_in_place(&mut inverse);
        symmetrize_dense_in_place(&mut inverse);
        Some(inverse)
    };
    // Single attempt with a zero boost: the unridged probe.
    let unridged = try_cholesky_with_escalating_ridge(&sym, 0.0, 1, 1.0, invert_via_chol);
    let solved = match unridged {
        Some(hit) => Some(hit),
        // Escalation: `max_retry` attempts starting at `baseridge`, tenfold growth.
        None if max_retry > 0 => {
            try_cholesky_with_escalating_ridge(&sym, baseridge, max_retry, 10.0, invert_via_chol)
        }
        None => None,
    };
    match solved {
        Some((inverse, _, _)) => Ok(inverse),
        None => Err(CustomFamilyError::BasisDecompositionFailed {
            reason: "failed to invert SPD system after Cholesky ridge retries".to_string(),
        }
        .into()),
    }
}
/// Symmetrize a dense matrix in place.
///
/// Thin crate-visible wrapper that forwards to
/// `crate::linalg::matrix::symmetrize_in_place`; the exact symmetrization rule
/// is defined there.
// NOTE(review): presumably replaces `A` with `(A + Aᵀ) / 2` — confirm against
// `crate::linalg::matrix::symmetrize_in_place`.
pub(crate) fn symmetrize_dense_in_place(matrix: &mut Array2<f64>) {
crate::linalg::matrix::symmetrize_in_place(matrix);
}
/// Check that a flattened direction vector has exactly `expected` entries.
///
/// `context` is prefixed to the error message so the caller can be identified.
///
/// # Errors
/// Returns `DimensionMismatch` when `direction.len() != expected`.
fn validate_flat_direction_length(
    direction: &Array1<f64>,
    expected: usize,
    context: &str,
) -> Result<(), String> {
    if direction.len() == expected {
        return Ok(());
    }
    Err(CustomFamilyError::DimensionMismatch {
        reason: format!(
            "{context}: direction length mismatch: got {}, expected {expected}",
            direction.len()
        ),
    }
    .into())
}
/// Report whether a joint Hessian has any nonzero entry outside its diagonal
/// blocks.
///
/// The trait's default `exact_newton_joint_hessian` produces a strictly
/// block-diagonal matrix (per-block `Xᵀ W X` on the diagonal, zeros elsewhere).
/// A family that overrides it with the true coupled curvature of a multi-block
/// likelihood (GAMLSS μ-σ, Beta-logit `α`/`β`, Dirichlet K-block via the shared
/// concentration sum, …) necessarily populates off-diagonal blocks. That
/// structural fact — not any hand-set marker — is what separates a trusted
/// coupled joint Hessian from the block-diagonal default.
///
/// Block boundaries are the cumulative per-block `beta` lengths. A matrix
/// whose shape does not equal the summed widths yields `false`.
fn joint_hessian_has_cross_block_coupling(
    hessian: &Array2<f64>,
    block_states: &[ParameterBlockState],
) -> bool {
    let total: usize = block_states.iter().map(|state| state.beta.len()).sum();
    if hessian.nrows() != total || hessian.ncols() != total {
        // A malformed matrix is reported loudly by the symmetrizer/consumers.
        // This predicate only answers the coupling question and must not
        // claim coupling for it.
        return false;
    }
    let mut block_start = 0usize;
    for state in block_states {
        let block_end = block_start + state.beta.len();
        // Rows of this block against the columns of every later block
        // (`block_end..total`), checking both triangles of each pair.
        let coupled = (block_start..block_end).any(|i| {
            (block_end..total).any(|j| hessian[[i, j]] != 0.0 || hessian[[j, i]] != 0.0)
        });
        if coupled {
            return true;
        }
        block_start = block_end;
    }
    false
}
/// Assemble a block-diagonal joint Hessian from per-block exact-Newton Hessians.
///
/// The family is evaluated at `block_states`. If any working set is not
/// `BlockWorkingSet::ExactNewton` the result is `Ok(None)`. Otherwise each
/// block Hessian is densified and copied onto the diagonal of a zero matrix
/// whose side is the summed `beta` lengths.
///
/// # Errors
/// Returns `Err` when the evaluation fails, when the working-set count differs
/// from the block count, or when a block Hessian is not `p_block × p_block`.
fn exact_newton_joint_hessian_from_exact_blocks<F: CustomFamily + ?Sized>(
    family: &F,
    block_states: &[ParameterBlockState],
) -> Result<Option<Array2<f64>>, String> {
    let evaluation = family.evaluate(block_states)?;
    let working_sets = &evaluation.blockworking_sets;
    if working_sets.len() != block_states.len() {
        return Err(format!(
            "exact_newton_joint_hessian default: working-set count {} != block count {}",
            working_sets.len(),
            block_states.len()
        ));
    }
    let all_exact = working_sets
        .iter()
        .all(|working_set| matches!(working_set, BlockWorkingSet::ExactNewton { .. }));
    if !all_exact {
        return Ok(None);
    }
    let total: usize = block_states.iter().map(|state| state.beta.len()).sum();
    let mut joint = Array2::<f64>::zeros((total, total));
    let mut offset = 0usize;
    for (block_idx, (state, working_set)) in
        block_states.iter().zip(working_sets.iter()).enumerate()
    {
        let p_block = state.beta.len();
        let hessian = match working_set {
            BlockWorkingSet::ExactNewton { hessian, .. } => hessian,
            // Defensive: the `all_exact` filter above already excluded this.
            _ => {
                return Err(CustomFamilyError::DimensionMismatch {
                    reason: format!(
                        "exact_newton_joint_hessian default: block {block_idx} working set is not ExactNewton after filter"
                    ),
                }
                .into());
            }
        };
        let dense = hessian.to_dense();
        if dense.nrows() != p_block || dense.ncols() != p_block {
            return Err(CustomFamilyError::DimensionMismatch { reason: format!(
                "exact_newton_joint_hessian default: block {block_idx} Hessian shape {}x{} != expected {p_block}x{p_block}",
                dense.nrows(),
                dense.ncols()
            ) }.into());
        }
        let next = offset + p_block;
        joint
            .slice_mut(s![offset..next, offset..next])
            .assign(&dense);
        offset = next;
    }
    Ok(Some(joint))
}
/// Assemble a block-diagonal joint Hessian from the family's working sets,
/// using the block specs to handle diagonal working sets.
///
/// Per block, an `ExactNewton` working set contributes its densified Hessian.
/// A `Diagonal` working set contributes `spec.design.xt_diag_x_signed_op`
/// applied to its working weights. The joint matrix side is the summed design
/// column counts.
///
/// # Errors
/// Returns `Err` when the state, spec and working-set counts disagree, when
/// the evaluation or the weighted Gram product fails, when a block's `beta`
/// length differs from its design column count, or when a block Hessian is
/// not `p_block × p_block`.
fn exact_newton_joint_hessian_from_working_sets<F: CustomFamily + ?Sized>(
    family: &F,
    block_states: &[ParameterBlockState],
    specs: &[ParameterBlockSpec],
) -> Result<Option<Array2<f64>>, String> {
    if block_states.len() != specs.len() {
        return Err(format!(
            "exact_newton_joint_hessian_with_specs default: block state count {} != spec count {}",
            block_states.len(),
            specs.len()
        ));
    }
    let evaluation = family.evaluate(block_states)?;
    let working_sets = &evaluation.blockworking_sets;
    if working_sets.len() != block_states.len() {
        return Err(format!(
            "exact_newton_joint_hessian_with_specs default: working-set count {} != block count {}",
            working_sets.len(),
            block_states.len()
        ));
    }
    let total: usize = specs.iter().map(|spec| spec.design.ncols()).sum();
    let mut joint = Array2::<f64>::zeros((total, total));
    let mut offset = 0usize;
    let blocks = block_states.iter().zip(specs.iter()).zip(working_sets.iter());
    for (block_idx, ((state, spec), working_set)) in blocks.enumerate() {
        let p_block = spec.design.ncols();
        if state.beta.len() != p_block {
            return Err(CustomFamilyError::DimensionMismatch { reason: format!(
                "exact_newton_joint_hessian_with_specs default: block {block_idx} beta length {} != design cols {p_block}",
                state.beta.len()
            ) }.into());
        }
        let dense = match working_set {
            BlockWorkingSet::ExactNewton { hessian, .. } => hessian.to_dense(),
            BlockWorkingSet::Diagonal {
                working_weights, ..
            } => spec
                .design
                .xt_diag_x_signed_op(SignedWeightsView::from_array(working_weights))?,
        };
        if dense.nrows() != p_block || dense.ncols() != p_block {
            return Err(CustomFamilyError::DimensionMismatch { reason: format!(
                "exact_newton_joint_hessian_with_specs default: block {block_idx} Hessian shape {}x{} != expected {p_block}x{p_block}",
                dense.nrows(),
                dense.ncols()
            ) }.into());
        }
        let next = offset + p_block;
        joint
            .slice_mut(s![offset..next, offset..next])
            .assign(&dense);
        offset = next;
    }
    Ok(Some(joint))
}
/// Block-diagonal aggregator for the joint Hessian's first directional
/// derivative `dH[d_beta]`.
///
/// A family whose joint Hessian does not depend on beta yields an all-zero
/// matrix. Otherwise each block is asked for its own
/// `exact_newton_hessian_directional_derivative` along the matching slice of
/// `d_beta_flat`, and the results are placed on the joint diagonal. If any
/// block returns `None`, the whole result is `Ok(None)`.
///
/// # Errors
/// Returns `Err` when `d_beta_flat` does not have the summed `beta` length,
/// when a per-block callback fails, or when a block derivative is not
/// `p_block × p_block`.
fn exact_newton_joint_hessian_directional_derivative_from_blocks<F: CustomFamily + ?Sized>(
    family: &F,
    block_states: &[ParameterBlockState],
    d_beta_flat: &Array1<f64>,
) -> Result<Option<Array2<f64>>, String> {
    let total: usize = block_states.iter().map(|state| state.beta.len()).sum();
    validate_flat_direction_length(
        d_beta_flat,
        total,
        "exact_newton_joint_hessian_directional_derivative default",
    )?;
    let mut joint = Array2::<f64>::zeros((total, total));
    if !family.exact_newton_joint_hessian_beta_dependent() {
        // Beta-independent curvature: the derivative is identically zero.
        return Ok(Some(joint));
    }
    let mut offset = 0usize;
    for (block_idx, state) in block_states.iter().enumerate() {
        let p_block = state.beta.len();
        let next = offset + p_block;
        let d_beta_block = d_beta_flat.slice(s![offset..next]).to_owned();
        let local = match family.exact_newton_hessian_directional_derivative(
            block_states,
            block_idx,
            &d_beta_block,
        )? {
            Some(local) => local,
            None => return Ok(None),
        };
        if local.nrows() != p_block || local.ncols() != p_block {
            return Err(CustomFamilyError::DimensionMismatch { reason: format!(
                "exact_newton_joint_hessian_directional_derivative default: block {block_idx} dH shape {}x{} != expected {p_block}x{p_block}",
                local.nrows(),
                local.ncols()
            ) }.into());
        }
        joint
            .slice_mut(s![offset..next, offset..next])
            .assign(&local);
        offset = next;
    }
    Ok(Some(joint))
}
/// Block-diagonal aggregator for the joint Hessian's second directional
/// derivative `D²H[u, v]`.
///
/// Same shape as
/// `exact_newton_joint_hessian_directional_derivative_from_blocks`: a
/// beta-independent joint Hessian gives an all-zero matrix. Otherwise every
/// block is queried through
/// `exact_newton_hessian_second_directional_derivative` with its slices
/// `u_b`, `v_b`, and the per-block `D²H_b[u_b, v_b]` is written onto the
/// joint diagonal. A `None` from any block makes the whole result `Ok(None)`.
///
/// An earlier default returned `Some(zeros)` for beta-independent families
/// and `None` — no aggregation at all — for beta-dependent ones. That
/// silently discarded the per-block `d²H` overrides that families such as
/// `OneBlockQuarticExactFamily` supply for the outer Hessian's drift
/// contribution. Aggregating here keeps the second-derivative path parallel
/// to the first-derivative one, so outer REML sees the curvature term
/// whenever the per-block hook is implemented.
fn exact_newton_joint_hessiansecond_directional_derivative_from_blocks<F: CustomFamily + ?Sized>(
    family: &F,
    block_states: &[ParameterBlockState],
    d_beta_u_flat: &Array1<f64>,
    d_betav_flat: &Array1<f64>,
) -> Result<Option<Array2<f64>>, String> {
    let total: usize = block_states.iter().map(|state| state.beta.len()).sum();
    validate_flat_direction_length(d_beta_u_flat, total, "joint exact-newton d2H u")?;
    validate_flat_direction_length(d_betav_flat, total, "joint exact-newton d2H v")?;
    let mut joint = Array2::<f64>::zeros((total, total));
    if !family.exact_newton_joint_hessian_beta_dependent() {
        // Beta-independent curvature: the second derivative vanishes.
        return Ok(Some(joint));
    }
    let mut offset = 0usize;
    for (block_idx, state) in block_states.iter().enumerate() {
        let p_block = state.beta.len();
        let next = offset + p_block;
        let u_block = d_beta_u_flat.slice(s![offset..next]).to_owned();
        let v_block = d_betav_flat.slice(s![offset..next]).to_owned();
        let local = match family.exact_newton_hessian_second_directional_derivative(
            block_states,
            block_idx,
            &u_block,
            &v_block,
        )? {
            Some(local) => local,
            None => return Ok(None),
        };
        if local.nrows() != p_block || local.ncols() != p_block {
            return Err(CustomFamilyError::DimensionMismatch { reason: format!(
                "exact_newton_joint_hessiansecond_directional_derivative default: block {block_idx} d2H shape {}x{} != expected {p_block}x{p_block}",
                local.nrows(),
                local.ncols()
            ) }.into());
        }
        joint
            .slice_mut(s![offset..next, offset..next])
            .assign(&local);
        offset = next;
    }
    Ok(Some(joint))
}
/// Block-diagonal default for the joint Hessian directional derivative
/// `dH[d_beta]`, built from the family's working sets and the block specs.
///
/// * A family whose joint Hessian does not depend on beta yields zeros.
/// * An `ExactNewton` block delegates to
///   `exact_newton_hessian_directional_derivative`.
/// * A `Diagonal` block forms `d_eta = X·d_beta` (plus any geometry offset and
///   `dX·beta` terms), asks the family for the working-weight derivative `dw`,
///   and returns `Xᵀ diag(dw) X`. When the geometry also reports a design
///   derivative `dX`, the correction `dXᵀ W X + Xᵀ W dX` is added.
///
/// If any block cannot supply its derivative, the whole result is `Ok(None)`.
///
/// # Errors
/// Returns `Err` on count mismatches between states, specs and working sets,
/// on a wrong `d_beta_flat` length, on any failing family callback, and on
/// every shape inconsistency. That now includes a `beta` or working-weight
/// vector whose length does not match the design in the `dX` branch, which
/// previously panicked inside `dot` / `Zip` instead of returning an error.
fn exact_newton_joint_hessian_directional_derivative_from_working_sets<F: CustomFamily + ?Sized>(
    family: &F,
    block_states: &[ParameterBlockState],
    specs: &[ParameterBlockSpec],
    d_beta_flat: &Array1<f64>,
) -> Result<Option<Array2<f64>>, String> {
    if block_states.len() != specs.len() {
        return Err(format!(
            "exact_newton_joint_hessian_directional_derivative_with_specs default: block state count {} != spec count {}",
            block_states.len(),
            specs.len()
        ));
    }
    let total = specs.iter().map(|spec| spec.design.ncols()).sum::<usize>();
    validate_flat_direction_length(
        d_beta_flat,
        total,
        "exact_newton_joint_hessian_directional_derivative_with_specs default",
    )?;
    if !family.exact_newton_joint_hessian_beta_dependent() {
        return Ok(Some(Array2::zeros((total, total))));
    }
    let evaluation = family.evaluate(block_states)?;
    if evaluation.blockworking_sets.len() != block_states.len() {
        return Err(format!(
            "exact_newton_joint_hessian_directional_derivative_with_specs default: working-set count {} != block count {}",
            evaluation.blockworking_sets.len(),
            block_states.len()
        ));
    }
    let mut joint = Array2::<f64>::zeros((total, total));
    let mut start = 0usize;
    for (block_idx, ((state, spec), working_set)) in block_states
        .iter()
        .zip(specs.iter())
        .zip(evaluation.blockworking_sets.iter())
        .enumerate()
    {
        let p_block = spec.design.ncols();
        let end = start + p_block;
        let d_beta_block = d_beta_flat.slice(s![start..end]).to_owned();
        let local = match working_set {
            BlockWorkingSet::ExactNewton { .. } => family
                .exact_newton_hessian_directional_derivative(
                    block_states,
                    block_idx,
                    &d_beta_block,
                )?,
            BlockWorkingSet::Diagonal {
                working_weights, ..
            } => {
                let solver_design = spec.solver_design();
                let mut d_eta = solver_design.apply(&d_beta_block);
                let mut geometry_correction = Array2::<f64>::zeros((p_block, p_block));
                if let Some(geometry) = family.block_geometry_directional_derivative(
                    block_states,
                    block_idx,
                    spec,
                    &d_beta_block,
                )? {
                    if geometry.d_offset.len() != d_eta.len() {
                        return Err(CustomFamilyError::DimensionMismatch { reason: format!(
                            "exact_newton_joint_hessian_directional_derivative_with_specs default: block {block_idx} geometry offset derivative length {} != eta length {}",
                            geometry.d_offset.len(),
                            d_eta.len()
                        ) }.into());
                    }
                    d_eta += &geometry.d_offset;
                    if let Some(d_design) = geometry.d_design {
                        if d_design.nrows() != solver_design.nrows() || d_design.ncols() != p_block
                        {
                            return Err(CustomFamilyError::DimensionMismatch { reason: format!(
                                "exact_newton_joint_hessian_directional_derivative_with_specs default: block {block_idx} d_design shape {}x{} != expected {}x{}",
                                d_design.nrows(),
                                d_design.ncols(),
                                solver_design.nrows(),
                                p_block
                            ) }.into());
                        }
                        // `d_design.dot(&state.beta)` panics on incompatible
                        // shapes; report the mismatch as an error instead.
                        if state.beta.len() != p_block {
                            return Err(CustomFamilyError::DimensionMismatch { reason: format!(
                                "exact_newton_joint_hessian_directional_derivative_with_specs default: block {block_idx} beta length {} != design cols {p_block}",
                                state.beta.len()
                            ) }.into());
                        }
                        // The row-wise `Zip` below panics unless there is
                        // exactly one working weight per design row.
                        if working_weights.len() != solver_design.nrows() {
                            return Err(CustomFamilyError::DimensionMismatch { reason: format!(
                                "exact_newton_joint_hessian_directional_derivative_with_specs default: block {block_idx} working weight length {} != design rows {}",
                                working_weights.len(),
                                solver_design.nrows()
                            ) }.into());
                        }
                        d_eta += &d_design.dot(&state.beta);
                        let x_dense = solver_design.to_dense();
                        let mut weighted_x = x_dense.clone();
                        let mut weighted_dx = d_design.clone();
                        // Scale row i of X and dX by w_i, giving W·X and W·dX.
                        ndarray::Zip::from(weighted_x.rows_mut())
                            .and(weighted_dx.rows_mut())
                            .and(working_weights.view())
                            .for_each(|mut wx_row, mut wdx_row, &wi| {
                                wx_row.mapv_inplace(|value| value * wi);
                                wdx_row.mapv_inplace(|value| value * wi);
                            });
                        // dXᵀ W X + Xᵀ W dX
                        geometry_correction += &fast_atb(&d_design, &weighted_x);
                        geometry_correction += &fast_atb(&x_dense, &weighted_dx);
                    }
                }
                family
                    .diagonalworking_weights_directional_derivative(
                        block_states,
                        block_idx,
                        &d_eta,
                    )?
                    .map(|dw| {
                        let mut local = solver_design
                            .xt_diag_x_signed_op(SignedWeightsView::from_array(&dw))?;
                        local += &geometry_correction;
                        Ok::<Array2<f64>, String>(local)
                    })
                    .transpose()?
            }
        };
        let Some(local) = local else {
            return Ok(None);
        };
        if local.nrows() != p_block || local.ncols() != p_block {
            return Err(CustomFamilyError::DimensionMismatch { reason: format!(
                "exact_newton_joint_hessian_directional_derivative_with_specs default: block {block_idx} dH shape {}x{} != expected {p_block}x{p_block}",
                local.nrows(),
                local.ncols()
            ) }.into());
        }
        joint.slice_mut(s![start..end, start..end]).assign(&local);
        start = end;
    }
    Ok(Some(joint))
}
/// Fetch the family's joint Hessian (spec-aware variant), validate its shape
/// and return it symmetrized.
///
/// `Ok(None)` from the family is passed through unchanged.
///
/// # Errors
/// Returns `Err` when the family call fails or the matrix is not
/// `total × total`; `context` prefixes the shape message.
fn exact_newton_joint_hessian_symmetrized<F: CustomFamily + Clone + Send + Sync + 'static>(
    family: &F,
    states: &[ParameterBlockState],
    specs: &[ParameterBlockSpec],
    total: usize,
    context: &str,
) -> Result<Option<Array2<f64>>, String> {
    match family.exact_newton_joint_hessian_with_specs(states, specs)? {
        None => Ok(None),
        Some(h) if h.nrows() != total || h.ncols() != total => Err(format!(
            "{context}: got {}x{}, expected {}x{}",
            h.nrows(),
            h.ncols(),
            total,
            total
        )),
        Some(mut h) => {
            symmetrize_dense_in_place(&mut h);
            Ok(Some(h))
        }
    }
}
/// Scale-aware exact joint curvature payload for the outer REML evaluator.
///
/// Bundles a dense joint Hessian with two scalar adjustments that travel with
/// it.
pub struct ExactNewtonOuterCurvature {
/// Dense joint Hessian.
pub hessian: Array2<f64>,
// NOTE(review): presumably the factor by which ρ-derivative curvature terms
// are rescaled to match `hessian` — confirm against the outer evaluator.
pub rho_curvature_scale: f64,
// NOTE(review): presumably an additive correction applied to log|H| when
// `hessian` is a rescaled form of the true curvature — confirm.
pub hessian_logdet_correction: f64,
}
/// Where the joint Hessian comes from: an already materialized dense matrix,
/// or a set of callbacks that expose it as a linear operator.
enum JointHessianSource {
/// Fully materialized dense joint Hessian.
Dense(Array2<f64>),
/// Matrix-free representation backed by shared callbacks.
Operator {
/// Allocating matvec: maps an input vector to a freshly returned
/// `Array1<f64>`.
apply: Arc<dyn Fn(&Array1<f64>) -> Result<Array1<f64>, String> + Send + Sync>,
/// Write-into matvec used by the inner-Newton PCG hot path so the
/// matvec result no longer allocates an `Array1<f64>` per CG iter.
/// At large scale (~6400 inner CG iters per outer iter, p~200) this
/// removes thousands of small Vec<f64> allocations from the tightest
/// loop. Wired from `workspace.hessian_matvec_into`.
apply_into: Arc<dyn Fn(&Array1<f64>, &mut Array1<f64>) -> Result<(), String> + Send + Sync>,
/// Batched multi-RHS apply: `out = H · V` for `(total, n_rhs)` `V`.
/// Wired from `workspace.hessian_apply_mat`, which for the BMS tiled
/// row-primary Hessian sweeps each row tile once and applies its `Hᵢ`
/// to every column. Column-basis dense reconstruction below uses this
/// to materialise the operator in one batched sweep (`H = H · I`)
/// rather than `total` single-vector HVPs, each of which re-reads every
/// row tile. Numerically identical to looping `apply_into`.
apply_mat: Arc<dyn Fn(&Array2<f64>, &mut Array2<f64>) -> Result<(), String> + Send + Sync>,
/// Operator diagonal, as supplied by the workspace's
/// `hessian_diagonal()`.
diagonal: Array1<f64>,
/// Forced dense materialization that bypasses the workspace's
/// `hessian_dense` amortization gate. Returns `Some` when the
/// workspace can build dense via a structural direct path (e.g.
/// CTN's `scop_gradient_and_negative_hessian`), `None` when the
/// caller should fall back to column-basis HVP through `apply`.
dense_forced: Arc<dyn Fn() -> Result<Option<Array2<f64>>, String> + Send + Sync>,
},
}
/// Upper bound, in bytes (512 MiB), on a dense joint Hessian that may be
/// materialized.
const EXACT_JOINT_HESSIAN_DENSE_MAX_BYTES: usize = 512 * 1024 * 1024;

/// Byte size of a dense `total × total` matrix of `f64`.
///
/// # Errors
/// Returns `Err` when `total * total * size_of::<f64>()` overflows `usize`.
fn exact_joint_hessian_dense_bytes(total: usize) -> Result<usize, String> {
    let entries = total.checked_mul(total);
    let bytes = entries.and_then(|count| count.checked_mul(std::mem::size_of::<f64>()));
    match bytes {
        Some(bytes) => Ok(bytes),
        None => Err(format!("joint Hessian dense byte count overflow for dim={total}")),
    }
}
/// Refuse to materialize a dense `total × total` joint Hessian that would
/// exceed `EXACT_JOINT_HESSIAN_DENSE_MAX_BYTES`.
///
/// # Errors
/// Returns `Err` when the byte count overflows, and `UnsupportedConfiguration`
/// (message prefixed with `context`) when it is over the cap.
fn ensure_exact_joint_hessian_dense_budget(total: usize, context: &str) -> Result<(), String> {
    const BYTES_PER_GIB: f64 = 1024.0 * 1024.0 * 1024.0;
    let bytes = exact_joint_hessian_dense_bytes(total)?;
    if bytes <= EXACT_JOINT_HESSIAN_DENSE_MAX_BYTES {
        return Ok(());
    }
    Err(CustomFamilyError::UnsupportedConfiguration {
        reason: format!(
            "{context}: exact dense joint Hessian requires {:.2} GiB for dim={total}, \
             exceeding the {:.2} GiB cap; refusing approximate determinant algebra",
            bytes as f64 / BYTES_PER_GIB,
            EXACT_JOINT_HESSIAN_DENSE_MAX_BYTES as f64 / BYTES_PER_GIB,
        ),
    }
    .into())
}
/// Joint Hessian together with the derivative callbacks and scalar
/// adjustments that accompany it.
///
/// Each derivative callback exists in a borrowed, `'a`-bounded boxed form
/// and, optionally, in an owned `Arc` form with the same call signature.
struct JointHessianBundle<'a> {
/// The joint Hessian itself, dense or operator-backed.
source: JointHessianSource,
// NOTE(review): presumably the flattened coefficient vector at which the
// bundle was built — confirm against the constructor.
beta_flat: Array1<f64>,
/// First-derivative callback: takes one direction vector and returns an
/// optional `DriftDerivResult`.
compute_dh: Box<DriftDerivFn<'a>>,
/// Optional batched form of `compute_dh`: takes a slice of directions and
/// returns one optional result per direction.
compute_dh_many: Option<Box<DriftDerivManyFn<'a>>>,
/// Second-derivative callback: takes a pair of direction vectors and
/// returns an optional `DriftDerivResult`.
compute_d2h: Box<DriftSecondDerivFn<'a>>,
/// Optional batched second-derivative callback. The unified evaluator's
/// outer-Hessian ρ-ρ pair loop forwards the K(K+1)/2 (v_k, v_l) pairs
/// here in one call when set, so families that fuse the per-row D²H walk
/// (e.g. survival marginal-slope scanning n rows once per outer eval)
/// amortise the row-walk across all pairs instead of paying it per pair.
compute_d2h_many: Option<Box<DriftSecondDerivManyFn<'a>>>,
/// Owned (`Arc`) counterpart with the same call signature as `compute_dh`.
owned_compute_dh:
Option<Arc<dyn Fn(&Array1<f64>) -> Result<Option<DriftDerivResult>, String> + Send + Sync>>,
/// Owned (`Arc`) counterpart with the same call signature as
/// `compute_dh_many`.
owned_compute_dh_many: Option<
Arc<dyn Fn(&[Array1<f64>]) -> Result<Vec<Option<DriftDerivResult>>, String> + Send + Sync>,
>,
/// Owned (`Arc`) counterpart with the same call signature as `compute_d2h`.
owned_compute_d2h: Option<
Arc<
dyn Fn(&Array1<f64>, &Array1<f64>) -> Result<Option<DriftDerivResult>, String>
+ Send
+ Sync,
>,
>,
/// Owned twin of `compute_d2h_many`. Threaded through to
/// `OwnedJointDerivProvider` so the unified evaluator can share the
/// callback across rayon worker threads when the outer Hessian routes
/// through the parallel pair dispatch.
owned_compute_d2h_many: Option<
Arc<
dyn Fn(&[(Array1<f64>, Array1<f64>)]) -> Result<Vec<Option<DriftDerivResult>>, String>
+ Send
+ Sync,
>,
>,
// NOTE(review): presumably forwarded into
// `ExactNewtonOuterCurvature::rho_curvature_scale` — confirm.
rho_curvature_scale: f64,
// NOTE(review): presumably forwarded into
// `ExactNewtonOuterCurvature::hessian_logdet_correction` — confirm.
hessian_logdet_correction: f64,
}
/// Callback taking one direction vector and returning an optional
/// `DriftDerivResult`; thread-safe (`Send + Sync`) and bounded by `'a`.
type DriftDerivFn<'a> =
dyn Fn(&Array1<f64>) -> Result<Option<DriftDerivResult>, String> + Send + Sync + 'a;
/// Batched form of `DriftDerivFn`: takes a slice of directions and returns a
/// vector of optional results.
type DriftDerivManyFn<'a> =
dyn Fn(&[Array1<f64>]) -> Result<Vec<Option<DriftDerivResult>>, String> + Send + Sync + 'a;
/// Callback taking a pair of direction vectors and returning an optional
/// `DriftDerivResult`.
type DriftSecondDerivFn<'a> = dyn Fn(&Array1<f64>, &Array1<f64>) -> Result<Option<DriftDerivResult>, String>
+ Send
+ Sync
+ 'a;
/// Batched form of `DriftSecondDerivFn`: takes a slice of direction pairs and
/// returns a vector of optional results.
type DriftSecondDerivManyFn<'a> = dyn Fn(&[(Array1<f64>, Array1<f64>)]) -> Result<Vec<Option<DriftDerivResult>>, String>
+ Send
+ Sync
+ 'a;
/// Produce a dense `total × total` matrix from a `JointHessianSource`.
///
/// A `Dense` source is cloned as-is. An `Operator` source is first checked
/// against the dense memory budget, then materialized through `dense_forced`
/// when that yields a matrix, and otherwise by applying the operator to the
/// identity in one batched `apply_mat` call. Operator-derived matrices are
/// validated for finiteness and symmetrized before being returned.
///
/// # Errors
/// Returns `Err` when the budget is exceeded, when a callback fails, when
/// `dense_forced` returns a wrongly shaped matrix, or when a materialized
/// matrix contains non-finite values.
fn materialize_joint_hessian_source(
    source: &JointHessianSource,
    total: usize,
    context: &str,
) -> Result<Array2<f64>, String> {
    let (apply_mat, dense_forced) = match source {
        JointHessianSource::Dense(matrix) => return Ok(matrix.clone()),
        JointHessianSource::Operator {
            apply_mat,
            dense_forced,
            ..
        } => (apply_mat, dense_forced),
    };
    ensure_exact_joint_hessian_dense_budget(total, context)?;
    // First choice: a structural direct-dense build exposed by the workspace
    // (e.g. SCOP's `scop_gradient_and_negative_hessian`). It is `Θ(n·p²)`
    // just like column-basis HVP, but with a far better constant: the rows
    // are swept once and the chain-rule pullback runs through BLAS-3,
    // whereas column-basis HVP re-walks all `n` rows for every column.
    if let Some(mut forced) = dense_forced()? {
        if forced.nrows() != total || forced.ncols() != total {
            return Err(CustomFamilyError::DimensionMismatch { reason: format!(
                "{context}: dense_forced shape mismatch: got {}x{}, expected {total}x{total}",
                forced.nrows(),
                forced.ncols()
            ) }.into());
        }
        if !forced.iter().all(|value| value.is_finite()) {
            return Err(CustomFamilyError::NumericalFailure {
                reason: format!("{context}: dense_forced returned non-finite values"),
            }
            .into());
        }
        symmetrize_dense_in_place(&mut forced);
        return Ok(forced);
    }
    // Fallback: reconstruct `H = H · I` column by column. Going through the
    // batched multi-RHS apply lets a tiled/streamed operator visit each row
    // tile once for all `total` columns rather than once per column. Each
    // column equals the operator applied to the matching unit basis vector.
    let identity = Array2::<f64>::eye(total);
    let mut reconstructed = Array2::<f64>::zeros((total, total));
    apply_mat(&identity, &mut reconstructed)?;
    if !reconstructed.iter().all(|value| value.is_finite()) {
        return Err(CustomFamilyError::NumericalFailure {
            reason: format!("{context}: operator matvec returned non-finite values"),
        }
        .into());
    }
    symmetrize_dense_in_place(&mut reconstructed);
    Ok(reconstructed)
}
/// Pick a `JointHessianSource` for a workspace according to its stated
/// preference for the given materialization intent.
///
/// When the workspace prefers the operator form for `intent`, the operator
/// source is built directly. Otherwise a dense Hessian is requested; if the
/// workspace supplies one it is shape-checked, finiteness-checked,
/// symmetrized and returned as `Dense`. If it supplies none, the operator
/// source is used as the fallback.
///
/// # Errors
/// Returns `Err` when a workspace call fails, when the dense Hessian is not
/// `total × total`, or when it contains non-finite values.
fn exact_newton_joint_hessian_source_from_workspace(
    workspace: &Arc<dyn ExactNewtonJointHessianWorkspace>,
    total: usize,
    intent: MaterializationIntent,
    context: &str,
) -> Result<Option<JointHessianSource>, String> {
    let prefers_operator = workspace.hessian_source_preference_for_intent(intent)
        == JointHessianSourcePreference::Operator;
    if !prefers_operator {
        if let Some(mut dense) = workspace.hessian_dense()? {
            if dense.nrows() != total || dense.ncols() != total {
                return Err(CustomFamilyError::DimensionMismatch {
                    reason: format!(
                        "{context}: dense Hessian shape mismatch: got {}x{}, expected {total}x{total}",
                        dense.nrows(),
                        dense.ncols()
                    ),
                }
                .into());
            }
            if !dense.iter().all(|value| value.is_finite()) {
                return Err(CustomFamilyError::NumericalFailure {
                    reason: format!("{context}: dense Hessian contains non-finite values"),
                }
                .into());
            }
            symmetrize_dense_in_place(&mut dense);
            return Ok(Some(JointHessianSource::Dense(dense)));
        }
    }
    exact_newton_joint_hessian_operator_source_from_workspace(workspace, total, intent, context)
}
/// Wraps an exact-Newton joint Hessian workspace as a matrix-free
/// `JointHessianSource::Operator`, validating everything that crosses the
/// workspace boundary.
///
/// * `workspace` — provider of the Hessian diagonal, matvec, batched apply and
///   forced-dense materialization.
/// * `total` — expected dimension of the joint coefficient vector; every
///   vector/matrix exchanged with the workspace is checked against it.
/// * `intent` — forwarded to `hessian_source_preference_for_intent` to decide
///   whether a missing capability is an error or a soft "not available".
/// * `context` — prefix used in most error messages produced here.
///
/// Returns `Ok(None)` when the workspace has no diagonal or no matvec and the
/// workspace does not prefer the operator source for this intent; returns an
/// error in the same situations when it does prefer the operator source.
///
/// # Errors
/// Dimension mismatches and non-finite values (in the diagonal up front, and
/// in every closure result at call time) are reported as `CustomFamilyError`
/// variants converted into the `String` error type.
fn exact_newton_joint_hessian_operator_source_from_workspace(
workspace: &Arc<dyn ExactNewtonJointHessianWorkspace>,
total: usize,
intent: MaterializationIntent,
context: &str,
) -> Result<Option<JointHessianSource>, String> {
// Capability 1: the diagonal. Its absence is fatal only when the workspace
// declares the operator source as its preference for this intent.
let Some(diagonal) = workspace.hessian_diagonal()? else {
if workspace.hessian_source_preference_for_intent(intent)
== JointHessianSourcePreference::Operator
{
return Err(CustomFamilyError::UnsupportedConfiguration {
reason: format!(
"{context}: operator-preferred Hessian workspace did not provide a diagonal"
),
}
.into());
}
return Ok(None);
};
// The diagonal is validated once here; it is stored as-is in the source.
if diagonal.len() != total {
return Err(CustomFamilyError::DimensionMismatch {
reason: format!(
"{context}: operator diagonal length mismatch: got {}, expected {}",
diagonal.len(),
total
),
}
.into());
}
if diagonal.iter().any(|value| !value.is_finite()) {
return Err(CustomFamilyError::NumericalFailure {
reason: format!("{context}: operator diagonal contains non-finite values"),
}
.into());
}
// Capability 2: Hessian-vector products, with the same hard/soft split as
// for the diagonal.
if !workspace.hessian_matvec_available() {
if workspace.hessian_source_preference_for_intent(intent)
== JointHessianSourcePreference::Operator
{
return Err(CustomFamilyError::UnsupportedConfiguration {
reason: format!(
"{context}: operator-preferred Hessian workspace did not provide HVPs"
),
}
.into());
}
return Ok(None);
}
// Each closure below is `move` and needs its own handle on the workspace
// and on the context string; `Arc<str>` lets the four closures share one
// allocation of the context text.
let workspace_apply = Arc::clone(workspace);
let workspace_apply_into = Arc::clone(workspace);
let workspace_apply_mat = Arc::clone(workspace);
let workspace_dense_forced = Arc::clone(workspace);
let context_apply: Arc<str> = Arc::from(context);
let context_apply_into = Arc::clone(&context_apply);
let context_apply_mat = Arc::clone(&context_apply);
let context_dense_forced = Arc::clone(&context_apply);
Ok(Some(JointHessianSource::Operator {
// Allocating matvec: checks the input length, then the length and
// finiteness of the workspace's result.
apply: Arc::new(move |v: &Array1<f64>| {
if v.len() != total {
return Err(CustomFamilyError::DimensionMismatch {
reason: format!(
"{}: operator input length mismatch: got {}, expected {total}",
&*context_apply,
v.len()
),
}
.into());
}
// `None` here means the workspace declined the matvec even though it
// advertised availability above; that is surfaced as an error.
let Some(out) = workspace_apply.hessian_matvec(v)? else {
return Err(CustomFamilyError::UnsupportedConfiguration {
reason: "joint exact-newton operator matvec unavailable".to_string(),
}
.into());
};
if out.len() != total {
return Err(CustomFamilyError::DimensionMismatch {
reason: format!(
"{}: operator matvec length mismatch: got {}, expected {total}",
&*context_apply,
out.len()
),
}
.into());
}
if out.iter().any(|value| !value.is_finite()) {
return Err(CustomFamilyError::NumericalFailure {
reason: format!(
"{}: operator matvec returned non-finite values",
&*context_apply
),
}
.into());
}
Ok(out)
}),
// In-place matvec: both buffers must already have length `total`; the
// output buffer is checked for finiteness after the workspace fills it.
apply_into: Arc::new(move |v: &Array1<f64>, out: &mut Array1<f64>| {
if v.len() != total || out.len() != total {
return Err(CustomFamilyError::DimensionMismatch {
reason: format!(
"{}: operator input/output length mismatch: v={} out={} expected={total}",
&*context_apply_into,
v.len(),
out.len()
),
}
.into());
}
// A `false` return means the workspace did not perform the matvec.
if !workspace_apply_into.hessian_matvec_into(v, out)? {
return Err(CustomFamilyError::UnsupportedConfiguration {
reason: "joint exact-newton operator matvec unavailable".to_string(),
}
.into());
}
if out.iter().any(|value| !value.is_finite()) {
return Err(CustomFamilyError::NumericalFailure {
reason: format!(
"{}: operator matvec returned non-finite values",
&*context_apply_into
),
}
.into());
}
Ok(())
}),
// Batched apply: `v_cols` and `out` must both have `total` rows and the
// same number of columns.
apply_mat: Arc::new(move |v_cols: &Array2<f64>, out: &mut Array2<f64>| {
if v_cols.nrows() != total || out.nrows() != total {
return Err(CustomFamilyError::DimensionMismatch {
reason: format!(
"{}: operator batched apply row mismatch: v_cols={}x{} out={}x{} expected rows={total}",
&*context_apply_mat,
v_cols.nrows(),
v_cols.ncols(),
out.nrows(),
out.ncols()
),
}
.into());
}
if v_cols.ncols() != out.ncols() {
return Err(CustomFamilyError::DimensionMismatch {
reason: format!(
"{}: operator batched apply column mismatch: v_cols has {} columns, out has {}",
&*context_apply_mat,
v_cols.ncols(),
out.ncols()
),
}
.into());
}
// A `false` return means the workspace did not perform the batched apply.
if !workspace_apply_mat.hessian_apply_mat(v_cols, out)? {
return Err(CustomFamilyError::UnsupportedConfiguration {
reason: "joint exact-newton operator batched apply unavailable".to_string(),
}
.into());
}
if out.iter().any(|value| !value.is_finite()) {
return Err(CustomFamilyError::NumericalFailure {
reason: format!(
"{}: operator batched apply returned non-finite values",
&*context_apply_mat
),
}
.into());
}
Ok(())
}),
diagonal,
// Forced dense materialization: passes `None` through unchanged;
// a returned matrix is shape/finite-checked and then symmetrized.
dense_forced: Arc::new(move || -> Result<Option<Array2<f64>>, String> {
match workspace_dense_forced.hessian_dense_forced()? {
Some(mut matrix) => {
if matrix.nrows() != total || matrix.ncols() != total {
return Err(CustomFamilyError::DimensionMismatch { reason: format!(
"{}: hessian_dense_forced shape mismatch: got {}x{}, expected {total}x{total}",
&*context_dense_forced,
matrix.nrows(),
matrix.ncols()
) }.into());
}
if matrix.iter().any(|value| !value.is_finite()) {
return Err(CustomFamilyError::NumericalFailure {
reason: format!(
"{}: hessian_dense_forced returned non-finite values",
&*context_dense_forced
),
}
.into());
}
symmetrize_dense_in_place(&mut matrix);
Ok(Some(matrix))
}
None => Ok(None),
}
}),
}))
}
/// Validates that `matrix` is a finite `expected × expected` square and
/// returns it after in-place symmetrization.
///
/// # Errors
/// * a plain formatted message when the shape is not `expected × expected`;
/// * a `CustomFamilyError::NumericalFailure` (converted to `String`) when any
///   entry is NaN or infinite.
///
/// The shape check runs before the finiteness scan, so a wrongly-shaped
/// matrix is always reported as a shape problem.
fn symmetrized_square_matrix(
    mut matrix: Array2<f64>,
    expected: usize,
    context: &str,
) -> Result<Array2<f64>, String> {
    let (rows, cols) = matrix.dim();
    if (rows, cols) != (expected, expected) {
        return Err(format!(
            "{context}: got {}x{}, expected {}x{}",
            rows, cols, expected, expected
        ));
    }

    let all_finite = matrix.iter().all(|entry| entry.is_finite());
    if !all_finite {
        let failure = CustomFamilyError::NumericalFailure {
            reason: format!("{context}: matrix contains non-finite values"),
        };
        return Err(failure.into());
    }

    symmetrize_dense_in_place(&mut matrix);
    Ok(matrix)
}
/// Try exact Newton joint Hessian first, then surrogate. Returns `None` if
/// neither path provides a joint Hessian. When successful, returns the joint
/// Hessian source, flat beta, and boxed closures for computing directional
/// derivatives dH[v] and d²H[u,v].
///
/// This eliminates the previously duplicated exact-Newton and surrogate
/// code blocks in `outerobjectivegradienthessian_internal`.
fn build_joint_hessian_closures<'a, F: CustomFamily + Clone + Send + Sync + 'static>(
family: &'a F,
block_states: &'a [ParameterBlockState],
specs: &'a [ParameterBlockSpec],
total: usize,
options: &BlockwiseFitOptions,
preferred_workspace: Option<Arc<dyn ExactNewtonJointHessianWorkspace>>,
) -> Result<Option<JointHessianBundle<'a>>, String> {
// Path 1: exact Newton joint Hessian (preferred).
let beta_flat = flatten_state_betas(block_states, specs);
let synced = Arc::new(synchronized_states_from_flat_beta(
family,
specs,
block_states,
&beta_flat,
)?);
let hessian_workspace = match preferred_workspace {
Some(workspace) => Some(workspace),
None => family.exact_newton_joint_hessian_workspace_with_options(
synced.as_ref(),
specs,
options,
)?,
};
// Outer-eval entry: prime any per-row jet caches the workspace will hand
// to the directional-derivative path. Runs at top-level rayon (we are
// outside the ext-coord `par_iter` here), so the cache build's own
// `par_iter` enjoys full thread-pool parallelism. PIRLS-side workspace
// construction skips this priming because PIRLS never invokes
// `directional_derivative_operator`.
if let Some(workspace) = hessian_workspace.as_ref() {
workspace.warm_up_outer_caches()?;
}
if let Some(curvature) = family.exact_newton_outer_curvature(block_states)? {
let h_joint_unpen = JointHessianSource::Dense(symmetrized_square_matrix(
curvature.hessian,
total,
"joint exact-newton Hessian shape mismatch in outer gradient (rescaled)",
)?);
let compute_dh = Box::new(exact_newton_dh_closure(
family,
Arc::clone(&synced),
specs,
total,
true,
1.0,
hessian_workspace.clone(),
));
let compute_dh_many = None;
let compute_d2h = Box::new(exact_newton_d2h_closure(
family,
Arc::clone(&synced),
specs,
total,
true,
1.0,
hessian_workspace.clone(),
));
let owned_compute_dh = exact_newton_dh_closure_owned(
family.clone(),
Arc::clone(&synced),
specs.to_vec(),
total,
true,
1.0,
hessian_workspace.clone(),
);
let owned_compute_dh_many = None;
let owned_compute_d2h = exact_newton_d2h_closure_owned(
family.clone(),
Arc::clone(&synced),
specs.to_vec(),
total,
true,
1.0,
hessian_workspace.clone(),
);
return Ok(Some(JointHessianBundle {
source: h_joint_unpen,
beta_flat,
compute_dh,
compute_dh_many,
compute_d2h,
compute_d2h_many: None,
owned_compute_dh: Some(owned_compute_dh),
owned_compute_dh_many,
owned_compute_d2h: Some(owned_compute_d2h),
owned_compute_d2h_many: None,
rho_curvature_scale: curvature.rho_curvature_scale,
hessian_logdet_correction: curvature.hessian_logdet_correction,
}));
}
let exact_joint_source = if let Some(workspace) = hessian_workspace.as_ref() {
exact_newton_joint_hessian_source_from_workspace(
workspace,
total,
MaterializationIntent::OuterGradient,
"joint exact-newton operator mismatch in outer gradient",
)?
} else {
None
};
let exact_joint_source = match exact_joint_source {
Some(source) => Some(source),
None => exact_newton_joint_hessian_symmetrized(
family,
block_states,
specs,
total,
"joint exact-newton Hessian shape mismatch in outer gradient",
)
.map(|source| source.map(JointHessianSource::Dense))?,
};
if let Some(h_joint_unpen) = exact_joint_source {
let compute_dh = Box::new(exact_newton_dh_closure(
family,
Arc::clone(&synced),
specs,
total,
false,
1.0,
hessian_workspace.clone(),
));
let compute_dh_many = exact_newton_dh_many_closure(1.0, hessian_workspace.clone());
let compute_d2h = Box::new(exact_newton_d2h_closure(
family,
Arc::clone(&synced),
specs,
total,
false,
1.0,
hessian_workspace.clone(),
));
let owned_compute_dh = exact_newton_dh_closure_owned(
family.clone(),
Arc::clone(&synced),
specs.to_vec(),
total,
false,
1.0,
hessian_workspace.clone(),
);
let owned_compute_dh_many =
exact_newton_dh_many_closure_owned(1.0, hessian_workspace.clone());
let owned_compute_d2h = exact_newton_d2h_closure_owned(
family.clone(),
Arc::clone(&synced),
specs.to_vec(),
total,
false,
1.0,
hessian_workspace.clone(),
);
let compute_d2h_many = exact_newton_d2h_many_closure(1.0, hessian_workspace.clone());
let owned_compute_d2h_many =
exact_newton_d2h_many_closure_owned(1.0, hessian_workspace.clone());
return Ok(Some(JointHessianBundle {
source: h_joint_unpen,
beta_flat,
compute_dh,
compute_dh_many,
compute_d2h,
compute_d2h_many,
owned_compute_dh: Some(owned_compute_dh),
owned_compute_dh_many,
owned_compute_d2h: Some(owned_compute_d2h),
owned_compute_d2h_many,
rho_curvature_scale: 1.0,
hessian_logdet_correction: 0.0,
}));
}
// Path 2: surrogate joint Hessian (fallback).
if let Some(h_joint_unpen) = family
.joint_outer_hyper_surrogate_hessian_with_specs(block_states, specs)?
.map(|h| {
symmetrized_square_matrix(
h,
total,
"joint outer-hyper surrogate Hessian shape mismatch",
)
})
.transpose()?
{
let beta_flat = flatten_state_betas(block_states, specs);
let compute_dh = Box::new(
move |v_k: &Array1<f64>| -> Result<Option<DriftDerivResult>, String> {
let h_rho = family
.joint_outer_hyper_surrogate_hessian_directional_derivative_with_specs(
block_states,
specs,
v_k,
)?;
match h_rho {
Some(h) => Ok(Some(DriftDerivResult::Dense(symmetrized_square_matrix(
h,
total,
"joint surrogate dH shape mismatch",
)?))),
None => Err(CustomFamilyError::UnsupportedConfiguration {
reason: "joint surrogate dH unavailable for analytic outer gradient"
.to_string(),
}
.into()),
}
},
);
let compute_d2h = Box::new(
move |u: &Array1<f64>, v: &Array1<f64>| -> Result<Option<DriftDerivResult>, String> {
match family
.joint_outer_hyper_surrogate_hessian_second_directional_derivative_with_specs(
block_states,
specs,
u,
v,
)? {
Some(m) => Ok(Some(DriftDerivResult::Dense(symmetrized_square_matrix(
m,
total,
"joint surrogate d2H shape mismatch",
)?))),
None => Ok(None),
}
},
);
let family_owned = family.clone();
let states_owned = block_states.to_vec();
let specs_owned = specs.to_vec();
let owned_compute_dh = Arc::new(
move |v_k: &Array1<f64>| -> Result<Option<DriftDerivResult>, String> {
match family_owned
.joint_outer_hyper_surrogate_hessian_directional_derivative_with_specs(
&states_owned,
&specs_owned,
v_k,
)? {
Some(h) => Ok(Some(DriftDerivResult::Dense(symmetrized_square_matrix(
h,
total,
"joint surrogate dH shape mismatch",
)?))),
None => Err(CustomFamilyError::UnsupportedConfiguration {
reason: "joint surrogate dH unavailable for analytic outer gradient"
.to_string(),
}
.into()),
}
},
);
let family_owned = family.clone();
let states_owned = block_states.to_vec();
let specs_owned = specs.to_vec();
let owned_compute_d2h = Arc::new(
move |u: &Array1<f64>, v: &Array1<f64>| -> Result<Option<DriftDerivResult>, String> {
match family_owned
.joint_outer_hyper_surrogate_hessian_second_directional_derivative_with_specs(
&states_owned,
&specs_owned,
u,
v,
)? {
Some(m) => Ok(Some(DriftDerivResult::Dense(symmetrized_square_matrix(
m,
total,
"joint surrogate d2H shape mismatch",
)?))),
None => Ok(None),
}
},
);
return Ok(Some(JointHessianBundle {
source: JointHessianSource::Dense(h_joint_unpen),
beta_flat,
compute_dh,
compute_dh_many: None,
compute_d2h,
compute_d2h_many: None,
owned_compute_dh: Some(owned_compute_dh),
owned_compute_dh_many: None,
owned_compute_d2h: Some(owned_compute_d2h),
owned_compute_d2h_many: None,
rho_curvature_scale: 1.0,
hessian_logdet_correction: 0.0,
}));
}
Ok(None)
}
/// Turns a raw dense dH[v] matrix into the `DriftDerivResult::Dense` handed
/// back to the outer derivative machinery: validate, symmetrize, then scale.
///
/// * `h` — raw directional-derivative matrix; must be `total × total`.
/// * `scale` — multiplier applied after symmetrization (skipped when exactly
///   `1.0`).
/// * `check_finite` — when `true`, non-finite entries are rejected up front
///   with the dedicated "dH returned non-finite values" message, before the
///   shape is inspected. When `false` that early scan is skipped.
///
/// Note that `symmetrized_square_matrix` performs its own shape and
/// finiteness validation regardless of `check_finite`, so non-finite input is
/// rejected either way; the flag only decides which check fires first and
/// therefore which message the caller sees. The borrowed factory
/// (`exact_newton_dh_closure`) passes `true`, the owned factory
/// (`exact_newton_dh_closure_owned`) passes `false`.
///
/// Always returns `Ok(Some(_))` on success.
fn finalize_dh_dense(
    h: Array2<f64>,
    total: usize,
    scale: f64,
    check_finite: bool,
) -> Result<Option<DriftDerivResult>, String> {
    if check_finite {
        let all_finite = h.iter().all(|entry| entry.is_finite());
        if !all_finite {
            let failure = CustomFamilyError::NumericalFailure {
                reason: "joint exact-newton dH returned non-finite values".to_string(),
            };
            return Err(failure.into());
        }
    }

    let mut symmetric =
        symmetrized_square_matrix(h, total, "joint exact-newton dH shape mismatch")?;
    let needs_scaling = scale != 1.0;
    if needs_scaling {
        symmetric *= scale;
    }
    Ok(Some(DriftDerivResult::Dense(symmetric)))
}
/// Single source of truth for the dH[v] three-way dispatch shared by the
/// borrowed (`exact_newton_dh_closure`) and owned
/// (`exact_newton_dh_closure_owned`) closure factories.
///
/// Dispatch order:
/// 1. `use_outer_curvature_derivatives` — ask the family for the
///    outer-curvature directional derivative; a `None` answer is an error.
/// 2. Workspace fast path — if a workspace is supplied and returns a
///    directional-derivative operator, wrap it (scaled) and return.
/// 3. Fallback — the family's joint-Hessian directional derivative; a `None`
///    answer is an error.
///
/// `check_finite` is forwarded to `finalize_dh_dense` for the dense results of
/// steps 1 and 3. It only controls whether the dedicated early non-finite
/// check runs; `symmetrized_square_matrix` still validates finiteness inside
/// `finalize_dh_dense`, so non-finite dense output is rejected on both the
/// borrowed (`true`) and owned (`false`) paths, with different messages.
///
/// `scale` multiplies every returned result (dense or operator).
fn exact_newton_dh_apply<F: CustomFamily + Sync>(
    family: &F,
    synced_states: &[ParameterBlockState],
    specs: &[ParameterBlockSpec],
    total: usize,
    use_outer_curvature_derivatives: bool,
    scale: f64,
    workspace: Option<&Arc<dyn ExactNewtonJointHessianWorkspace>>,
    check_finite: bool,
    v_k: &Array1<f64>,
) -> Result<Option<DriftDerivResult>, String> {
    // `v_k` is ALREADY the perturbation direction `δβ` the caller wants the
    // directional Hessian derivative evaluated along. The `HessianDerivativeProvider`s
    // (`BorrowedJointDerivProvider`/`OwnedJointDerivProvider`) own the implicit-
    // function-theorem sign `δβ = −H⁻¹(A_k β̂)` and negate before calling this
    // closure (matching `exact_newton_d2h_apply` and the owned `_many` closure,
    // which also pass the direction straight through). Re-negating here would
    // double-negate `D_β H[δβ]`, flipping the mode-response drift in the outer
    // LAML trace `½ tr(K · (B_i + D_β H[δβ_i]))` and desynchronising the analytic
    // outer gradient from its objective for every β-dependent-Hessian exact
    // family (spatial-adaptive, survival/bernoulli marginal-slope).
    //
    // Because the direction is used unchanged, it is passed by reference
    // as-is; no per-call copy of the vector is made.
    if use_outer_curvature_derivatives {
        let h_rho = family.exact_newton_outer_curvature_directional_derivative_with_specs(
            synced_states,
            specs,
            v_k,
        )?;
        return match h_rho {
            Some(h) => finalize_dh_dense(h, total, scale, check_finite),
            None => Err(CustomFamilyError::UnsupportedConfiguration {
                reason: "joint exact-newton dH unavailable for analytic outer gradient".to_string(),
            }
            .into()),
        };
    }
    // Matrix-free fast path: prefer the workspace's operator when it has one.
    if let Some(workspace) = workspace
        && let Some(operator) = workspace.directional_derivative_operator(v_k)?
    {
        return Ok(Some(scale_drift_deriv_result(
            DriftDerivResult::Operator(operator),
            scale,
        )));
    }
    // Dense fallback through the family's joint-Hessian derivative.
    match family.exact_newton_joint_hessian_directional_derivative_with_specs(
        synced_states,
        specs,
        v_k,
    )? {
        Some(h) => finalize_dh_dense(h, total, scale, check_finite),
        None => Err(CustomFamilyError::UnsupportedConfiguration {
            reason: "joint exact-newton dH unavailable for analytic outer gradient".to_string(),
        }
        .into()),
    }
}
/// Borrowed-flavour factory for the dH[v] closure.
///
/// The returned closure borrows `family` and `specs` for `'a`, owns the
/// shared `synced_states` and optional `workspace`, and forwards every call
/// to `exact_newton_dh_apply` with the early non-finite check enabled.
fn exact_newton_dh_closure<'a, F: CustomFamily + Sync>(
    family: &'a F,
    synced_states: Arc<Vec<ParameterBlockState>>,
    specs: &'a [ParameterBlockSpec],
    total: usize,
    use_outer_curvature_derivatives: bool,
    scale: f64,
    workspace: Option<Arc<dyn ExactNewtonJointHessianWorkspace>>,
) -> impl Fn(&Array1<f64>) -> Result<Option<DriftDerivResult>, String> + Send + Sync + 'a {
    // The borrowed variant always requests the explicit finiteness guard.
    const CHECK_FINITE: bool = true;
    move |direction: &Array1<f64>| {
        exact_newton_dh_apply(
            family,
            synced_states.as_slice(),
            specs,
            total,
            use_outer_curvature_derivatives,
            scale,
            workspace.as_ref(),
            CHECK_FINITE,
            direction,
        )
    }
}
/// Borrowed-flavour factory for the batched dH closure.
///
/// Returns `None` when no workspace is available. Otherwise the closure asks
/// the workspace for one directional-derivative operator per direction and
/// wraps each operator that is present as a scaled
/// `DriftDerivResult::Operator`; absent entries stay `None`.
fn exact_newton_dh_many_closure<'a>(
    scale: f64,
    workspace: Option<Arc<dyn ExactNewtonJointHessianWorkspace>>,
) -> Option<Box<DriftDerivManyFn<'a>>> {
    let workspace = workspace?;
    Some(Box::new(move |directions: &[Array1<f64>]| {
        // `directions` are already the perturbation directions `δβ`; the provider
        // owns the IFT sign and pre-negates (see `exact_newton_dh_apply`). The
        // owned `_many` counterpart passes them straight through, so this borrowed
        // path must too — re-negating here double-flips the mode-response drift.
        let operators = workspace.directional_derivative_operators(directions)?;
        let mut results = Vec::new();
        for maybe_operator in operators {
            let wrapped = maybe_operator.map(|operator| {
                scale_drift_deriv_result(DriftDerivResult::Operator(operator), scale)
            });
            results.push(wrapped);
        }
        Ok(results)
    }))
}
/// Single source of truth for the d²H[u,v] three-way dispatch shared by the
/// borrowed (`exact_newton_d2h_closure`) and owned
/// (`exact_newton_d2h_closure_owned`) closure factories. Takes references for
/// `family`/`specs` so both ownership flavors can call it; the only difference
/// between the two factories is borrow-vs-own plumbing, which lives in the
/// wrappers, not here.
fn exact_newton_d2h_apply<F: CustomFamily + Sync>(
family: &F,
synced_states: &[ParameterBlockState],
specs: &[ParameterBlockSpec],
total: usize,
use_outer_curvature_derivatives: bool,
scale: f64,
workspace: Option<&Arc<dyn ExactNewtonJointHessianWorkspace>>,
u: &Array1<f64>,
v: &Array1<f64>,
) -> Result<Option<DriftDerivResult>, String> {
if use_outer_curvature_derivatives {
return match family.exact_newton_outer_curvature_second_directional_derivative_with_specs(
synced_states,
specs,
u,
v,
)? {
Some(m) => {
let mut sym =
symmetrized_square_matrix(m, total, "joint exact-newton d2H shape mismatch")?;
if scale != 1.0 {
sym *= scale;
}
Ok(Some(DriftDerivResult::Dense(sym)))
}
None => Ok(None),
};
}
if let Some(workspace) = workspace
&& let Some(operator) = workspace.second_directional_derivative_operator(u, v)?
{
return Ok(Some(scale_drift_deriv_result(
DriftDerivResult::Operator(operator),
scale,
)));
}
match family.exact_newton_joint_hessian_second_directional_derivative_with_specs(
synced_states,
specs,
u,
v,
)? {
Some(m) => {
let mut sym =
symmetrized_square_matrix(m, total, "joint exact-newton d2H shape mismatch")?;
if scale != 1.0 {
sym *= scale;
}
Ok(Some(DriftDerivResult::Dense(sym)))
}
None => Ok(None),
}
}
/// Borrowed-flavour factory for the d²H[u,v] closure.
///
/// The closure borrows `family` and `specs` for `'a`, owns the shared synced
/// states and optional workspace, and delegates each call to
/// `exact_newton_d2h_apply`.
fn exact_newton_d2h_closure<'a, F: CustomFamily + Sync>(
    family: &'a F,
    synced_states: Arc<Vec<ParameterBlockState>>,
    specs: &'a [ParameterBlockSpec],
    total: usize,
    use_outer_curvature_derivatives: bool,
    scale: f64,
    workspace: Option<Arc<dyn ExactNewtonJointHessianWorkspace>>,
) -> impl Fn(&Array1<f64>, &Array1<f64>) -> Result<Option<DriftDerivResult>, String> + Send + Sync + 'a
{
    move |first: &Array1<f64>, second: &Array1<f64>| {
        exact_newton_d2h_apply(
            family,
            synced_states.as_slice(),
            specs,
            total,
            use_outer_curvature_derivatives,
            scale,
            workspace.as_ref(),
            first,
            second,
        )
    }
}
/// Borrowed-flavour factory for the batched d²H closure.
///
/// Returns `None` when no workspace is available. Otherwise the closure asks
/// the workspace for one second-directional-derivative operator per `(u, v)`
/// pair and wraps each operator that is present as a scaled
/// `DriftDerivResult::Operator`; absent entries stay `None`.
fn exact_newton_d2h_many_closure<'a>(
    scale: f64,
    workspace: Option<Arc<dyn ExactNewtonJointHessianWorkspace>>,
) -> Option<Box<DriftSecondDerivManyFn<'a>>> {
    let workspace = workspace?;
    Some(Box::new(move |pairs: &[(Array1<f64>, Array1<f64>)]| {
        let operators = workspace.second_directional_derivative_operators(pairs)?;
        let mut results = Vec::new();
        for maybe_operator in operators {
            let wrapped = maybe_operator.map(|operator| {
                scale_drift_deriv_result(DriftDerivResult::Operator(operator), scale)
            });
            results.push(wrapped);
        }
        Ok(results)
    }))
}
/// Owned-flavour factory for the dH[v] closure.
///
/// The returned `Arc`ed closure owns the family, the specs, the shared synced
/// states and the optional workspace, so it carries no borrow from the
/// caller. Each call is forwarded to `exact_newton_dh_apply` with the early
/// non-finite check disabled (`check_finite = false`).
fn exact_newton_dh_closure_owned<F: CustomFamily + Clone + Send + Sync + 'static>(
    family: F,
    synced_states: Arc<Vec<ParameterBlockState>>,
    specs: Vec<ParameterBlockSpec>,
    total: usize,
    use_outer_curvature_derivatives: bool,
    scale: f64,
    workspace: Option<Arc<dyn ExactNewtonJointHessianWorkspace>>,
) -> Arc<dyn Fn(&Array1<f64>) -> Result<Option<DriftDerivResult>, String> + Send + Sync> {
    // The owned variant skips the dedicated early finiteness guard.
    const CHECK_FINITE: bool = false;
    Arc::new(move |direction: &Array1<f64>| {
        exact_newton_dh_apply(
            &family,
            synced_states.as_slice(),
            specs.as_slice(),
            total,
            use_outer_curvature_derivatives,
            scale,
            workspace.as_ref(),
            CHECK_FINITE,
            direction,
        )
    })
}
/// Owned-flavour factory for the batched dH closure.
///
/// Returns `None` when no workspace is available. The closure passes the
/// directions to the workspace unchanged and wraps every operator it gets
/// back as a scaled `DriftDerivResult::Operator`; absent entries stay `None`.
fn exact_newton_dh_many_closure_owned(
    scale: f64,
    workspace: Option<Arc<dyn ExactNewtonJointHessianWorkspace>>,
) -> Option<
    Arc<dyn Fn(&[Array1<f64>]) -> Result<Vec<Option<DriftDerivResult>>, String> + Send + Sync>,
> {
    let workspace = workspace?;
    Some(Arc::new(move |directions: &[Array1<f64>]| {
        let operators = workspace.directional_derivative_operators(directions)?;
        let mut results = Vec::new();
        for maybe_operator in operators {
            let wrapped = maybe_operator.map(|operator| {
                scale_drift_deriv_result(DriftDerivResult::Operator(operator), scale)
            });
            results.push(wrapped);
        }
        Ok(results)
    }))
}
/// Owned-flavour factory for the d²H[u,v] closure.
///
/// The returned `Arc`ed closure owns the family, the specs, the shared synced
/// states and the optional workspace, and delegates each call to
/// `exact_newton_d2h_apply`.
fn exact_newton_d2h_closure_owned<F: CustomFamily + Clone + Send + Sync + 'static>(
    family: F,
    synced_states: Arc<Vec<ParameterBlockState>>,
    specs: Vec<ParameterBlockSpec>,
    total: usize,
    use_outer_curvature_derivatives: bool,
    scale: f64,
    workspace: Option<Arc<dyn ExactNewtonJointHessianWorkspace>>,
) -> Arc<dyn Fn(&Array1<f64>, &Array1<f64>) -> Result<Option<DriftDerivResult>, String> + Send + Sync>
{
    Arc::new(move |first: &Array1<f64>, second: &Array1<f64>| {
        exact_newton_d2h_apply(
            &family,
            synced_states.as_slice(),
            specs.as_slice(),
            total,
            use_outer_curvature_derivatives,
            scale,
            workspace.as_ref(),
            first,
            second,
        )
    })
}
/// Owned-flavour factory for the batched d²H closure.
///
/// Returns `None` when no workspace is available. The closure asks the
/// workspace for one second-directional-derivative operator per `(u, v)` pair
/// and wraps every operator it gets back as a scaled
/// `DriftDerivResult::Operator`; absent entries stay `None`.
fn exact_newton_d2h_many_closure_owned(
    scale: f64,
    workspace: Option<Arc<dyn ExactNewtonJointHessianWorkspace>>,
) -> Option<
    Arc<
        dyn Fn(&[(Array1<f64>, Array1<f64>)]) -> Result<Vec<Option<DriftDerivResult>>, String>
            + Send
            + Sync,
    >,
> {
    let workspace = workspace?;
    Some(Arc::new(move |pairs: &[(Array1<f64>, Array1<f64>)]| {
        let operators = workspace.second_directional_derivative_operators(pairs)?;
        let mut results = Vec::new();
        for maybe_operator in operators {
            let wrapped = maybe_operator.map(|operator| {
                scale_drift_deriv_result(DriftDerivResult::Operator(operator), scale)
            });
            results.push(wrapped);
        }
        Ok(results)
    }))
}
/// Solves `matrix · x = rhs` by a lower Cholesky factorization of the
/// symmetrized matrix, with no ridge or other stabilization.
///
/// # Errors
/// Returns the fixed message `"strict pseudo-laplace SPD solve failed"` when
/// the Cholesky factorization fails; the underlying cause is discarded.
fn strict_solve_spd(matrix: &Array2<f64>, rhs: &Array1<f64>) -> Result<Array1<f64>, String> {
    let symmetric = {
        let mut copy = matrix.clone();
        symmetrize_dense_in_place(&mut copy);
        copy
    };
    match symmetric.cholesky(Side::Lower) {
        Ok(factor) => Ok(factor.solvevec(rhs)),
        Err(_) => Err("strict pseudo-laplace SPD solve failed".to_string()),
    }
}
/// Statistics about a Levenberg-Marquardt-style δ-ridge SPD continuation.
/// Recorded by `strict_spd_lm_engine` (and therefore by
/// `strict_solve_spd_with_lm_continuation`) and surfaced for diagnostics — a
/// recurring need for nontrivial ridges signals fragile curvature that the
/// controller may need to escalate.
///
/// The `Default` value (`delta_used = 0.0`, `escalations = 0`) is what the
/// engine reports when the bare strict path succeeds or the matrix is empty.
#[derive(Clone, Copy, Debug, Default)]
pub(crate) struct StrictSpdLmStats {
/// δ value finally used (0.0 means the bare strict solve succeeded).
/// When the engine falls through to its eigen-floor fallback this holds the
/// ridge value reached after the last growth step, which was never
/// factorized — check `escalations` to distinguish that case.
pub(crate) delta_used: f64,
/// Number of ridged Cholesky attempts made, counting the one that
/// succeeded (1 = the initial δ worked). The eigen-floor fallback reports
/// `STRICT_SPD_LM_MAX_ESCALATIONS + 1`.
pub(crate) escalations: usize,
}
/// Maximum number of ridged Cholesky attempts in the strict-mode SPD
/// Levenberg-Marquardt δ-ridge continuation implemented by
/// `strict_spd_lm_engine`.
///
/// The continuation works on `(H + δI)` with δ escalated geometrically until
/// the Cholesky succeeds. The bare `strict_solve_spd` is unchanged — callers
/// that need strict semantics keep them. Callers that want fail-soft Newton
/// on a fragile geometry (e.g. spatial-adaptive seed evaluation) use the
/// `*_with_lm_continuation` wrappers to avoid bouncing the entire seed on a
/// numerically-indefinite block.
///
/// Schedule: δ₀ = max(ε · max(mean|diag H|, 1), `CUSTOM_FAMILY_RIDGE_FLOOR`);
/// δ is multiplied by `STRICT_SPD_LM_RIDGE_GROWTH` after every failed attempt;
/// at most this many attempts are made. The cap prevents runaway curvature
/// from producing arbitrary ridges; once it is hit the engine switches to its
/// eigen-floor fallback, and an error is returned only if that
/// eigendecomposition fails as well.
///
/// Hoisted to a shared constant so a single change updates every operation
/// routed through the engine in lockstep.
const STRICT_SPD_LM_MAX_ESCALATIONS: usize = 16;
/// Geometric growth factor applied to the ridge δ after each failed Cholesky
/// attempt in `strict_spd_lm_engine`.
const STRICT_SPD_LM_RIDGE_GROWTH: f64 = 10.0;
/// Floor applied to IRLS working weights so downstream divisions cannot hit
/// exact zero. Used as the default `minweight` in `CustomFamilyOptions` and
/// mirrored in tests that override it.
///
/// Sourced from the canonical PIRLS positive-weight floor
/// ([`crate::solver::pirls::MIN_WEIGHT`] = `1e-12`) so every floored family
/// shares one definition; this alias keeps the descriptive local name at the
/// `minweight` defaults.
const CUSTOM_FAMILY_WEIGHT_FLOOR: f64 = crate::solver::pirls::MIN_WEIGHT;
/// Default initial ridge δ for the explicit-stabilization Cholesky escalation
/// schedule. Enters the quadratic term, the Laplace Hessian, and the penalty
/// log-determinant via the active `RidgePolicy`.
///
/// Also serves as the lower bound on the initial δ of the
/// `strict_spd_lm_engine` ridge schedule.
const CUSTOM_FAMILY_RIDGE_FLOOR: f64 = 1e-12;
/// Relative eigenvalue floor used wherever an eigendecomposition needs to
/// distinguish "real" curvature from noise: `eps_floor = EVAL_FLOOR · max|λ|`.
/// Applied uniformly in the strict-SPD LM eigen fallback, positive-part
/// pseudo-inverse, and penalty-direction projection.
const CUSTOM_FAMILY_EVAL_FLOOR: f64 = 1e-12;
/// Absolute lower bound that keeps the eigen / spectral floors from
/// collapsing to zero when `max|λ|` is itself tiny. Combined with
/// `CUSTOM_FAMILY_EVAL_FLOOR · max|λ|` via `.max(...)`.
const CUSTOM_FAMILY_CONDITION_RELATIVE_FLOOR: f64 = 1e-14;
/// Three-stage SPD engine shared by the `strict_*_with_lm_continuation`
/// helpers. Each caller supplies the operation-specific pieces as closures;
/// the staging, ridge schedule and eigen-floor logic live only here so the
/// sibling helpers cannot drift apart.
///
/// Stages, in order:
/// 1. `bare_path(matrix)` — if it succeeds its value is returned with default
///    (all-zero) statistics.
/// 2. δ-ridge continuation — the symmetrized matrix plus `δ·I` is Cholesky
///    factorized for up to `STRICT_SPD_LM_MAX_ESCALATIONS` values of δ,
///    starting at `max(ε · max(mean|diag|, 1), CUSTOM_FAMILY_RIDGE_FLOOR)` and
///    growing by `STRICT_SPD_LM_RIDGE_GROWTH` per failure. The first factor
///    that succeeds is handed to `process_chol`.
/// 3. Eigen-floor fallback — the symmetrized matrix is eigendecomposed and
///    `process_eigen` receives the eigenvalues, eigenvectors and
///    `eps_floor = max(CUSTOM_FAMILY_EVAL_FLOOR · max|λ|, 1e-300)`.
///
/// `empty` is the value returned for a zero-row matrix whose bare path
/// failed. `op_label` only appears in the error message.
///
/// # Errors
/// Fails only when the eigendecomposition of stage 3 fails.
fn strict_spd_lm_engine<R>(
    matrix: &Array2<f64>,
    op_label: &'static str,
    empty: R,
    bare_path: impl FnOnce(&Array2<f64>) -> Result<R, String>,
    process_chol: impl FnOnce(&crate::faer_ndarray::FaerCholeskyFactor) -> R,
    process_eigen: impl FnOnce(&Array1<f64>, &Array2<f64>, f64) -> R,
) -> Result<(R, StrictSpdLmStats), String> {
    // Stage 1: the unmodified strict operation.
    if let Ok(value) = bare_path(matrix) {
        return Ok((value, StrictSpdLmStats::default()));
    }

    let dim = matrix.nrows();
    if dim == 0 {
        return Ok((empty, StrictSpdLmStats::default()));
    }

    let mut symmetric = matrix.clone();
    symmetrize_dense_in_place(&mut symmetric);

    // Stage 2: geometric ridge schedule seeded from the mean absolute diagonal.
    let trace_scale = (0..dim).map(|i| symmetric[[i, i]].abs()).sum::<f64>() / (dim as f64);
    let mut delta = (f64::EPSILON * trace_scale.max(1.0)).max(CUSTOM_FAMILY_RIDGE_FLOOR);
    for attempt in 0..STRICT_SPD_LM_MAX_ESCALATIONS {
        let mut ridged = symmetric.clone();
        for i in 0..dim {
            ridged[[i, i]] += delta;
        }
        if let Ok(factor) = ridged.cholesky(Side::Lower) {
            let stats = StrictSpdLmStats {
                delta_used: delta,
                escalations: attempt + 1,
            };
            return Ok((process_chol(&factor), stats));
        }
        delta *= STRICT_SPD_LM_RIDGE_GROWTH;
    }

    // Stage 3: δ-ridge schedule exhausted; fall back to rank-aware eigen-floor
    // handling. Every eigenvalue is floored at `eps_floor = 1e-12 · max|λ|` so
    // well-conditioned modes are resolved exactly and rank-deficient
    // directions get controlled curvature, preventing the spatial-adaptive
    // pilot from collapsing to a cold full-data run.
    let max_esc = STRICT_SPD_LM_MAX_ESCALATIONS;
    let decomposition = FaerEigh::eigh(&symmetric, Side::Lower).map_err(|e| {
        format!(
            "{op_label} failed even with LM δ-ridge continuation \
             (escalated {max_esc} times to δ={delta:.3e}, trace_scale={trace_scale:.3e}); \
             eigen-floor fallback also failed: {e}"
        )
    });
    let (evals, evecs) = decomposition?;
    let max_abs_eval = evals.iter().fold(0.0_f64, |acc, &ev| acc.max(ev.abs()));
    let eps_floor = (CUSTOM_FAMILY_EVAL_FLOOR * max_abs_eval).max(1e-300);
    let stats = StrictSpdLmStats {
        delta_used: delta,
        escalations: STRICT_SPD_LM_MAX_ESCALATIONS + 1,
    };
    Ok((process_eigen(&evals, &evecs, eps_floor), stats))
}
/// Fail-soft SPD solve of `matrix · x = rhs` routed through
/// `strict_spd_lm_engine`.
///
/// The three stages are: the bare `strict_solve_spd`, a solve against the
/// first ridged Cholesky factor that succeeds, and finally a spectral solve
/// `x = Q · diag(1 / max(λ, eps_floor)) · Qᵀ · rhs` on the eigen-floor
/// fallback. A zero-row matrix whose bare solve failed yields an empty vector.
///
/// Returns the solution together with the continuation statistics.
pub(crate) fn strict_solve_spd_with_lm_continuation(
    matrix: &Array2<f64>,
    rhs: &Array1<f64>,
) -> Result<(Array1<f64>, StrictSpdLmStats), String> {
    let dim = matrix.nrows();
    strict_spd_lm_engine(
        matrix,
        "strict pseudo-laplace SPD solve",
        Array1::<f64>::zeros(0),
        |m| strict_solve_spd(m, rhs),
        |chol| chol.solvevec(rhs),
        |evals, evecs, eps_floor| {
            // Spectral coefficients: (Qᵀ rhs)_k divided by the floored eigenvalue.
            // Plain left-to-right accumulation keeps the summation order fixed.
            let mut coeffs = Array1::<f64>::zeros(dim);
            for (k, coeff) in coeffs.iter_mut().enumerate() {
                let projection =
                    (0..dim).fold(0.0_f64, |acc, i| acc + evecs[[i, k]] * rhs[i]);
                *coeff = projection / evals[k].max(eps_floor);
            }
            // Map the coefficients back: x = Q · coeffs.
            let mut solution = Array1::<f64>::zeros(dim);
            for (i, entry) in solution.iter_mut().enumerate() {
                *entry = (0..dim).fold(0.0_f64, |acc, k| acc + evecs[[i, k]] * coeffs[k]);
            }
            solution
        },
    )
}
/// Exact pseudo-Laplace log-determinant `log|H + S_λ|` of the REML/LAML
/// objective, computed from the eigenspectrum with **no δ-ridge** so the value
/// stays on the same objective as the analytic gradient `tr((H+S_λ)⁻¹ ·)`
/// (gam#748).
///
/// The earlier strict path returned `log|H + S_λ + δI|` with `δ = δ(ρ)`
/// escalated geometrically until factorization succeeded. That makes `V(ρ)`
/// carry a ρ-dependent, discontinuous `δ(ρ)` the analytic derivatives ignore —
/// exactly the objective/derivative mismatch the
/// operator-dense path's own comment forbids ("mixing an approximate
/// determinant with exact traces gives ARC a Hessian for a different
/// objective"). The strict path now computes one honest quantity:
///
/// - eigendecompose the symmetrised `H + S_λ`;
/// - **reject** (return `Err`) when any eigenvalue is genuinely negative
///   (`λ < −neg_tol`). An indefinite joint coefficient Hessian is a real defect
///   (a non-stationary inner β or a mis-signed curvature block); rejecting it
///   tells the outer optimizer to step back, instead of masking it with a
///   biased finite number;
/// - sum `Σ_{λ > pos_tol} log λ` — the exact pseudo-logdet on the positive
///   eigenspace, which is `C∞` in ρ because the positive eigenspace of a PSD
///   `S(ρ)=Σ e^{ρ_k} S_k` is structurally fixed. A near-zero band
///   `[−neg_tol, pos_tol]` (a structural null space) is simply not in `range`
///   and contributes no term, matching the projected `tr` derivative; a
///   near-singular-but-positive curvature is accepted exactly as the
///   historical Cholesky strict path did.
///
/// `accumulation_depth` is the number of accumulated terms assumed in the
/// round-off bound that sizes `neg_tol`.
///
/// # Errors
/// Fails when the eigendecomposition fails or when any eigenvalue lies below
/// `−neg_tol`.
fn strict_exact_pseudo_logdet(
    matrix: &Array2<f64>,
    accumulation_depth: usize,
) -> Result<f64, String> {
    let mut sym = matrix.clone();
    symmetrize_dense_in_place(&mut sym);
    let (evals, _) = FaerEigh::eigh(&sym, Side::Lower)
        .map_err(|e| format!("strict pseudo-laplace eigendecomposition failed: {e}"))?;
    let p = sym.nrows();
    let max_abs_eval = evals.iter().fold(0.0_f64, |acc, &ev| acc.max(ev.abs()));
    // Bauer-Fike: |δσ| ≤ p·‖δH‖_∞; n-term fma roundoff gives ‖δH‖_∞ ≤ ε·n·‖H‖,
    // so σ_noise ≤ ε·n·p·‖H‖₂. Tenfold slack absorbs sign cancellations,
    // and a 100·ε floor handles the ‖H‖→0 limit. This `neg_tol` is the
    // INDEFINITENESS-rejection band only: an eigenvalue below `−neg_tol` is a
    // genuine negative curvature (non-stationary β / mis-signed block) and is
    // rejected, not masked (gam#748).
    let eps = f64::EPSILON;
    let eps_np = eps * (accumulation_depth as f64) * (p as f64);
    let neg_tol = (10.0 * eps_np * max_abs_eval).max(100.0 * eps);
    // POSITIVE-eigenspace inclusion cutoff for the pseudo-logdet sum. This MUST
    // be byte-identical to the cutoff the analytic REML gradient's trace kernel
    // uses (`positive_eigenvalue_threshold`, the `range(H+Sλ)` Moore–Penrose
    // pinv drop in `joint_penalty_subspace_trace_parts`), or the LAML VALUE
    // `½ log|H+Sλ|₊` and its analytic GRADIENT `½ tr((H+Sλ)⁺ ∂Sλ)` are evaluated
    // over DIFFERENT subspaces and describe DIFFERENT objectives — the "mixing
    // an approximate determinant with exact traces gives ARC a Hessian for a
    // different objective" trap (gam#748).
    //
    // Historically this sum used the Bauer–Fike `neg_tol = 10·ε·n·p·‖H‖`, a
    // factor of ~n/10 LARGER than the kernel's `100·ε·p·‖H‖`. At an oversmoothed
    // marginal-slope ρ probe a penalty-null trend eigenvalue lands in the band
    // `(100·ε·p·‖H‖, 10·ε·n·p·‖H‖)`: DROPPED from the value logdet but KEPT in
    // the gradient kernel, so the analytic outer gradient is the derivative of a
    // different objective than the value. ARC's predicted descent then never
    // matches the actual objective change and the outer optimizer freezes
    // (constant ‖g‖, stuck cost — gam#808). Sharing the kernel's threshold here
    // removes the desync at the source; both are `C∞` in ρ (the positive
    // eigenspace of a PSD-shifted Hessian is structurally fixed).
    //
    // The threshold helper wants a slice. An owned 1-D array is expected to
    // expose one directly; if it ever does not, fall back to a contiguous copy
    // instead of panicking inside a `Result`-returning function.
    let contiguous_copy;
    let eval_slice: &[f64] = match evals.as_slice() {
        Some(slice) => slice,
        None => {
            contiguous_copy = evals.to_vec();
            &contiguous_copy
        }
    };
    let pos_tol = positive_eigenvalue_threshold(eval_slice);
    if evals.iter().any(|&ev| ev < -neg_tol) {
        let min_eval = evals.iter().copied().fold(f64::INFINITY, f64::min);
        let below = evals.iter().filter(|&&ev| ev < -neg_tol).count();
        return Err(CustomFamilyError::NumericalFailure {
            reason: format!(
                "strict pseudo-laplace logdet: {below} eigenvalue(s) below -neg_tol \
                 (min(λ)={min_eval:.6e}, max|λ|={max_abs_eval:.6e}, neg_tol={neg_tol:.6e}, εnp={eps_np:.6e}); \
                 indefinite joint coefficient Hessian rejected (no δ-ridge masking, gam#748)"
            ),
        }
        .into());
    }
    Ok(evals
        .iter()
        .copied()
        .filter(|&ev| ev > pos_tol)
        .map(f64::ln)
        .sum())
}
/// Pseudo-inverse of the symmetrized `matrix`, restricted to its strictly
/// positive eigenspace.
///
/// The input is copied and symmetrized, eigendecomposed, and every eigenpair
/// whose eigenvalue is at or below the cutoff
/// `max(max|λ| · CUSTOM_FAMILY_EVAL_FLOOR, ridge_floor, CUSTOM_FAMILY_CONDITION_RELATIVE_FLOOR)`
/// is dropped. The surviving pairs contribute `vₖ vₖᵀ / λₖ`. The result is
/// symmetrized once more before it is returned.
///
/// # Errors
/// Returns the formatted eigendecomposition failure as a `String`.
fn pinv_positive_part(matrix: &Array2<f64>, ridge_floor: f64) -> Result<Array2<f64>, String> {
    let mut symmetric = matrix.clone();
    symmetrize_dense_in_place(&mut symmetric);
    let (spectrum, basis) = symmetric
        .eigh(Side::Lower)
        .map_err(|e| format!("positive-part covariance eigendecomposition failed: {e}"))?;

    // Cutoff: relative to the spectral radius, but never below the absolute floors.
    let spectral_radius = spectrum.iter().fold(0.0_f64, |acc, &ev| acc.max(ev.abs()));
    let absolute_floor = ridge_floor.max(CUSTOM_FAMILY_CONDITION_RELATIVE_FLOOR);
    let cutoff = (spectral_radius * CUSTOM_FAMILY_EVAL_FLOOR).max(absolute_floor);

    let dim = matrix.nrows();
    let mut pinv = Array2::<f64>::zeros((dim, dim));
    for (mode, &eigenvalue) in spectrum.iter().enumerate() {
        if eigenvalue <= cutoff {
            continue;
        }
        let reciprocal = 1.0 / eigenvalue;
        // Rank-one accumulation of (1/λ)·v·vᵀ for this retained mode.
        for row in 0..dim {
            let row_factor = reciprocal * basis[[row, mode]];
            for col in 0..dim {
                pinv[[row, col]] += row_factor * basis[[col, mode]];
            }
        }
    }
    symmetrize_dense_in_place(&mut pinv);
    Ok(pinv)
}
/// Whether the outer objective carries a `log|H|` term: the REML objective
/// must be enabled and the family's outer objective must be either
/// `RidgedQuadraticReml` or `StrictPseudoLaplace`.
fn include_exact_newton_logdet_h<F: CustomFamily + ?Sized>(
    family: &F,
    options: &BlockwiseFitOptions,
) -> bool {
    if !options.use_remlobjective {
        return false;
    }
    let objective = family.exact_newton_outerobjective();
    matches!(
        objective,
        ExactNewtonOuterObjective::RidgedQuadraticReml
            | ExactNewtonOuterObjective::StrictPseudoLaplace
    )
}
/// Declares to the outer planner which analytic derivatives of the outer
/// objective this family can supply, as `(gradient, hessian)`.
///
/// Capability says what the family *can* compute; policy says what should be
/// requested at a given size. Only `specs` and `options` are available here
/// (no resolved psi dimension), so the policy is queried with `psi_dim = 0`:
/// these are the pre-psi declarations. The per-iteration clamp in
/// `optimize_spatial_length_scale_exact_joint` re-queries
/// `outer_derivative_policy` with the realized psi dimension.
pub(crate) fn custom_family_outer_derivatives<F: CustomFamily + ?Sized>(
    family: &F,
    specs: &[ParameterBlockSpec],
    options: &BlockwiseFitOptions,
) -> (
    crate::solver::outer_strategy::Derivative,
    crate::solver::outer_strategy::DeclaredHessianForm,
) {
    use crate::solver::outer_strategy::{DeclaredHessianForm, Derivative};

    let policy = family.outer_derivative_policy(specs, 0, options);

    let gradient = match policy.capability.has_gradient() {
        true => Derivative::Analytic,
        false => Derivative::Unavailable,
    };

    // The analytic outer Hessian is offered whenever the caller enabled it, the
    // objective includes a log|H| term, and the family exposes second-order
    // calculus. Matrix-free Hessian support is a representation detail of the
    // evaluator and must not be hidden behind a cost-based first-order policy.
    let hessian_declared = options.use_outer_hessian
        && include_exact_newton_logdet_h(family, options)
        && policy.capability.has_hessian();
    let hessian = if hessian_declared {
        DeclaredHessianForm::Either
    } else {
        DeclaredHessianForm::Unavailable
    };

    (gradient, hessian)
}
/// Whether the outer objective carries a `log|S_λ|₊` term: only for the
/// `RidgedQuadraticReml` outer objective, and only with the REML objective
/// enabled.
fn include_exact_newton_logdet_s<F: CustomFamily + ?Sized>(
    family: &F,
    options: &BlockwiseFitOptions,
) -> bool {
    let is_ridged_quadratic = matches!(
        family.exact_newton_outerobjective(),
        ExactNewtonOuterObjective::RidgedQuadraticReml
    );
    is_ridged_quadratic && options.use_remlobjective
}
/// True when the family's outer objective is `StrictPseudoLaplace`.
fn use_exact_newton_strict_spd<F: CustomFamily + ?Sized>(family: &F) -> bool {
    matches!(
        family.exact_newton_outerobjective(),
        ExactNewtonOuterObjective::StrictPseudoLaplace
    )
}
/// Convenience entry point for `blockwise_logdet_terms_with_workspace` when the
/// caller has no preferred joint-Hessian workspace to offer.
///
/// Returns the same `(logdet_h, logdet_s)` pair as the workspace-aware variant.
fn blockwise_logdet_terms<F: CustomFamily + Clone + Send + Sync + 'static>(
    family: &F,
    specs: &[ParameterBlockSpec],
    states: &mut [ParameterBlockState],
    block_log_lambdas: &[Array1<f64>],
    options: &BlockwiseFitOptions,
) -> Result<(f64, f64), String> {
    let no_preferred_workspace = None;
    blockwise_logdet_terms_with_workspace(
        family,
        specs,
        states,
        block_log_lambdas,
        options,
        no_preferred_workspace,
    )
}
/// Computes the two log-determinant terms of the outer REML/LAML objective,
/// returned as `(logdet_h_total, penalty_logdet_s_total)`.
///
/// Whether each term is included is decided by `include_exact_newton_logdet_h`
/// and `include_exact_newton_logdet_s`; when neither applies the function
/// returns `(0.0, 0.0)` before `states` is touched. Otherwise the block etas
/// are refreshed, each block's `S_λ = Σ_k exp(ρ_k)·S_k` is assembled (and its
/// pseudo-logdet summed when logdet-S is included), and the Hessian logdet is
/// taken from the first source that is available, in this order:
///   1. `family.exact_newton_outer_curvature` (rescaled joint Hessian, plus
///      its `hessian_logdet_correction`),
///   2. an operator-backed joint Hessian source — from `preferred_workspace`,
///      or from a freshly requested workspace on the non-strict matrix-free
///      path — materialized densely,
///   3. `exact_newton_joint_hessian_symmetrized`,
///   4. a per-block sum built from the working sets of `family.evaluate`.
/// On paths 1–3 the optional Jeffreys curvature `H_Φ` is added before the
/// determinant is taken. With `strict_spd` the determinant comes from
/// `strict_exact_pseudo_logdet`, otherwise from
/// `stable_logdet_with_ridge_policy`.
///
/// # Errors
/// Propagates failures from the family callbacks, the penalty logdet
/// fallback, the Hessian materialization and the determinant routines, and
/// reports working-set count / Hessian shape mismatches.
fn blockwise_logdet_terms_with_workspace<F: CustomFamily + Clone + Send + Sync + 'static>(
family: &F,
specs: &[ParameterBlockSpec],
states: &mut [ParameterBlockState],
block_log_lambdas: &[Array1<f64>],
options: &BlockwiseFitOptions,
preferred_workspace: Option<Arc<dyn ExactNewtonJointHessianWorkspace>>,
) -> Result<(f64, f64), String> {
let include_logdet_h = include_exact_newton_logdet_h(family, options);
let include_logdet_s = include_exact_newton_logdet_s(family, options);
if !include_logdet_h && !include_logdet_s {
return Ok((0.0, 0.0));
}
let strict_spd = use_exact_newton_strict_spd(family);
refresh_all_block_etas(family, specs, states)?;
let ranges = block_param_ranges(specs);
// Joint coefficient dimension: end offset of the last block (0 when there are no blocks).
let total = ranges.last().map(|(_, e)| *e).unwrap_or(0);
// Universal full-span robustness: the outer REML logdet of the
// penalized Hessian must use the SAME Jeffreys-augmented Hessian
// `H + S_λ + H_Φ` the inner Newton converged on, or the LAML score and its
// analytic derivatives describe a different objective. Compute `H_Φ` once
// over the full-span basis `Z_J` and add it into whichever
// logdet path runs below. `None` ⇒ no logdet-H contribution (logdet-S only).
// Cheap matrix-free conditioning pre-check for the OUTER logdet H_Φ. When a
// matrix-free workspace exposes the Hessian-vector product, bound the joint
// information's spectrum from a few matvecs (no dense H, no O(p³) eigh): if it
// certifies well-conditioned the exact gate is certain to return H_Φ = 0, so
// we skip the whole dense formation and use `None` (no logdet-H Jeffreys
// contribution), byte-identical to the gated-off path. This keeps the outer
// LAML logdet consistent with the inner solve (which also gated the term off
// on the same well-conditioned geometry) while preserving the matrix-free path
// at outer-eval scale. Returns `false`/unsure ⇒ exact formation below.
let outer_precheck_eligible = include_logdet_h
&& total >= crate::estimate::reml::jeffreys_subspace::CHEAP_CONDITIONING_PRECHECK_MIN_DIM;
let outer_jeffreys_precheck_skips = match preferred_workspace.as_ref() {
Some(ws) if outer_precheck_eligible && ws.hessian_matvec_available() => {
let hv = |v: &Array1<f64>| -> Result<Array1<f64>, String> {
match ws.hessian_matvec(v)? {
Some(out) if out.len() == total => Ok(out),
// Workspace declined this matvec ⇒ cannot certify ⇒ do not skip.
// Return a non-finite sentinel so the cheap estimator bails to
// the conservative `false` (never skip on an unresolved apply).
_ => Ok(Array1::from_elem(total, f64::NAN)),
}
};
// An error from the cheap estimator is treated as "cannot certify" (do not skip).
crate::estimate::reml::jeffreys_subspace::jeffreys_term_skippable_via_matvec(hv, total)
.unwrap_or(false)
}
_ => false,
};
let logdet_jeffreys_hphi: Option<Array2<f64>> = if include_logdet_h
&& !outer_jeffreys_precheck_skips
&& !options.seed_screening
&& family.joint_jeffreys_term_required()
{
// Skipped during seed screening: this per-axis Jeffreys curvature
// (O(p · per-axis-Hdot)) augments the outer LAML logdet `½ log|H+Sλ+H_Φ|`,
// a refinement the screening SCORE does not need. Screening ranks seeds by
// the un-augmented `½ log|H+Sλ|` plus the value-only Firth penalty already
// in `penalty_value`; the load-bearing H_Φ is restored for the real fit
// (gam#729/#808).
match build_joint_jeffreys_subspace(specs, &ranges)? {
Some(z_joint) => {
// Only the curvature component of the returned triple is used here.
custom_family_joint_jeffreys_term(family, states, specs, &ranges, &z_joint)?
.map(|(_phi, _grad, hphi)| hphi)
}
None => None,
}
} else {
None
};
// Per-block worker: returns the dense `S_λ` for block `b` together with its
// penalty pseudo-logdet (0.0 when logdet-S is not part of the objective).
let compute_block_logdet_term = |b: usize| -> Result<(Array2<f64>, f64), String> {
let spec = &specs[b];
let (start, end) = ranges[b];
let p = end - start;
let lambdas = block_log_lambdas[b].mapv(f64::exp);
let mut s_lambda = Array2::<f64>::zeros((p, p));
for (k, s) in spec.penalties.iter().enumerate() {
s.add_scaled_to(lambdas[k], &mut s_lambda);
}
let block_logdet = if include_logdet_s {
// Pseudo-logdet of S_λ on the positive eigenspace.
//
// CONSISTENCY REQUIREMENT (gam#752/#748/#808 class): this VALUE is
// the `log|S_λ|₊` term of the outer REML/LAML objective, and its
// ρ-gradient is supplied separately by
// `compute_block_penalty_logdet_derivs`, which differentiates the
// canonical `PenaltyPseudologdet`. If the value used a *different*
// positive/null eigenspace split (e.g. structural-count `skip(m0)`
// by COUNT, or a ridge-blind `positive_eigenvalue_threshold`) than
// the gradient's by-magnitude `> ridge + noise_band` rule, the
// outer optimizer would see an objective and a gradient that
// describe different functions near the ridge boundary (a barely-
// active mode `λ_k σ_k → 0` whose ridged eigenvalue dips below
// `ridge + noise_band` is kept by the count rule but dropped by the
// magnitude rule). To guarantee value↔gradient agree by
// construction, compute the value from the SAME canonical
// `PenaltyPseudologdet` the gradient differentiates, with the same
// dense penalty components, the same λ, and the same ridge.
let ridge = if options.ridge_policy.include_penalty_logdet {
effective_solverridge(options.ridge_floor)
} else {
0.0
};
let penalties_dense: Vec<Array2<f64>> =
spec.penalties.iter().map(|pen| pen.to_dense()).collect();
let lambdas_vec: Vec<f64> = lambdas.to_vec();
match crate::estimate::reml::penalty_logdet::PenaltyPseudologdet::from_components(
&penalties_dense,
&lambdas_vec,
ridge,
) {
Ok(pld) => pld.value(),
Err(eigh_err_msg) => {
// `from_components` only fails when the single internal
// eigendecomposition fails, which for PSD penalties is
// purely numerical. Fall back to Cholesky on the ridged
// matrix (which should be SPD). The Cholesky logdet
// includes null-space contributions (~m₀ × ln(ridge)),
// a smooth bias that does not corrupt the REML gradient.
let mut s_for_logdet = s_lambda.clone();
if ridge > 0.0 {
for i in 0..p {
s_for_logdet[[i, i]] += ridge;
}
}
penalty_logdet_cholesky_fallback(&s_for_logdet, ridge, b, p, &eigh_err_msg)?
}
}
} else {
0.0
};
Ok((s_lambda, block_logdet))
};
// Per-block penalty assembly and eigendecomposition are independent.
// Use rayon only from non-rayon callers so inner operator/eigendecomp work
// does not nest under an existing worker. Collecting an indexed range into
// a Vec preserves block order; totals are accumulated sequentially below
// to keep floating-point summation deterministic.
let block_terms: Vec<Result<(Array2<f64>, f64), String>> =
if specs.len() > 1 && rayon::current_thread_index().is_none() {
use rayon::iter::{IntoParallelIterator, ParallelIterator};
(0..specs.len())
.into_par_iter()
.map(compute_block_logdet_term)
.collect()
} else {
(0..specs.len()).map(compute_block_logdet_term).collect()
};
let mut s_lambdas = Vec::with_capacity(block_terms.len());
let mut penalty_logdet_s_total = 0.0;
for block_term in block_terms {
let (s_lambda, block_logdet) = block_term?;
s_lambdas.push(s_lambda);
penalty_logdet_s_total += block_logdet;
}
if !include_logdet_h {
return Ok((0.0, penalty_logdet_s_total));
}
// Try the shared scale-aware exact curvature path first.
if let Some(curvature) = family.exact_newton_outer_curvature(states)? {
let mut h_joint = symmetrized_square_matrix(
curvature.hessian,
total,
"joint exact-newton Hessian validation in logdet terms (rescaled)",
)?;
// Penalty and Jeffreys terms are scaled by `rho_curvature_scale` so they
// are added in the same units as the rescaled Hessian.
for (b, s_lambda) in s_lambdas.iter().enumerate() {
let (start, end) = ranges[b];
h_joint
.slice_mut(ndarray::s![start..end, start..end])
.scaled_add(curvature.rho_curvature_scale, s_lambda);
}
if let Some(hphi) = logdet_jeffreys_hphi.as_ref() {
h_joint.scaled_add(curvature.rho_curvature_scale, hphi);
}
let logdet_h_scaled = if strict_spd {
strict_exact_pseudo_logdet(&h_joint, joint_observation_count(states))?
} else {
stable_logdet_with_ridge_policy(
&h_joint,
options.ridge_floor * curvature.rho_curvature_scale,
options.ridge_policy,
)?
};
let logdet_h_total = logdet_h_scaled + curvature.hessian_logdet_correction;
return Ok((logdet_h_total, penalty_logdet_s_total));
}
// Second choice: an operator-backed joint Hessian. A caller-supplied
// workspace wins; otherwise one is requested from the family only on the
// non-strict matrix-free path.
let exact_joint_source = if let Some(workspace) = preferred_workspace.as_ref() {
exact_newton_joint_hessian_source_from_workspace(
workspace,
total,
MaterializationIntent::LogdetFactorization,
"joint exact-newton operator mismatch in logdet terms",
)?
} else if !strict_spd && use_joint_matrix_free_path(total, joint_observation_count(states)) {
family
.exact_newton_joint_hessian_workspace_with_options(states, specs, options)?
.as_ref()
.map(|workspace| {
exact_newton_joint_hessian_source_from_workspace(
workspace,
total,
MaterializationIntent::LogdetFactorization,
"joint exact-newton operator mismatch in logdet terms",
)
})
.transpose()?
.flatten()
} else {
None
};
if let Some(source) = exact_joint_source {
// Exact determinant of H + S_λ for operator-backed coefficient Hessians.
//
// The REML gradient and Hessian use analytic trace identities such as
// ∂ log|A(θ)| = tr(A⁻¹ A_θ). Mixing an approximate determinant with
// exact traces violates that identity and gives ARC a Hessian for a
// different objective. Materializing the coefficient Hessian by
// canonical-basis HVPs keeps the objective/derivative pair exact. At
// large-scale CTN scale `total` is a few hundred, so this is sub-MiB; the
// materializer below refuses oversized systems before allocation.
let mut h_joint = materialize_joint_hessian_source(
&source,
total,
"joint exact-newton operator dense logdet materialization",
)?;
for (b, s_lambda) in s_lambdas.iter().enumerate() {
let (start, end) = ranges[b];
h_joint
.slice_mut(ndarray::s![start..end, start..end])
.scaled_add(1.0, s_lambda);
}
if let Some(hphi) = logdet_jeffreys_hphi.as_ref() {
h_joint.scaled_add(1.0, hphi);
}
let logdet_h_total = if strict_spd {
strict_exact_pseudo_logdet(&h_joint, joint_observation_count(states))?
} else {
stable_logdet_with_ridge_policy(&h_joint, options.ridge_floor, options.ridge_policy)?
};
return Ok((logdet_h_total, penalty_logdet_s_total));
}
// Fallback: try the non-rescaled symmetrized path (for families that
// don't implement exact_newton_outer_curvature but do provide
// a plain joint Hessian).
if let Some(mut h_joint) = exact_newton_joint_hessian_symmetrized(
family,
states,
specs,
total,
"joint exact-newton Hessian validation in logdet terms",
)? {
for (b, s_lambda) in s_lambdas.iter().enumerate() {
let (start, end) = ranges[b];
h_joint
.slice_mut(ndarray::s![start..end, start..end])
.scaled_add(1.0, s_lambda);
}
if let Some(hphi) = logdet_jeffreys_hphi.as_ref() {
h_joint.scaled_add(1.0, hphi);
}
let logdet_h_total = if strict_spd {
strict_exact_pseudo_logdet(&h_joint, joint_observation_count(states))?
} else {
stable_logdet_with_ridge_policy(&h_joint, options.ridge_floor, options.ridge_policy)?
};
return Ok((logdet_h_total, penalty_logdet_s_total));
}
// Last resort: no joint Hessian is available, so sum per-block logdets
// built from the family's block working sets.
// NOTE(review): `logdet_jeffreys_hphi` is not added on this per-block
// path, unlike the three joint paths above — confirm that is intended.
let eval = family.evaluate(states)?;
if eval.blockworking_sets.len() != specs.len() {
return Err(format!(
"family returned {} block working sets, expected {}",
eval.blockworking_sets.len(),
specs.len()
));
}
let mut logdet_h_total = 0.0;
let logdet_s_total = penalty_logdet_s_total;
for b in 0..specs.len() {
let spec = &specs[b];
let work = &eval.blockworking_sets[b];
let p = spec.design.ncols();
// Block curvature: XᵀWX for diagonal working sets (weights floored at
// `options.minweight`), or the supplied Hessian for exact-Newton sets.
let xtwx = match work {
BlockWorkingSet::Diagonal {
working_response: _,
working_weights,
} => with_block_geometry(family, states, spec, b, |x_dyn, _| {
let w = floor_positiveworking_weights(working_weights, options.minweight);
let (xtwx, _) = weighted_normal_equations(x_dyn, &w, None)?;
Ok(xtwx)
})?,
BlockWorkingSet::ExactNewton {
gradient: _,
hessian,
} => {
if hessian.nrows() != p || hessian.ncols() != p {
return Err(CustomFamilyError::DimensionMismatch { reason: format!(
"block {b} exact-newton Hessian shape mismatch: got {}x{}, expected {}x{}",
hessian.nrows(),
hessian.ncols(),
p,
p
) }.into());
}
hessian.to_dense()
}
};
let s_lambda = &s_lambdas[b];
let mut h = xtwx;
h += s_lambda;
logdet_h_total += if strict_spd {
strict_exact_pseudo_logdet(&h, joint_observation_count(states))?
} else {
stable_logdet_with_ridge_policy(&h, options.ridge_floor, options.ridge_policy)?
};
}
Ok((logdet_h_total, logdet_s_total))
}
/// Snapshot of a single block's eta for line-search rollback.
///
/// Created from a specific block's state; can only restore to or update
/// that same block. There is no shared buffer across blocks, so
/// cross-block length confusion is structurally impossible.
struct BlockEtaCheckpoint {
/// Owned copy of the block's `eta` taken at capture time.
saved: Array1<f64>,
}
impl BlockEtaCheckpoint {
    /// Snapshot `state.eta` into a freshly allocated checkpoint.
    fn capture(state: &ParameterBlockState) -> Self {
        let saved = state.eta.clone();
        Self { saved }
    }

    /// Snapshot `state.eta`, recycling `buf` as the backing storage when its
    /// length matches.
    ///
    /// On a match the data is copied into `buf` (O(n)) and the buffer is moved
    /// out (O(1)), leaving `buf` as a default array. On a length mismatch `buf`
    /// is left untouched and a fresh clone is taken instead.
    fn capture_reuse(state: &ParameterBlockState, buf: &mut Array1<f64>) -> Self {
        if buf.len() != state.eta.len() {
            return Self::capture(state);
        }
        buf.assign(&state.eta);
        let saved = std::mem::take(buf);
        Self { saved }
    }

    /// Consume the checkpoint and hand back its storage for recycling.
    fn into_buffer(self) -> Array1<f64> {
        let Self { saved } = self;
        saved
    }

    /// Roll back: overwrite `state.eta` with the saved values.
    fn restore_eta(&self, state: &mut ParameterBlockState) {
        state.eta.assign(&self.saved);
    }

    /// Set `state.eta = saved + alpha * direction` in place, without
    /// allocating.
    fn restore_eta_with_step(
        &self,
        state: &mut ParameterBlockState,
        alpha: f64,
        direction: &Array1<f64>,
    ) {
        // Reset to the snapshot first, then add the scaled step on top of it.
        self.restore_eta(state);
        state.eta.scaled_add(alpha, direction);
    }
}
/// Classification of which branch of the trust-region radius policy
/// fired on a single update — surfaced in cycle logs so it is possible
/// to tell at a glance whether the inner solver is being throttled by
/// the TR (e.g., `RejectFloor`/`ShrinkOnRejection`) or, conversely,
/// whether the step is sitting well inside the region (`HoldInside`)
/// so the slow convergence is NOT a TR-policy issue.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum JointTrustRegionDecision {
/// `rho > 0.75` AND `step_norm >= 0.99 * old_radius` — model is good
/// AND the step is at the TR boundary, so doubling reflects a real
/// constraint that was just lifted.
GrowAtBoundary,
/// `rho > 0.75` but the step is well inside the region; radius held
/// because no evidence the TR was constraining the step. When the
/// inner is converging linearly and this branch fires every cycle,
/// the TR is NOT the bottleneck — Newton itself is finding short
/// steps for a reason unrelated to the trust radius.
HoldInside,
/// `0.25 <= rho <= 0.75` (moderate model fidelity) — radius held.
HoldModerate,
/// `rho < 0.25` but step accepted (positive descent above noise).
/// Radius shrunk to a quarter to be more conservative next cycle.
ShrinkOnMarginalAccept,
/// Step rejected — radius shrunk to a quarter and additionally capped
/// at half the proposed step norm so a re-proposal is constrained
/// inside the rejected region.
ShrinkOnRejection,
/// A shrinking update (`ShrinkOnRejection` or `ShrinkOnMarginalAccept`)
/// left the clamped radius at the absolute floor, so the shrink label
/// is promoted to this stronger one. Persistent `RejectFloor` is the
/// unambiguous signal of a degenerate ρ region.
RejectFloor,
}
impl JointTrustRegionDecision {
fn label(&self) -> &'static str {
match self {
Self::GrowAtBoundary => "grow_at_boundary",
Self::HoldInside => "hold_inside",
Self::HoldModerate => "hold_moderate",
Self::ShrinkOnMarginalAccept => "shrink_marginal_accept",
Self::ShrinkOnRejection => "shrink_reject",
Self::RejectFloor => "reject_floor",
}
}
}
/// Outcome of one trust-region radius update, as returned by
/// `update_joint_trust_region_radius`.
#[derive(Clone, Copy, Debug)]
struct JointTrustRegionUpdate {
/// Actual-to-predicted reduction ratio. Set to `1.0` when the actual
/// reduction is within the noise floor, and to `-inf` when the predicted
/// reduction is not a finite value above the noise floor.
rho: f64,
/// New trust radius, clamped to `[1e-12, 1e6]`.
radius: f64,
/// Whether the step was accepted.
accepted: bool,
/// Which branch of the radius policy produced `radius`.
decision: JointTrustRegionDecision,
}
/// Applies the joint trust-region radius policy to one attempted step.
///
/// * `old_radius` — radius the step was computed under.
/// * `step_norm` — norm of the attempted step (same metric as the radius).
/// * `actual_reduction` / `predicted_reduction` — realized and model-predicted
///   objective decrease.
/// * `objective_scale` — magnitude used to size the round-off noise floor,
///   `max(|objective_scale|, 1) · 1e-14`.
///
/// Returns the ratio `rho`, the new radius clamped to `[1e-12, 1e6]`, whether
/// the step is accepted, and which policy branch fired.
fn update_joint_trust_region_radius(
    old_radius: f64,
    step_norm: f64,
    actual_reduction: f64,
    predicted_reduction: f64,
    objective_scale: f64,
) -> JointTrustRegionUpdate {
    // Below this magnitude the sign of a reduction is round-off in the
    // objective evaluation, not genuine descent or ascent (mirrors the
    // `noise_floor` handling in src/solver/pirls.rs).
    let noise_floor = objective_scale.abs().max(1.0) * 1e-14;

    let realized_is_noise = actual_reduction.abs() <= noise_floor;
    let model_is_usable = predicted_reduction.is_finite() && predicted_reduction > noise_floor;
    let rho = match (realized_is_noise, model_is_usable) {
        // Neutrality is keyed on the *actual* reduction, whatever the
        // prediction is. Dividing two round-off-level quantities can yield a
        // spurious negative ratio that rejects a tiny-but-valid Newton step
        // near a flat optimum and ratchets the radius to the floor (gam#797).
        (true, _) => 1.0,
        (false, true) => actual_reduction / predicted_reduction,
        (false, false) => f64::NEG_INFINITY,
    };
    let accepted = rho.is_finite() && rho > 0.0 && actual_reduction >= -noise_floor;

    let (proposed_radius, base_decision) = if !accepted {
        // Rejected: quarter the radius and keep it inside the rejected step.
        let quartered = old_radius * 0.25;
        let capped = if step_norm.is_finite() && step_norm > 0.0 {
            quartered.min(0.5 * step_norm)
        } else {
            quartered
        };
        (capped, JointTrustRegionDecision::ShrinkOnRejection)
    } else if rho < 0.25 {
        (
            old_radius * 0.25,
            JointTrustRegionDecision::ShrinkOnMarginalAccept,
        )
    } else if rho > 0.75 {
        // Good model: grow only if the step actually pressed on the boundary.
        if step_norm >= 0.99 * old_radius {
            (old_radius * 2.0, JointTrustRegionDecision::GrowAtBoundary)
        } else {
            (old_radius, JointTrustRegionDecision::HoldInside)
        }
    } else {
        (old_radius, JointTrustRegionDecision::HoldModerate)
    };

    // Sanitize before clamping: `clamp` would pass a NaN radius through.
    let sanitized_radius = if proposed_radius.is_finite() && proposed_radius > 0.0 {
        proposed_radius
    } else {
        1.0e-12
    };
    let radius = sanitized_radius.clamp(1.0e-12, 1.0e6);

    // A shrink that lands on the absolute floor is relabelled `RejectFloor`,
    // the stronger "no descent direction exists at this radius" signal. Other
    // classifications are left as they are.
    let at_floor = radius <= 1.0e-12 + f64::EPSILON;
    let shrank = matches!(
        base_decision,
        JointTrustRegionDecision::ShrinkOnRejection
            | JointTrustRegionDecision::ShrinkOnMarginalAccept
    );
    let decision = if at_floor && shrank {
        JointTrustRegionDecision::RejectFloor
    } else {
        base_decision
    };

    JointTrustRegionUpdate {
        rho,
        radius,
        accepted,
        decision,
    }
}
/// Round-off allowance for comparing two objective values:
/// `64·ε·(1 + |old| + |trial|)`, never less than `1e-10`.
fn joint_objective_roundoff_slack(old_objective: f64, trial_objective: f64) -> f64 {
    let magnitude = 1.0 + old_objective.abs() + trial_objective.abs();
    let relative_slack = 64.0 * f64::EPSILON * magnitude;
    relative_slack.max(1.0e-10)
}
// True iff the line search detected a noise-level realized reduction (i.e.
// the trial step neither helped nor hurt the objective beyond round-off)
// AND the local quadratic model agrees that no further descent is available
// within tolerance. `actual_reduction <= 0` is kept (not made sign-symmetric)
// because at rank-deficient optima (σ_min(H) ≲ ε_machine) the outer-gradient
// FD identity requires β trajectories to be CONSISTENT across λ probes —
// accepting positive-noise-level reductions exits the loop one attempt
// earlier than the negative case and decorrelates the null-space drift
// between consecutive REML evaluations. Concretely:
// `outer_lamlgradient_matches_finite_differencewhen_joint_exact_path_is_active`
// at HardPseudo σ_min ~ 1e-10 fails when symmetric. The asymmetric guard
// preserves the spin avoidance for the common (negative-noise) case at
// large scale while leaving the rank-deficient FD identity intact.
fn joint_objective_floor_reached(
old_objective: f64,
trial_objective: f64,
actual_reduction: f64,
predicted_reduction: f64,
objective_tol: f64,
) -> bool {
trial_objective.is_finite()
&& actual_reduction <= 0.0
&& actual_reduction.abs() <= joint_objective_roundoff_slack(old_objective, trial_objective)
&& predicted_reduction.is_finite()
&& predicted_reduction
<= objective_tol.max(joint_objective_roundoff_slack(
old_objective,
trial_objective,
))
}
/// True iff the joint-Newton proposal already sits at the step-tolerance
/// floor: both inputs are finite and the unclamped Newton step's inf-norm is at
/// most `STEP_FLOOR_CERT_FACTOR × step_tol` (the same "a hair above tol" band
/// the constrained-stationary certificate uses).
///
/// At the floor the iterate is doing KKT polishing on a flat objective, so a
/// `predicted_reduction = rhs·δ − ½δᵀHδ ≤ 0` is the SIGN of two near-equal
/// O(step²) quantities (round-off), not evidence of a model-invalid descent
/// direction. The preconditioned-descent substitution must be suppressed there;
/// otherwise it swaps the tiny polishing step for an objective-descent step
/// that throws the KKT residual off the near-converged iterate (gam#787 binary
/// matern centers=12: residual 1.7e-4 → 4.7e-1, never recovers).
fn joint_proposal_at_step_floor(proposal_step_inf: f64, step_tol: f64) -> bool {
    const STEP_FLOOR_CERT_FACTOR: f64 = 4.0;
    if !(proposal_step_inf.is_finite() && step_tol.is_finite()) {
        return false;
    }
    let floor_band = STEP_FLOOR_CERT_FACTOR * step_tol;
    proposal_step_inf <= floor_band
}
/// Norm of `delta` in the diagonal metric `metric_diag`; owned-array front end
/// for `joint_trust_region_metric_step_norm_view`.
///
/// # Panics
/// Panics if the two arrays differ in length.
fn joint_trust_region_metric_step_norm(delta: &Array1<f64>, metric_diag: &Array1<f64>) -> f64 {
    assert_eq!(delta.len(), metric_diag.len());
    let (step, weights) = (delta.view(), metric_diag.view());
    joint_trust_region_metric_step_norm_view(step, weights)
}
/// Norm of `delta` in a diagonal metric: `sqrt(Σ δᵢ² · mᵢ)`, where each `mᵢ` is
/// the `metric_diag` entry passed through `positive_joint_diagonal_entry`.
///
/// # Panics
/// Panics if the two views differ in length.
fn joint_trust_region_metric_step_norm_view(
    delta: ArrayView1<f64>,
    metric_diag: ArrayView1<f64>,
) -> f64 {
    assert_eq!(delta.len(), metric_diag.len());
    let weighted_squares = delta
        .iter()
        .zip(metric_diag.iter())
        .map(|(&component, &raw_weight)| {
            component * component * positive_joint_diagonal_entry(raw_weight)
        });
    let squared_norm = weighted_squares.sum::<f64>();
    squared_norm.sqrt()
}
/// Metric norm of `delta` restricted to each `[start, end)` range, returned in
/// the order of `ranges`.
///
/// # Panics
/// Panics if `delta` and `metric_diag` differ in length.
fn joint_trust_region_block_metric_norms(
    delta: &Array1<f64>,
    ranges: &[(usize, usize)],
    metric_diag: &Array1<f64>,
) -> Vec<f64> {
    assert_eq!(delta.len(), metric_diag.len());
    let mut block_norms = Vec::with_capacity(ranges.len());
    for &(start, end) in ranges {
        let step_block = delta.slice(s![start..end]);
        let weight_block = metric_diag.slice(s![start..end]);
        block_norms.push(joint_trust_region_metric_step_norm_view(
            step_block,
            weight_block,
        ));
    }
    block_norms
}
/// Shrinks each block of `delta` in place so that its metric norm does not
/// exceed that block's trust radius, and returns the resulting block norms.
///
/// A block is rescaled only when its norm is finite, strictly larger than the
/// radius, and the radius is positive; its reported norm is then the radius.
/// Every other block is left untouched and reports its measured norm.
///
/// # Panics
/// Panics if `ranges`/`block_radii` or `delta`/`metric_diag` differ in length.
fn truncate_joint_step_to_block_metric_radii(
    delta: &mut Array1<f64>,
    ranges: &[(usize, usize)],
    metric_diag: &Array1<f64>,
    block_radii: &[f64],
) -> Vec<f64> {
    assert_eq!(ranges.len(), block_radii.len());
    assert_eq!(delta.len(), metric_diag.len());
    ranges
        .iter()
        .zip(block_radii.iter())
        .map(|(&(start, end), &radius)| {
            let weights = metric_diag.slice(s![start..end]);
            let mut segment = delta.slice_mut(s![start..end]);
            let length = joint_trust_region_metric_step_norm_view(segment.view(), weights);
            let exceeds_radius = length.is_finite() && length > radius && radius > 0.0;
            if !exceeds_radius {
                return length;
            }
            let shrink = radius / length;
            segment.mapv_inplace(|v| v * shrink);
            radius
        })
        .collect()
}
/// True when a block step counts as pressing on its trust boundary: the step
/// norm is finite, the radius is positive, and the norm reaches at least 99%
/// of the radius.
fn joint_block_step_hit_trust_boundary(step_norm: f64, radius: f64) -> bool {
    let inputs_usable = step_norm.is_finite() && radius > 0.0;
    inputs_usable && step_norm >= 0.99 * radius
}
/// Per-block dogleg step (Powell, blending the Cauchy and Newton points within
/// the block's M-metric trust radius). This is the principled globalization for
/// the coupled location-scale inner Newton (gam#826/#808): box-truncating the
/// Newton step alone freezes progress when the spectral solve is degenerate at
/// the oversmoothed seed — the high-curvature `log_sigma` block has
/// `λ ~ exp(2·ρ_bound)` so its Newton component is `O(g/λ) ≈ 5e-21`, the
/// mean/trend blocks get isotropically shrunk to the radius, and the residual
/// stalls while β barely moves. The dogleg always includes the Cauchy leg
/// (the model-minimizing steepest-descent step in the block metric), so the
/// realized decrease is at least the Cauchy decrease whenever the block
/// gradient is nonzero — progress is guaranteed even when the Newton step is
/// numerically frozen. Inside the radius the dogleg returns the exact Newton
/// step, so the converged β, the KKT certificate, and the well-conditioned /
/// #729 endgame are byte-identical to the undamped solve.
///
/// Inputs per block `b`:
///   * `newton[start..end]` — Newton (spectral) step block `δ_N`.
///   * `cauchy[start..end]` — the FULL (unconstrained) Cauchy block
///     `δ_C = τ·p_sd`, where `p_sd = M⁻¹·rhs` is the M-metric steepest-descent
///     direction of the model and `τ` minimizes the model along it; precomputed
///     once per cycle by `joint_cauchy_step` (the curvature `p_sd·H·p_sd` needs
///     a coupled Hessian-vector product, so it must be hoisted out of the
///     radius-shrink loop).
///   * `radius` — the block's current M-metric trust radius.
///
/// Per-block outcome, first matching rule wins:
///   1. radius non-finite or non-positive → zero step;
///   2. Newton norm finite and within the radius → exact Newton step;
///   3. Cauchy norm non-finite or zero → Newton scaled to the radius, or a zero
///      step when the Newton norm is not usable either;
///   4. Cauchy norm at or beyond the radius → Cauchy scaled to the radius;
///   5. Newton norm non-finite → the (in-radius) Cauchy point;
///   6. otherwise → the point on the segment `δ_C → δ_N` that meets the radius.
///
/// Rules 3 and 5 never copy a non-finite Newton block into `out`: multiplying
/// NaN/inf entries by a zero scale, or blending them with weight zero, still
/// yields NaN, which would contradict the norm reported for the block.
///
/// Returns the block step norms in the M-metric (same contract as
/// `truncate_joint_step_to_block_metric_radii`) and overwrites `out` with the
/// dogleg blend per block.
///
/// # Panics
/// Panics if `ranges`/`block_radii` differ in length, or if any of `newton`,
/// `cauchy`, `out` differs in length from `metric_diag`.
fn joint_dogleg_step_to_block_metric_radii(
    newton: &Array1<f64>,
    cauchy: &Array1<f64>,
    ranges: &[(usize, usize)],
    metric_diag: &Array1<f64>,
    block_radii: &[f64],
    out: &mut Array1<f64>,
) -> Vec<f64> {
    assert_eq!(ranges.len(), block_radii.len());
    assert_eq!(newton.len(), metric_diag.len());
    assert_eq!(cauchy.len(), metric_diag.len());
    assert_eq!(out.len(), metric_diag.len());
    let mut norms = Vec::with_capacity(ranges.len());
    for (block_idx, (start, end)) in ranges.iter().copied().enumerate() {
        let metric_view = metric_diag.slice(s![start..end]);
        let newton_b = newton.slice(s![start..end]);
        let cauchy_b = cauchy.slice(s![start..end]);
        let radius = block_radii[block_idx];
        let newton_norm = joint_trust_region_metric_step_norm_view(newton_b, metric_view);
        let cauchy_norm = joint_trust_region_metric_step_norm_view(cauchy_b, metric_view);
        let mut out_b = out.slice_mut(s![start..end]);

        // Degenerate radius (non-finite or non-positive): nothing moves.
        if !radius.is_finite() || radius <= 0.0 {
            out_b.fill(0.0);
            norms.push(0.0);
            continue;
        }

        // Newton step inside the radius: take it exactly. This is the only
        // branch a well-conditioned / converging fit reaches near the optimum,
        // so the endgame numerics are unchanged.
        if newton_norm.is_finite() && newton_norm <= radius {
            out_b.assign(&newton_b);
            norms.push(newton_norm);
            continue;
        }

        // Cauchy step unusable: fall back to scaling the Newton step to the
        // boundary (pre-dogleg behavior). If the Newton norm is unusable too,
        // emit an explicit zero step; scaling NaN/inf entries by 0.0 would
        // leave NaN in `out` while the reported norm says "no move".
        if !(cauchy_norm.is_finite() && cauchy_norm > 0.0) {
            let scale = if newton_norm.is_finite() && newton_norm > 0.0 {
                radius / newton_norm
            } else {
                0.0
            };
            if scale > 0.0 {
                out_b.assign(&newton_b);
                out_b.mapv_inplace(|v| v * scale);
                norms.push(radius);
            } else {
                out_b.fill(0.0);
                norms.push(0.0);
            }
            continue;
        }

        // Cauchy leg reaches or crosses the boundary: scale it onto the boundary.
        if cauchy_norm >= radius {
            let scale = radius / cauchy_norm;
            out_b.assign(&cauchy_b);
            out_b.mapv_inplace(|v| v * scale);
            norms.push(radius);
            continue;
        }

        // Newton block is non-finite but the Cauchy point is usable and inside
        // the radius: take the Cauchy point. The blend below would otherwise
        // propagate NaN through `a`, `b` and `theta` into `out`.
        if !newton_norm.is_finite() {
            out_b.assign(&cauchy_b);
            norms.push(cauchy_norm);
            continue;
        }

        // Dogleg blend: δ(θ) = δ_C + θ·(δ_N − δ_C), θ ∈ [0,1], pick θ so
        // ‖δ(θ)‖_M = radius. Solve the quadratic ‖δ_C + θ·d‖²_M = radius² with
        // d = δ_N − δ_C, a = ‖d‖²_M, b = 2·⟨δ_C, d⟩_M, c = ‖δ_C‖²_M − radius².
        let mut a = 0.0_f64;
        let mut b = 0.0_f64;
        for ((cb, nb), w) in cauchy_b.iter().zip(newton_b.iter()).zip(metric_view.iter()) {
            let m = positive_joint_diagonal_entry(*w);
            let d = nb - cb;
            a += m * d * d;
            b += 2.0 * m * cb * d;
        }
        let c = cauchy_norm * cauchy_norm - radius * radius;
        // a > 0 because δ_N ≠ δ_C here (Newton outside, Cauchy inside the
        // radius). Largest root in [0,1] keeps the step on the dogleg path.
        let disc = (b * b - 4.0 * a * c).max(0.0);
        let theta = if a > 0.0 {
            ((-b + disc.sqrt()) / (2.0 * a)).clamp(0.0, 1.0)
        } else {
            0.0
        };
        for ((o, cb), nb) in out_b.iter_mut().zip(cauchy_b.iter()).zip(newton_b.iter()) {
            *o = cb + theta * (nb - cb);
        }
        let norm = joint_trust_region_metric_step_norm_view(out_b.view(), metric_view);
        norms.push(norm);
    }
    norms
}
/// Unconstrained Cauchy point of the joint penalized quadratic model
/// `m(δ) = −rhs·δ + ½·δᵀHδ` along the M-metric steepest-descent direction
/// `p_sd = M⁻¹·rhs`.
///
/// * `rhs` — right-hand side (negative model gradient at δ = 0).
/// * `p_sd` — the steepest-descent direction.
/// * `h_psd` — `H·p_sd` for the SAME penalized (and Firth-augmented, when
///   armed) Hessian the trust-region model uses, so the dogleg path stays
///   consistent with the accept/reject quadratic.
///
/// Returns:
/// * zeros when `rhs·p_sd` is non-finite or non-positive (no descent along
///   `p_sd`; the dogleg then falls back to the Newton leg);
/// * `τ·p_sd` with `τ = (rhs·p_sd)/(p_sd·H·p_sd)` when the curvature is finite
///   and positive and `τ` is finite and positive;
/// * `p_sd` unscaled otherwise — with non-positive curvature the model is
///   unbounded below along `p_sd`, and the dogleg's boundary scaling takes the
///   step to the trust radius.
fn joint_cauchy_step(rhs: &Array1<f64>, p_sd: &Array1<f64>, h_psd: &Array1<f64>) -> Array1<f64> {
    let slope = rhs.dot(p_sd);
    let descends = slope.is_finite() && slope > 0.0;
    if !descends {
        return Array1::zeros(p_sd.len());
    }

    let curvature = p_sd.dot(h_psd);
    let model_minimizer = if curvature.is_finite() && curvature > 0.0 {
        let tau = slope / curvature;
        if tau.is_finite() && tau > 0.0 {
            Some(tau)
        } else {
            None
        }
    } else {
        None
    };

    let mut step = p_sd.clone();
    if let Some(tau) = model_minimizer {
        step.mapv_inplace(|v| tau * v);
    }
    step
}
/// Shrinks the per-block trust radii after a rejected joint-Newton step and
/// returns the largest radius afterwards.
///
/// The shrink must strictly decrease `max(block_radii)` until the floor is
/// reached; otherwise the next trust-region attempt reproduces the rejected
/// step exactly and the inner loop stalls (the fully-rejected-cycle stall
/// behind the bail at `FULLY_REJECTED_STALL_MAX_CYCLES`).
///
/// Two rules cooperate:
///
/// 1. **Radius update.** A participating block's radius becomes
///    `min(radius · factor, 0.5 · step_norm)` (the second term only when
///    `step_norm` is finite and positive), clamped to `[1e-12, 1e6]`. Pulling
///    the radius below the rejected step's own magnitude mirrors the reject
///    branch of `update_joint_trust_region_radius` and guarantees a strictly
///    smaller next step even when `radius · factor` still exceeds `step_norm`
///    (a Newton step shorter than the block radius).
/// 2. **Participation.** Normally only blocks whose step hit their trust
///    boundary shrink: interior blocks took their natural Newton step, so
///    shrinking them is wasted. Every block participates when no block is on
///    its boundary, and also when every boundary block already sits at the
///    floor. In the latter case re-clamping the boundary blocks is a no-op,
///    so only shrinking the interior blocks can change the joint step.
///    Without that carve-out the boundary block stays pinned at `1e-12`, the
///    interior block keeps `max(block_radii)` fixed, every attempt yields an
///    identical δ, and the loop burns to `inner_loop_hard_ceiling` (the CI
///    hang / `rust_margslope_aniso_duchon16d_*` timeout).
///
/// # Panics
/// Panics when the two slices differ in length.
fn shrink_active_joint_block_trust_radii(
    block_radii: &mut [f64],
    block_step_norms: &[f64],
    factor: f64,
) -> f64 {
    const RADIUS_FLOOR: f64 = 1.0e-12;
    const RADIUS_CEILING: f64 = 1.0e6;
    assert_eq!(block_radii.len(), block_step_norms.len());

    // Read-only scan: is any block on its trust boundary, and if so are all
    // of those boundary blocks already (numerically) at the floor?
    let mut any_boundary_block = false;
    let mut boundary_blocks_all_at_floor = true;
    for (radius, step_norm) in block_radii.iter().zip(block_step_norms) {
        if joint_block_step_hit_trust_boundary(*step_norm, *radius) {
            any_boundary_block = true;
            if !(*radius <= RADIUS_FLOOR * (1.0 + 1.0e-12)) {
                boundary_blocks_all_at_floor = false;
            }
        }
    }
    // Either nothing constrained the step, or the constrained blocks cannot
    // shrink any further: in both cases every block has to take part.
    let shrink_every_block = !any_boundary_block || boundary_blocks_all_at_floor;

    let mut largest_radius = 0.0_f64;
    for (radius, step_norm) in block_radii.iter_mut().zip(block_step_norms) {
        let hit_boundary = joint_block_step_hit_trust_boundary(*step_norm, *radius);
        if shrink_every_block || hit_boundary {
            let scaled = *radius * factor;
            let capped = if step_norm.is_finite() && *step_norm > 0.0 {
                scaled.min(0.5 * *step_norm)
            } else {
                scaled
            };
            *radius = capped.clamp(RADIUS_FLOOR, RADIUS_CEILING);
        }
        largest_radius = largest_radius.max(*radius);
    }
    largest_radius
}
/// Enforces per-block feasibility on the JOINT trial step by scaling the whole
/// step with the smallest feasible fraction reported by any block.
///
/// Each block in `ranges` (half-open `(start, end)` coefficient ranges of
/// `trial_delta`) is asked for its maximum feasible step size via
/// `family.max_feasible_step_size`. If the minimum over all blocks is below
/// `1.0`, `trial_delta` is multiplied by it in place and `Ok(true)` is
/// returned; otherwise the step is left untouched and `Ok(false)` is returned.
///
/// Why the minimum is applied jointly rather than block by block: the joint
/// Newton direction δ̂ = H⁻¹(−g) is a descent direction for the local
/// quadratic model only up to a positive scalar. Any α·δ̂ with α ∈ (0, 1]
/// remains one, whereas scaling a single block, (α·δ̂_A, δ̂_B, …), is
/// neither δ̂ nor α·δ̂ and need not descend on the joint quadratic. That was
/// the production survival_marginal_slope failure at large scale: the time
/// block's monotonicity guard returned α ≈ 1e-4, per-block scaling crushed
/// δ_time to ~2.3e-4 while logslope kept its full step, and ‖g_time‖ stayed
/// at ≈ 5.6e8 for 15+ cycles, tripping the linearized-rate stall early-exit
/// on every outer seed. The trust region / line search still picks the step
/// length along the direction; this check only layers feasibility on top.
///
/// # Errors
/// Propagates any error from `max_feasible_step_size`, and returns an error
/// when a block reports a non-finite or non-positive feasible step size.
fn apply_joint_feasibility_limit<F: CustomFamily + ?Sized>(
    family: &F,
    states: &[ParameterBlockState],
    ranges: &[(usize, usize)],
    trial_delta: &mut Array1<f64>,
) -> Result<bool, String> {
    let mut limiting_block: Option<usize> = None;
    let mut joint_alpha = 1.0_f64;

    for (block_idx, &(start, end)) in ranges.iter().enumerate() {
        let block_delta = trial_delta.slice(s![start..end]).to_owned();
        let Some(alpha_max) = family.max_feasible_step_size(states, block_idx, &block_delta)?
        else {
            // This block imposes no feasibility bound.
            continue;
        };
        if !(alpha_max.is_finite() && alpha_max > 0.0) {
            return Err(format!(
                "joint Newton block {block_idx} has no positive feasible step"
            ));
        }
        if alpha_max < joint_alpha {
            limiting_block = Some(block_idx);
            joint_alpha = alpha_max;
        }
    }

    if !(joint_alpha < 1.0) {
        return Ok(false);
    }
    // One common factor for every coordinate keeps the Newton direction.
    trial_delta.mapv_inplace(|v| joint_alpha * v);
    log::debug!(
        "[PIRLS/joint-Newton] feasibility scaled joint step by α={:.3e} (block {:?} binding)",
        joint_alpha,
        limiting_block,
    );
    Ok(true)
}
/// Returns `true` when the KKT residual meets the tolerance.
///
/// Both `residual` and `residual_tol` must be finite; a NaN or infinite value
/// on either side is never reported as converged.
fn joint_inner_kkt_converged(residual: f64, residual_tol: f64) -> bool {
    if !(residual.is_finite() && residual_tol.is_finite()) {
        return false;
    }
    residual <= residual_tol
}
/// Per-iterate diagnostic snapshot assembled when the joint Newton inner solve
/// refuses to certify constrained-stationarity. The report breaks the failure
/// down by block (so the offending smooth can be named), records the H_pen
/// eigenvalue spectrum (so rank-deficiency in the penalized Hessian is
/// detectable from logs), and classifies the refusal so downstream tooling
/// can act without re-deriving the cert math.
///
/// NOTE(review): the per-field remarks below are read off the field names and
/// types only; the code that populates the report is not visible here, so
/// confirm them against the builder before relying on them.
#[derive(Clone, Debug)]
struct KktRefusalReport {
// --- Per-block breakdown -------------------------------------------------
// Presumably index-aligned vectors with one entry per parameter block; the
// `_inf` suffix presumably denotes an ∞-norm over the block's coordinates.
block_names: Vec<String>,
block_widths: Vec<usize>,
block_beta_inf: Vec<f64>,
block_grad_inf: Vec<f64>,
block_penalty_grad_inf: Vec<f64>,
block_residual_inf: Vec<f64>,
// Index of the block singled out as carrying the residual, if any
// (selection rule not visible here — TODO confirm).
block_carrying_residual: Option<usize>,
// --- H_pen spectrum summary ----------------------------------------------
// Eigenvalues in descending order, per the field name.
hpen_eigenvalues_sorted_desc: Vec<f64>,
hpen_min_abs_eigenvalue: f64,
hpen_max_abs_eigenvalue: f64,
hpen_condition_number: f64,
// Nullity counted at `hpen_rank_tol` (presumably a relative cutoff such as
// `KKT_REFUSAL_RANK_TOL` — confirm).
hpen_nullity_at_rank_tol: usize,
hpen_rank_tol: f64,
// Null-space diagnostics: gradient magnitude in the null space and a
// per-block view of a null vector (exact definitions — TODO confirm).
hpen_null_gradient_inf: f64,
hpen_null_vector_block_inf: Vec<f64>,
hpen_null_vector_carrying_block: Option<usize>,
// --- Solver state at the refusing iterate --------------------------------
active_set_rows_total: usize,
accepted_step_inf: f64,
proposal_step_inf: f64,
trust_radius: f64,
cycle: usize,
// --- Tolerances in force -------------------------------------------------
residual_tol: f64,
obj_tol: f64,
step_tol: f64,
// --- Convergence measurements --------------------------------------------
linearized_rel: f64,
scalar_model_relerr: f64,
objective_change: f64,
projected_residual_inf: f64,
// Classification of the refusal (see `KktRefusalDiagnosis`).
diagnosis: KktRefusalDiagnosis,
}
/// Classification of why the cert refused, computed from the H_pen spectrum
/// and the projected residual at the refusing iterate.
///
/// `RankDeficientHPen` is the regression canary the nullspace lead's
/// smooth-construction rework is intended to eliminate; keep this variant
/// intact when extending — it doubles as the user-facing signal for
/// "an unconstrained polynomial null space slipped past absorption."
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub(crate) enum KktRefusalDiagnosis {
    /// The penalized Hessian has a structural or numerical null direction
    /// that the likelihood/penalty combination does not identify.
    RankDeficientHPen,
    /// The penalized Hessian is well conditioned, yet a near-separated or
    /// weakly identified direction keeps the certificate from closing.
    PhantomMultiplierWithWellConditionedH,
    /// The linear-constraint active set is missing a row (or needs a tighter
    /// constrained re-solve); an active-set certification failure.
    ActiveSetIncomplete,
    /// Cross-block identifiability aliasing surfaced mid-inner-solve
    /// (e.g., a binding active set materialised a 2-way alias that
    /// the pre-fit audit could not see at the cold design). The fix
    /// is structural — drop or reparameterise the aliased block;
    /// rho-anneal will not recover.
    AliasingDetectedAtFit,
}

impl KktRefusalDiagnosis {
    /// Every variant, used by [`Self::parse_from_error`] so the label table
    /// lives only in [`Self::as_str`]. Extend this list when adding a variant.
    const ALL: [Self; 4] = [
        KktRefusalDiagnosis::RankDeficientHPen,
        KktRefusalDiagnosis::PhantomMultiplierWithWellConditionedH,
        KktRefusalDiagnosis::ActiveSetIncomplete,
        KktRefusalDiagnosis::AliasingDetectedAtFit,
    ];

    /// Stable machine-readable label embedded in bubbled error strings.
    pub(crate) fn as_str(&self) -> &'static str {
        match self {
            KktRefusalDiagnosis::RankDeficientHPen => "rank_deficient_H_pen",
            KktRefusalDiagnosis::PhantomMultiplierWithWellConditionedH => {
                "phantom_multiplier_with_well_conditioned_H"
            }
            KktRefusalDiagnosis::ActiveSetIncomplete => "active_set_incomplete",
            KktRefusalDiagnosis::AliasingDetectedAtFit => "aliasing_detected_at_fit",
        }
    }

    /// Parse the textual `diagnosis:` field embedded in the structured
    /// bubbled error string. Returns `None` when no recognised label is
    /// present (legacy / non-cert-refusal error strings).
    ///
    /// The LAST `diagnosis: ` marker in `message` is used. The label ends at
    /// the first `;` or whitespace character of any kind (space, newline,
    /// carriage return, tab, …), so CRLF-terminated or tab-separated messages
    /// parse the same as space/LF-separated ones.
    pub(crate) fn parse_from_error(message: &str) -> Option<Self> {
        const MARKER: &str = "diagnosis: ";
        let (_, tail) = message.rsplit_once(MARKER)?;
        // `split` always yields at least one (possibly empty) piece.
        let label = tail
            .split(|c: char| c == ';' || c.is_whitespace())
            .next()
            .unwrap_or(tail);
        Self::ALL
            .iter()
            .copied()
            .find(|candidate| candidate.as_str() == label)
    }

    /// Human-readable remediation hint attached to the refusal report.
    fn guidance(self) -> &'static str {
        match self {
            KktRefusalDiagnosis::RankDeficientHPen => {
                "check whether the named block has a structural or numerical null direction \
                 not identified by the likelihood/penalty combination; for Duchon-style \
                 smooths this may be a polynomial null space, while marginal-slope fits can \
                 also expose callback-owned weak directions"
            }
            KktRefusalDiagnosis::PhantomMultiplierWithWellConditionedH => {
                "check whether the named block has a near-separated or weakly identified \
                 direction despite a well-conditioned penalized Hessian; in marginal-slope \
                 fits this often indicates marginal/logslope coupling rather than a \
                 Matérn/Duchon polynomial-nullspace failure"
            }
            KktRefusalDiagnosis::ActiveSetIncomplete => {
                "check whether the named block's linear constraints need an additional \
                 active row or a tighter constrained re-solve; this is an active-set \
                 certification failure, not a polynomial-nullspace diagnosis"
            }
            KktRefusalDiagnosis::AliasingDetectedAtFit => {
                "check whether the named block aliases another block after runtime \
                 constraints or callbacks materialize; drop or reparameterize the aliased \
                 direction before fitting"
            }
        }
    }
}
/// Relative cutoff on `|λ|/λ_max` below which an eigen-direction of `H_pen`
/// counts toward its nullity. Kept equal to the "structurally zero" threshold
/// used by the surrounding REML penalty-rank machinery.
const KKT_REFUSAL_RANK_TOL: f64 = 1.0e-10;
/// Self-vanishing Levenberg–Marquardt damping factor for the range-restricted
/// spectral Newton step (`solve_joint_newton_step_on_spectral_range`).
///
/// **How it is used.** The caller forms the residual-scaled magnitude
/// `μ = JOINT_SPECTRAL_LEVENBERG_FACTOR · ‖∇L − Sβ‖∞`. The solve converts it to
/// a DIMENSIONLESS, scale-invariant Marquardt damping `ν = μ / λ_max` and
/// applies it MULTIPLICATIVELY to each range curvature (`curvature·(1 + ν)`),
/// not additively (`curvature + μ`).
///
/// **Why multiplicative.** On a coupled location-scale joint Hessian the
/// spectrum spans the penalty scale (`λ ~ e²⁴` at the oversmoothed seed) and
/// the likelihood scale (the mean/wiggle XᵀWX curvature). An ADDITIVE μ — set
/// by the penalty-inflated residual — swamps the small likelihood curvature
/// and freezes that block (#826), whereas the multiplicative `1/(1+ν)`
/// throttle is identical across all scales so no block stalls.
///
/// **What both forms share.** Each caps the unbounded `component/λ` step along
/// near-singular (ill-conditioned but above-`KKT_REFUSAL_RANK_TOL`)
/// eigen-directions — the modes that make the undamped step oscillate — and
/// each vanishes as the iterate converges (`ν → 0`), recovering the exact
/// Moore–Penrose Newton step so the KKT fixed point and the well-identified
/// fast path are unchanged.
///
/// **Magnitude.** `1e-3` keeps the damping two to three orders below the
/// dominant curvature on a well-conditioned problem.
const JOINT_SPECTRAL_LEVENBERG_FACTOR: f64 = 1.0e-3;
/// Result of one spectral joint-Newton solve: the step itself plus the
/// spectrum diagnostics gathered while assembling it.
///
/// The field notes describe how `whitened_spectrum::WhitenedHessianSpectrum`
/// fills the struct. NOTE(review): other producers of this struct are not
/// visible here and may populate fields differently — confirm before relying
/// on these notes for them.
#[derive(Clone, Debug)]
struct JointSpectralNewtonStep {
/// The step `δ` in original coordinates (whitened step mapped back through
/// `D^{-1/2}`).
delta: Array1<f64>,
/// Largest `|c_k|` over identified (non-null) eigen-directions, where `c_k`
/// is the rhs component in the whitened eigenbasis.
range_rhs_inf: f64,
/// Largest `|c_k|` over null eigen-directions, i.e. the stationarity-residual
/// component the step cannot act on.
null_rhs_inf: f64,
/// `max_k |γ_k|` of the whitened spectrum.
lambda_max_abs: f64,
/// Smallest non-negative identified eigenvalue; stays `f64::INFINITY` when
/// every identified eigenvalue is negative or none is identified.
lambda_min_positive: f64,
/// Number of eigen-directions treated as null and dropped from the step.
nullity: usize,
/// Relative rank tolerance reported with the step (the whitened-spectrum
/// producer always records `KKT_REFUSAL_RANK_TOL`).
rank_tol: f64,
/// Number of eigen-directions whose curvature was negative (beyond the
/// rank cutoff) and was reflected to `|λ|` to form a modified-Newton
/// descent step. Zero for a genuinely positive-semidefinite model.
/// The exact trust-region path also counts negative modes here, although it
/// handles them through the Levenberg shift rather than by reflection.
reflected_negative_modes: usize,
/// Most negative eigenvalue encountered (≤ 0); `0.0` when the model was
/// positive-semidefinite within the rank cutoff.
most_negative_eigenvalue: f64,
}
/// Production home for the exact trust-region engine ([`WhitenedHessianSpectrum`]),
/// wired into the unconstrained dense-spectral joint-Newton step in
/// `inner_blockwise_fit` (gam#979). Kept in its own module so the engine's
/// helpers stay namespaced; the parent reaches it via `whitened_spectrum::`.
mod whitened_spectrum {
    use super::*;

    /// Eigendecomposition of the metric-whitened penalized Hessian, retained so
    /// every trust-radius shrink within one Newton cycle re-solves the
    /// trust-region subproblem from the SAME `O(p³)` factorization at `O(p)` cost.
    ///
    /// # Why this exists (gam#979)
    ///
    /// The coupled marginal↔logslope inner Newton needs ONE globalization, not a
    /// stack of approximations. Historically the joint step was a *modified-Newton*
    /// (reflect indefinite eigenvalues to `|λ|`) wrapped in a *heuristically gated*
    /// multiplicative Marquardt damping (engaged on `nullity>0`, or condition number
    /// over a threshold, or after N non-improving cycles) and then a *dogleg* between
    /// that step and the Cauchy point, truncated to per-block step-norm trust radii.
    /// Each piece approximates a different facet of the one exact object below, and
    /// each had to be gated so it would not re-break the case another piece was added
    /// for (#826 vs #808 vs #733/#734 vs #787). When none of the gates matched the
    /// operating point — well-conditioned `H_pen`, yet a coupled near-aliased
    /// direction with a huge raw Newton component — the truncated direction made only
    /// Cauchy-sized progress, the gain ratio never justified growing the radius, and
    /// the residual crawled for hundreds of cycles (the #979 "phantom multiplier"
    /// grind / survival hang).
    ///
    /// [`Self::trust_region_step`] replaces all of that with the *exact* solution of
    /// the trust-region subproblem
    ///   minimize `−rhsᵀδ + ½ δᵀ H_pen δ`  subject to  `‖δ‖_D ≤ r`,
    /// via the Moré–Sorensen characterization: the minimizer is `δ(λ) = (H_pen +
    /// λD)⁻¹ rhs` for the unique `λ ≥ max(0, −γ_min)` with `‖δ(λ)‖_D = r` (or `λ = 0`
    /// when the Newton step is interior and `H_pen ≻ 0`). Working in the `D`-metric
    /// generalized eigenbasis this is a scalar secular equation in `λ`, solved by a
    /// safeguarded Newton iteration on the already-computed spectrum. Properties that
    /// make it the right object:
    ///   * indefiniteness is handled exactly (`λ ≥ −γ_min` makes `H_pen+λD ⪰ 0` on
    ///     the boundary — no reflection heuristic, no negative-curvature special case
    ///     other than the rigorous hard case);
    ///   * the damping `λ` is determined by the trust radius, not by nullity /
    ///     condition / stall gates — those gates disappear;
    ///   * it self-vanishes: at the KKT fixed point `rhs → 0 ⇒ δ → 0`, and once the
    ///     iterate is in a region where `H_pen ≻ 0` the Newton step goes interior so
    ///     `λ = 0` and convergence is quadratic — the converged β, the KKT
    ///     certificate, and the REML/LAML the residual feeds are byte-identical to an
    ///     undamped exact-Newton solve;
    ///   * it is affine covariant in the `D` metric, so blocks at wildly different
    ///     curvature scales (the penalty `λ ~ e²⁴` modes vs the `XᵀWX` likelihood
    ///     modes at an oversmoothed seed) are damped uniformly by `1/(γ_k+λ)` — the
    ///     scale-invariance the per-block radii and the multiplicative-Marquardt form
    ///     were each hand-built to approximate.
    ///
    /// The genuine penalty null space (`|γ_k| ≤ null_cutoff`) is still projected out
    /// (the gam#553 Moore–Penrose range restriction): an unidentified gauge direction
    /// carries no finite Newton step and is left unchanged, its stationarity-residual
    /// component reported via [`JointSpectralNewtonStep::null_rhs_inf`].
    pub(super) struct WhitenedHessianSpectrum {
        /// Generalized eigenvalues `γ_k` of `(H_pen, D)` = eigenvalues of the
        /// whitened matrix `A = D^{-1/2} H_pen D^{-1/2}`.
        gamma: Array1<f64>,
        /// Whitened eigenvectors `v_k` (columns) of `A`.
        evecs: Array2<f64>,
        /// rhs in the whitened eigenbasis: `c_k = v_kᵀ D^{-1/2} rhs`.
        c: Array1<f64>,
        /// `D^{-1/2}` diagonal, mapping a whitened step `η` back to `δ = D^{-1/2} η`.
        d_inv_sqrt: Array1<f64>,
        /// `max_k |γ_k|` (the curvature scale; `D`-whitened).
        lambda_max_abs: f64,
        /// Curvature magnitude at/below which a direction is treated as genuinely
        /// unidentified (penalty null space) and dropped from the step.
        null_cutoff: f64,
    }

    impl WhitenedHessianSpectrum {
        /// Eigendecompose the `D`-whitened penalized Hessian once. `metric_diag`
        /// supplies the positive trust-region metric `D` (each entry is passed
        /// through [`positive_joint_diagonal_entry`] so a non-positive curvature
        /// estimate becomes a safe positive scale). `rank_tol` is the relative
        /// near-singularity cutoff; the genuine numerical-rank floor is derived from
        /// the whitened spectrum exactly as the legacy spectral solve did.
        ///
        /// The null cutoff actually applied is the SMALLER of `rank_tol · λ_max`
        /// and `λ_max · √p · ε`.
        ///
        /// # Errors
        /// Returns an error when `h_pen` is not square, when `rhs` or
        /// `metric_diag` do not match its dimension, or when the symmetric
        /// eigendecomposition fails.
        pub(super) fn decompose(
            h_pen: &Array2<f64>,
            rhs: &Array1<f64>,
            metric_diag: &Array1<f64>,
            rank_tol: f64,
        ) -> Result<Self, String> {
            let p = h_pen.nrows();
            if h_pen.ncols() != p || rhs.len() != p || metric_diag.len() != p {
                return Err(format!(
                    "whitened trust-region decomposition dimension mismatch: H={}x{}, rhs={}, metric={}",
                    h_pen.nrows(),
                    h_pen.ncols(),
                    rhs.len(),
                    metric_diag.len()
                ));
            }
            let d_inv_sqrt = Array1::from_iter(
                metric_diag
                    .iter()
                    .map(|w| 1.0 / positive_joint_diagonal_entry(*w).sqrt()),
            );
            // A = D^{-1/2} H D^{-1/2}; symmetric since H is symmetric and D diagonal.
            let mut a = Array2::<f64>::zeros((p, p));
            for i in 0..p {
                for j in 0..p {
                    a[[i, j]] = h_pen[[i, j]] * d_inv_sqrt[i] * d_inv_sqrt[j];
                }
            }
            symmetrize_dense_in_place(&mut a);
            let (gamma, evecs) = FaerEigh::eigh(&a, Side::Lower)
                .map_err(|e| format!("whitened trust-region eigendecomposition failed: {e}"))?;
            // c = Vᵀ (D^{-1/2} rhs).
            let whitened_rhs = &d_inv_sqrt * rhs;
            let c = evecs.t().dot(&whitened_rhs);
            let lambda_max_abs = gamma.iter().map(|v| v.abs()).fold(0.0_f64, f64::max);
            let numerical_floor = lambda_max_abs * (p as f64).sqrt() * f64::EPSILON;
            let cutoff = rank_tol * lambda_max_abs;
            let null_cutoff = cutoff.min(numerical_floor);
            Ok(Self {
                gamma,
                evecs,
                c,
                d_inv_sqrt,
                lambda_max_abs,
                null_cutoff,
            })
        }

        /// Whether eigen-direction `k` is treated as unidentified (null space):
        /// `|γ_k| ≤ null_cutoff`.
        fn is_null_mode(&self, k: usize) -> bool {
            self.gamma[k].abs() <= self.null_cutoff
        }

        /// `‖η(λ)‖²_2 = Σ_{identified k} c_k² / (γ_k + λ)²` — the squared `D`-metric
        /// norm of the trial step as a function of the Levenberg shift `λ`. Only
        /// identified (above-`null_cutoff`) modes participate; the null space carries
        /// no step. Returns `f64::INFINITY` when `λ` sits on a pole (`γ_k + λ`
        /// numerically zero for an identified mode).
        fn step_norm_sq(&self, lambda: f64) -> f64 {
            let mut acc = 0.0;
            for k in 0..self.gamma.len() {
                if self.is_null_mode(k) {
                    continue;
                }
                let denom = self.gamma[k] + lambda;
                if denom.abs() <= f64::MIN_POSITIVE {
                    return f64::INFINITY;
                }
                let t = self.c[k] / denom;
                acc += t * t;
            }
            acc
        }

        /// Assemble the whitened step `η(λ) = Σ c_k/(γ_k+λ) v_k` over identified
        /// modes and map it back to `δ = D^{-1/2} η`, together with the spectrum
        /// diagnostics (`range_rhs_inf`, `null_rhs_inf`, `nullity`,
        /// `lambda_min_positive`, `reflected_negative_modes`,
        /// `most_negative_eigenvalue`) consistent with the legacy spectral step.
        ///
        /// A mode whose shifted curvature `γ_k + λ` is numerically zero
        /// contributes nothing to `η` (it is still counted in the diagnostics).
        /// Negative modes are counted in `reflected_negative_modes` even though
        /// this path shifts rather than reflects them. `extra_min_mode`, when
        /// given as `(k_min, τ)`, adds `τ·v_{k_min}` to `η` (the hard-case
        /// completion to the trust boundary).
        fn assemble(
            &self,
            lambda: f64,
            extra_min_mode: Option<(usize, f64)>,
        ) -> JointSpectralNewtonStep {
            let p = self.gamma.len();
            let mut eta = Array1::<f64>::zeros(p);
            let mut range_rhs_inf = 0.0_f64;
            let mut null_rhs_inf = 0.0_f64;
            let mut lambda_min_positive = f64::INFINITY;
            let mut nullity = 0usize;
            let mut reflected_negative_modes = 0usize;
            let mut most_negative = 0.0_f64;
            for k in 0..p {
                let g = self.gamma[k];
                if self.is_null_mode(k) {
                    nullity += 1;
                    null_rhs_inf = null_rhs_inf.max(self.c[k].abs());
                    continue;
                }
                range_rhs_inf = range_rhs_inf.max(self.c[k].abs());
                if g < 0.0 {
                    reflected_negative_modes += 1;
                    most_negative = most_negative.min(g);
                } else {
                    lambda_min_positive = lambda_min_positive.min(g);
                }
                let denom = g + lambda;
                if denom.abs() > f64::MIN_POSITIVE {
                    let coeff = self.c[k] / denom;
                    for i in 0..p {
                        eta[i] += coeff * self.evecs[[i, k]];
                    }
                }
            }
            // Hard case: add τ·v_min along a minimal-curvature eigenvector to reach
            // the trust boundary when rhs has no component there.
            if let Some((k_min, tau)) = extra_min_mode {
                for i in 0..p {
                    eta[i] += tau * self.evecs[[i, k_min]];
                }
            }
            // δ = D^{-1/2} η.
            let delta = &self.d_inv_sqrt * &eta;
            JointSpectralNewtonStep {
                delta,
                range_rhs_inf,
                null_rhs_inf,
                lambda_max_abs: self.lambda_max_abs,
                lambda_min_positive,
                nullity,
                rank_tol: KKT_REFUSAL_RANK_TOL,
                reflected_negative_modes,
                most_negative_eigenvalue: most_negative,
            }
        }

        /// Exact solution of the trust-region subproblem inside the `D`-metric ball
        /// of radius `trust_radius`. When `trust_radius` is non-finite or `≤ 0` the
        /// unconstrained (Moore–Penrose, range-restricted) Newton step is returned —
        /// i.e. the caller opted out of the trust region. (On an indefinite model
        /// that opt-out yields the reflected modified-Newton step instead, since
        /// the unconstrained problem has no minimizer.)
        pub(super) fn trust_region_step(&self, trust_radius: f64) -> JointSpectralNewtonStep {
            // Smallest identified curvature (signed). Empty identified set ⇒ pure
            // null space ⇒ zero step.
            let mut gamma_min_id = f64::INFINITY;
            let mut any_identified = false;
            for k in 0..self.gamma.len() {
                if self.is_null_mode(k) {
                    continue;
                }
                any_identified = true;
                gamma_min_id = gamma_min_id.min(self.gamma[k]);
            }
            if !any_identified {
                return self.assemble(0.0, None);
            }
            let unconstrained_radius = !(trust_radius.is_finite() && trust_radius > 0.0);
            // Interior Newton step is admissible only when the model is convex on the
            // identified range (γ_min > 0); then λ = 0 gives the exact Newton step.
            if gamma_min_id > 0.0 {
                let newton_norm = self.step_norm_sq(0.0).sqrt();
                if unconstrained_radius || newton_norm <= trust_radius {
                    return self.assemble(0.0, None);
                }
            } else if unconstrained_radius {
                // No trust region but an indefinite/semidefinite model: the
                // unconstrained problem is unbounded below. Fall back to the
                // reflected modified-Newton step (|γ| curvature) so the caller still
                // receives a finite descent direction; the downstream accept/reject
                // validates it. This path is only hit when a caller explicitly
                // disables the trust region on an indefinite model.
                return self.assemble_reflected();
            }
            // Boundary solution: find λ ≥ λ_lo with ‖η(λ)‖ = trust_radius.
            let lambda_lo = (-gamma_min_id).max(0.0);
            // Hard case detection: is rhs orthogonal to the minimal-curvature
            // eigenspace? If so ‖η(λ_lo)‖ is finite and may be below the radius.
            let min_mode_tol = self.null_cutoff.max(self.lambda_max_abs * 1e-12);
            let mut hard_case_component_sq = 0.0;
            let mut k_min_witness = None;
            for k in 0..self.gamma.len() {
                if self.is_null_mode(k) {
                    continue;
                }
                if (self.gamma[k] - gamma_min_id).abs() <= min_mode_tol {
                    hard_case_component_sq += self.c[k] * self.c[k];
                    k_min_witness = Some(k);
                }
            }
            // Evaluate the norm just above the pole. With a real rhs component at the
            // minimal mode the norm diverges at λ_lo, so the secular root is interior
            // to (λ_lo, ∞) and a small relative offset brackets it. With no such
            // component (hard case) the norm at λ_lo is finite.
            let lambda_lo_eval = lambda_lo + self.lambda_max_abs.max(1.0) * 1e-12;
            if hard_case_component_sq <= (self.lambda_max_abs.max(1.0) * 1e-12).powi(2) {
                let norm_at_lo = self.step_norm_sq(lambda_lo_eval).sqrt();
                if norm_at_lo < trust_radius {
                    // Hard case: λ = λ_lo, then add τ·v_min to reach the boundary.
                    if let Some(k_min) = k_min_witness {
                        let deficit =
                            (trust_radius * trust_radius - norm_at_lo * norm_at_lo).max(0.0);
                        let tau = deficit.sqrt();
                        return self.assemble(lambda_lo, Some((k_min, tau)));
                    }
                    return self.assemble(lambda_lo, None);
                }
            }
            // Safeguarded Newton on φ(λ) = 1/‖η(λ)‖ − 1/r (well-behaved, ~linear),
            // bracketed in [lo, hi]. φ is increasing in λ (‖η‖ decreasing), φ(lo)<0,
            // and we grow hi until φ(hi)>0.
            let target = trust_radius;
            let mut lo = lambda_lo_eval;
            let mut hi = lambda_lo_eval.max(self.lambda_max_abs).max(1.0);
            let mut grow_guard = 0;
            while self.step_norm_sq(hi).sqrt() > target && grow_guard < 200 {
                hi *= 2.0;
                grow_guard += 1;
            }
            let mut lambda = 0.5 * (lo + hi);
            for _ in 0..100 {
                let q = self.step_norm_sq(lambda);
                let norm = q.sqrt();
                if !norm.is_finite() {
                    // On (or numerically at) a pole: move the lower bracket up.
                    lo = lambda;
                    lambda = 0.5 * (lo + hi);
                    continue;
                }
                // Maintain the bracket on φ(λ) = 1/norm − 1/target.
                if norm > target {
                    lo = lambda;
                } else {
                    hi = lambda;
                }
                let phi = 1.0 / norm - 1.0 / target;
                if phi.abs() <= 1e-12 / target {
                    break;
                }
                // q'(λ) = -2 Σ c_k²/(γ_k+λ)³  ⇒  d/dλ (1/norm) = -½ q^{-3/2} q'.
                let mut q_prime = 0.0;
                for k in 0..self.gamma.len() {
                    if self.is_null_mode(k) {
                        continue;
                    }
                    let denom = self.gamma[k] + lambda;
                    if denom.abs() <= f64::MIN_POSITIVE {
                        continue;
                    }
                    q_prime += -2.0 * self.c[k] * self.c[k] / (denom * denom * denom);
                }
                let phi_prime = -0.5 * q.powf(-1.5) * q_prime;
                let next = if phi_prime.abs() > f64::MIN_POSITIVE {
                    lambda - phi / phi_prime
                } else {
                    0.5 * (lo + hi)
                };
                // Safeguard into the bracket.
                lambda = if next.is_finite() && next > lo && next < hi {
                    next
                } else {
                    0.5 * (lo + hi)
                };
                if (hi - lo) <= 1e-14 * (1.0 + hi.abs()) {
                    break;
                }
            }
            self.assemble(lambda, None)
        }

        /// Reflected modified-Newton step (`|γ_k|` curvature, no trust region). Only
        /// used when a caller disables the trust region on an indefinite model — the
        /// trust-region path proper never reflects.
        fn assemble_reflected(&self) -> JointSpectralNewtonStep {
            let p = self.gamma.len();
            let mut eta = Array1::<f64>::zeros(p);
            let mut range_rhs_inf = 0.0_f64;
            let mut null_rhs_inf = 0.0_f64;
            let mut lambda_min_positive = f64::INFINITY;
            let mut nullity = 0usize;
            let mut reflected_negative_modes = 0usize;
            let mut most_negative = 0.0_f64;
            for k in 0..p {
                let g = self.gamma[k];
                if self.is_null_mode(k) {
                    nullity += 1;
                    null_rhs_inf = null_rhs_inf.max(self.c[k].abs());
                    continue;
                }
                range_rhs_inf = range_rhs_inf.max(self.c[k].abs());
                let curvature = if g < 0.0 {
                    reflected_negative_modes += 1;
                    most_negative = most_negative.min(g);
                    g.abs()
                } else {
                    lambda_min_positive = lambda_min_positive.min(g);
                    g
                };
                let coeff = self.c[k] / curvature;
                for i in 0..p {
                    eta[i] += coeff * self.evecs[[i, k]];
                }
            }
            // δ = D^{-1/2} η.
            let delta = &self.d_inv_sqrt * &eta;
            JointSpectralNewtonStep {
                delta,
                range_rhs_inf,
                null_rhs_inf,
                lambda_max_abs: self.lambda_max_abs,
                lambda_min_positive,
                nullity,
                rank_tol: KKT_REFUSAL_RANK_TOL,
                reflected_negative_modes,
                most_negative_eigenvalue: most_negative,
            }
        }
    }
}
#[cfg(test)]
mod trust_region_subproblem_tests {
use super::whitened_spectrum::WhitenedHessianSpectrum;
use super::*;
use ndarray::array;
fn metric_norm(delta: &Array1<f64>, d: &Array1<f64>) -> f64 {
delta
.iter()
.zip(d.iter())
.map(|(x, w)| x * x * positive_joint_diagonal_entry(*w))
.sum::<f64>()
.sqrt()
}
/// Interior case: a positive-definite model with a generous trust radius
/// must return the exact (full) Newton step `H⁻¹ rhs`, i.e. λ = 0.
#[test]
fn interior_returns_exact_newton_step() {
let h = array![[3.0, 1.0], [1.0, 2.0]];
let rhs = array![1.0, -2.0];
let d = array![1.0, 1.0];
let spec = WhitenedHessianSpectrum::decompose(&h, &rhs, &d, KKT_REFUSAL_RANK_TOL).unwrap();
let step = spec.trust_region_step(1e6);
// Exact Newton: H δ = rhs.
let resid = h.dot(&step.delta) - &rhs;
assert!(
resid.iter().all(|v| v.abs() < 1e-10),
"interior step must solve H δ = rhs exactly, residual {resid:?}"
);
}
/// Boundary case: a tight radius forces `‖δ‖_D = r` and the KKT condition
/// `(H + λD) δ = rhs` with `λ > 0`.
#[test]
fn boundary_satisfies_more_sorensen_kkt() {
let h = array![[3.0, 1.0], [1.0, 2.0]];
let rhs = array![1.0, -2.0];
let d = array![1.0, 1.0];
let spec = WhitenedHessianSpectrum::decompose(&h, &rhs, &d, KKT_REFUSAL_RANK_TOL).unwrap();
let r = 0.3;
let step = spec.trust_region_step(r);
let norm = metric_norm(&step.delta, &d);
assert!(
(norm - r).abs() < 1e-8,
"boundary step must lie on the trust boundary: ‖δ‖_D={norm} vs r={r}"
);
// Recover λ from one coordinate of (H+λD)δ = rhs and check the whole
// system is satisfied at that λ.
let hd = h.dot(&step.delta);
// Solve λ minimizing ‖(H+λD)δ − rhs‖ in least squares over the single
// scalar λ: λ* = (Dδ)·(rhs − Hδ) / (Dδ)·(Dδ).
let dd = &d * &step.delta;
let lam = dd.dot(&(&rhs - &hd)) / dd.dot(&dd);
assert!(lam > 0.0, "boundary multiplier must be positive, got {lam}");
let resid = &hd + &(lam * &dd) - &rhs;
assert!(
resid.iter().all(|v| v.abs() < 1e-7),
"(H+λD)δ = rhs must hold at the recovered λ={lam}, residual {resid:?}"
);
}
/// Indefinite model: the exact subproblem still returns a finite boundary
/// step that is a descent direction (rhsᵀδ > 0) and lies on the boundary.
#[test]
fn indefinite_model_returns_descent_step_on_boundary() {
// Eigenvalues +4 and -1: genuinely indefinite.
let h = array![[1.5, 2.5], [2.5, 1.5]];
let rhs = array![1.0, 0.4];
let d = array![1.0, 1.0];
let spec = WhitenedHessianSpectrum::decompose(&h, &rhs, &d, KKT_REFUSAL_RANK_TOL).unwrap();
let r = 0.7;
let step = spec.trust_region_step(r);
assert!(step.reflected_negative_modes >= 1 || step.most_negative_eigenvalue < 0.0);
let norm = metric_norm(&step.delta, &d);
assert!(
(norm - r).abs() < 1e-7,
"indefinite boundary step ‖δ‖_D={norm} vs r={r}"
);
assert!(
rhs.dot(&step.delta) > 0.0,
"step must be a descent direction for −rhsᵀδ + ½δᵀHδ (rhsᵀδ>0)"
);
// (H+λD) must be PSD at the chosen λ (most negative eigenvalue ≥ -λ).
let dd = &d * &step.delta;
let lam = dd.dot(&(&rhs - &h.dot(&step.delta))) / dd.dot(&dd);
assert!(lam >= 1.0 - 1e-6, "λ must dominate -γ_min=1, got {lam}");
}
/// Self-vanishing: as rhs → 0 the step → 0 regardless of the radius, so the
/// converged β and the KKT fixed point are unchanged by the globalization.
#[test]
fn step_vanishes_as_rhs_vanishes() {
let h = array![[3.0, 1.0], [1.0, 2.0]];
let rhs = array![1e-13, -2e-13];
let d = array![1.0, 1.0];
let spec = WhitenedHessianSpectrum::decompose(&h, &rhs, &d, KKT_REFUSAL_RANK_TOL).unwrap();
let step = spec.trust_region_step(0.5);
assert!(
step.delta.iter().all(|v| v.abs() < 1e-11),
"near-zero rhs must give near-zero step, got {:?}",
step.delta
);
}
/// Null space: a genuinely zero-curvature direction is dropped from the step
/// (Moore–Penrose range restriction) and reported via `null_rhs_inf`.
#[test]
fn null_direction_is_dropped_and_reported() {
// Second coordinate has zero curvature; rhs has mass there.
let h = array![[2.0, 0.0], [0.0, 0.0]];
let rhs = array![1.0, 0.5];
let d = array![1.0, 1.0];
let spec = WhitenedHessianSpectrum::decompose(&h, &rhs, &d, KKT_REFUSAL_RANK_TOL).unwrap();
let step = spec.trust_region_step(1e6);
assert_eq!(step.nullity, 1, "one null direction expected");
assert!(
step.null_rhs_inf >= 0.5 - 1e-9,
"null-space rhs component must be reported, got {}",
step.null_rhs_inf
);
// The identified direction takes its exact Newton component (1/2).
assert!((step.delta[0] - 0.5).abs() < 1e-10);
assert!(step.delta[1].abs() < 1e-10, "null coordinate left at 0");
}
/// Non-identity metric: the trust-region boundary is measured in the `D` norm,
/// so the returned step must have `D`-norm equal to the radius even when the
/// metric weights the coordinates very differently.
#[test]
fn respects_non_identity_metric() {
    let metric = array![1.0, 16.0];
    let hessian = array![[2.0, 0.0], [0.0, 8.0]];
    let gradient = array![1.0, 1.0];
    let radius = 0.2;
    let spectrum =
        WhitenedHessianSpectrum::decompose(&hessian, &gradient, &metric, KKT_REFUSAL_RANK_TOL)
            .unwrap();
    let step = spectrum.trust_region_step(radius);
    let norm = metric_norm(&step.delta, &metric);
    let r = radius;
    let boundary_gap = (norm - r).abs();
    assert!(
        boundary_gap < 1e-8,
        "step must lie on the D-metric boundary, ‖δ‖_D={norm} vs r={r}"
    );
}
/// A smaller radius must re-solve the subproblem, which bends the direction
/// toward the gradient, instead of merely rescaling one fixed direction (the
/// weakness of the dogleg/truncation approach). Quartering the radius therefore
/// has to change the unit direction, not only the length.
#[test]
fn radius_shrink_bends_direction_not_just_scale() {
    let metric = array![1.0, 1.0];
    let gradient = array![1.0, 1.0];
    // Strongly anisotropic curvature: one steep mode, one shallow mode.
    let hessian = array![[50.0, 0.0], [0.0, 0.5]];
    let spectrum =
        WhitenedHessianSpectrum::decompose(&hessian, &gradient, &metric, KKT_REFUSAL_RANK_TOL)
            .unwrap();
    let wide_step = spectrum.trust_region_step(1.0).delta;
    let narrow_step = spectrum.trust_region_step(0.25).delta;
    // Normalize both steps in the metric norm and compare directions: pure
    // truncation would give cosine 1, the exact subproblem rotates the step.
    let wide_unit = &wide_step / metric_norm(&wide_step, &metric);
    let narrow_unit = &narrow_step / metric_norm(&narrow_step, &metric);
    let cos = wide_unit.dot(&narrow_unit);
    assert!(
        cos < 0.9999,
        "exact TR step must bend the direction under radius shrink (cos={cos})"
    );
}
}
/// PSD part of a symmetric matrix: eigendecompose and clamp negative
/// eigenvalues to zero. Used by the step consumers that REQUIRE a convex
/// model (the constrained active-set QP and the SPD-PCG matvec) when folding
/// the exact divided-difference Jeffreys curvature `H_Φ`, which is indefinite
/// exactly where `Φ` is (gam#979). On a PSD input this is the identity (up to
/// eigendecomposition round-off). Falls back to the zero matrix if the
/// eigendecomposition fails — the safe unaugmented step, never a wrong one.
///
/// The input is symmetrized (on a copy) before the eigensolve; when every
/// eigenvalue is already `>= 0` that symmetrized copy is returned unchanged.
fn symmetric_psd_projection(matrix: &Array2<f64>) -> Array2<f64> {
    let p = matrix.nrows();
    let mut sym = matrix.clone();
    symmetrize_dense_in_place(&mut sym);
    let Ok((evals, evecs)) = FaerEigh::eigh(&sym, Side::Lower) else {
        return Array2::zeros((p, p));
    };
    // Fast path: nothing to clamp, hand back the symmetrized input as-is.
    if evals.iter().all(|lam| *lam >= 0.0) {
        return sym;
    }
    // Rebuild V · diag(max(λ, 0)) · Vᵀ: scale each eigenvector column by its
    // clamped eigenvalue, then multiply by Vᵀ.
    let clamped = Array1::from_iter(evals.iter().map(|lam| lam.max(0.0)));
    let scaled = &evecs * &clamped.view().insert_axis(ndarray::Axis(0));
    scaled.dot(&evecs.t())
}
/// Numerical nullity of a symmetric penalized Hessian at the shared
/// `KKT_REFUSAL_RANK_TOL` relative cutoff (the same threshold the spectral
/// range solve and the REML penalty-rank machinery use). Returns `None` only
/// when the eigendecomposition fails or the matrix is the zero matrix (no
/// finite curvature scale to normalize against); callers treat a `None` as
/// "could not certify full rank" and fall back to the conservative (damped)
/// path.
///
/// This exists so the CONSTRAINED active-set QP branch can decide whether the
/// joint design is genuinely rank-deficient (`nullity > 0` ⇒ an unidentified
/// gauge direction that needs the self-vanishing Levenberg floor to make the
/// QP minimizer unique) or fully identified (`nullity == 0` ⇒ the exact,
/// undamped Newton/KKT step is well-posed and converges quadratically). The
/// spectral-range branch already gets this for free via
/// `JointSpectralNewtonStep::nullity`; the constrained branch never runs the
/// eigensolve otherwise, so it computes it here on the already-penalized `lhs`.
///
/// An empty (`0 × 0`) or non-square `lhs` short-circuits to `Some(0)` without
/// running the eigensolve.
fn symmetric_penalized_hessian_nullity(lhs: &Array2<f64>) -> Option<usize> {
    let p = lhs.nrows();
    if p == 0 || lhs.ncols() != p {
        return Some(0);
    }
    let (evals, _) = FaerEigh::eigh(lhs, Side::Lower).ok()?;
    // Spectral scale used to turn the relative tolerance into an absolute one.
    let max_abs = evals.iter().map(|x: &f64| x.abs()).fold(0.0_f64, f64::max);
    if !(max_abs.is_finite() && max_abs > 0.0) {
        return None;
    }
    let cutoff = KKT_REFUSAL_RANK_TOL * max_abs;
    Some(evals.iter().filter(|x| x.abs() < cutoff).count())
}
/// Assemble the structured [`KktRefusalReport`] for a refused convergence
/// certificate.
///
/// The report gathers, per block, the inf-norms of β, of the joint gradient
/// slice, of the penalty gradient `S_λ β (+ ridge·β)` and of the projected
/// stationarity residual; the eigen-spectrum of the penalized joint Hessian
/// (when a Hessian source is supplied and can be materialized); the
/// null-direction most aligned with the residual; and a coarse diagnosis.
/// Quantities that cannot be computed are reported as `NaN` / empty.
#[allow(clippy::too_many_arguments)]
fn compute_kkt_refusal_report(
    cycle: usize,
    states: &[ParameterBlockState],
    specs: &[ParameterBlockSpec],
    s_lambdas: &[Array2<f64>],
    ranges: &[(usize, usize)],
    cached_joint_gradient: Option<&Array1<f64>>,
    cached_active_sets: &[Option<Vec<usize>>],
    block_constraints: &[Option<LinearInequalityConstraints>],
    joint_hessian_source: Option<&JointHessianSource>,
    total_p: usize,
    ridge: f64,
    ridge_policy: RidgePolicy,
    accepted_step_inf: f64,
    proposal_step_inf: f64,
    trust_radius: f64,
    residual_tol: f64,
    obj_tol: f64,
    step_tol: f64,
    objective_change: f64,
    projected_residual_inf: f64,
    math: Option<&JointNewtonMathDiagnostic>,
) -> KktRefusalReport {
    /// Largest absolute entry of `values`; `0.0` for an empty sequence.
    fn abs_max<'a>(values: impl IntoIterator<Item = &'a f64>) -> f64 {
        values
            .into_iter()
            .fold(0.0_f64, |running, value| running.max(value.abs()))
    }
    /// Position of the largest finite entry, or `None` when no entry is finite.
    fn argmax_finite(values: &[f64]) -> Option<usize> {
        values
            .iter()
            .enumerate()
            .filter(|(_, v)| v.is_finite())
            .max_by(|a, b| a.1.partial_cmp(b.1).unwrap_or(std::cmp::Ordering::Equal))
            .map(|(i, _)| i)
    }

    let block_names: Vec<String> = specs.iter().map(|spec| spec.name.clone()).collect();
    let block_widths: Vec<usize> = states.iter().map(|state| state.beta.len()).collect();
    let block_beta_inf: Vec<f64> = states
        .iter()
        .map(|state| abs_max(state.beta.iter()))
        .collect();

    // Per-block gradient inf-norm, walking the joint gradient block by block.
    let block_grad_inf: Vec<f64> = if let Some(joint_grad) = cached_joint_gradient {
        let mut offset = 0usize;
        let mut norms = Vec::with_capacity(states.len());
        for state in states {
            let width = state.beta.len();
            let stop = (offset + width).min(joint_grad.len());
            // A width-0 block (e.g. a constant-scale `noise_formula="1"`
            // log_sigma channel collapsed to zero free coefficients, gam#553)
            // has no gradient and a zero residual, so it reports 0.0. The NaN
            // sentinel is kept for a genuine layout mismatch: a positive-width
            // block whose coordinates start past the end of the joint gradient.
            let norm = if width == 0 {
                0.0
            } else if offset < stop {
                abs_max(joint_grad.slice(ndarray::s![offset..stop]).iter())
            } else {
                f64::NAN
            };
            norms.push(norm);
            offset += width;
        }
        norms
    } else {
        vec![f64::NAN; states.len()]
    };

    // Per-block inf-norm of the penalty gradient S_λ β, plus ridge·β when the
    // ridge is part of the quadratic penalty.
    let ridge_in_penalty = ridge_policy.include_quadratic_penalty && ridge > 0.0;
    let block_penalty_grad_inf: Vec<f64> = (0..ranges.len())
        .map(|b| {
            let mut penalty_block = s_lambdas[b].dot(&states[b].beta);
            if ridge_in_penalty {
                penalty_block += &states[b].beta.mapv(|v| ridge * v);
            }
            abs_max(penalty_block.iter())
        })
        .collect();

    // Projected stationarity residual; unavailable without a cached gradient
    // or when the projection itself errors.
    let residual_vec_opt = match cached_joint_gradient {
        Some(joint_grad) => exact_newton_joint_projected_stationarity_vector_from_gradient(
            joint_grad,
            states,
            specs,
            s_lambdas,
            ridge,
            ridge_policy,
            block_constraints,
            Some(cached_active_sets),
        )
        .ok(),
        None => None,
    };

    let block_residual_inf: Vec<f64> = if let Some(residual) = residual_vec_opt.as_ref() {
        ranges
            .iter()
            .map(|(start, end)| {
                // A zero-width block (start == end) owns no residual. An empty
                // fold would yield a spurious 0.0, so it is marked NaN and the
                // finite-only selection below skips it as a carrying block.
                if start >= end {
                    f64::NAN
                } else {
                    abs_max(residual.slice(ndarray::s![*start..*end]).iter())
                }
            })
            .collect()
    } else {
        vec![f64::NAN; states.len()]
    };
    let block_carrying_residual = argmax_finite(&block_residual_inf);

    // Spectrum diagnostics; these defaults survive when the Hessian cannot be
    // materialized or eigendecomposed.
    let mut hpen_eigenvalues_sorted_desc: Vec<f64> = Vec::new();
    let mut hpen_min_abs_eigenvalue = f64::NAN;
    let mut hpen_max_abs_eigenvalue = f64::NAN;
    let mut hpen_condition_number = f64::NAN;
    let mut hpen_nullity_at_rank_tol = 0usize;
    let mut hpen_null_gradient_inf = f64::NAN;
    let mut hpen_null_vector_block_inf = Vec::new();
    let mut hpen_null_vector_carrying_block = None;

    let materialized = if total_p > 0 {
        joint_hessian_source.and_then(|source| {
            materialize_joint_hessian_source(source, total_p, "KKT refusal diagnostic spectrum")
                .ok()
        })
    } else {
        None
    };
    if let Some(mut h_joint) = materialized {
        let model_diagonal_ridge = if ridge_in_penalty { ridge } else { 0.0 };
        add_joint_penalty_to_matrix(&mut h_joint, ranges, s_lambdas, model_diagonal_ridge, None);
        symmetrize_dense_in_place(&mut h_joint);
        if let Ok((evals, evecs)) = FaerEigh::eigh(&h_joint, Side::Lower) {
            let mut sorted: Vec<f64> = evals.iter().copied().collect();
            sorted.sort_by(|a, b| b.partial_cmp(a).unwrap_or(std::cmp::Ordering::Equal));
            let max_abs = abs_max(&sorted);
            let min_abs = sorted
                .iter()
                .fold(f64::INFINITY, |running, value| running.min(value.abs()));
            let cutoff = KKT_REFUSAL_RANK_TOL * max_abs;

            hpen_nullity_at_rank_tol = sorted.iter().filter(|x| x.abs() < cutoff).count();
            hpen_max_abs_eigenvalue = max_abs;
            hpen_min_abs_eigenvalue = if min_abs.is_finite() {
                min_abs
            } else {
                f64::NAN
            };
            hpen_condition_number = if min_abs > 0.0 && min_abs.is_finite() {
                max_abs / min_abs
            } else {
                f64::INFINITY
            };

            // Among the numerically-null eigenvectors, keep the one with the
            // largest |uᵀ residual| and record its per-block inf-norms.
            if let Some(residual) = residual_vec_opt.as_ref()
                && residual.len() == total_p
                && hpen_nullity_at_rank_tol > 0
            {
                let mut best_component = 0.0_f64;
                let mut best_block_inf = vec![0.0_f64; ranges.len()];
                for (k, eigenvalue) in evals.iter().enumerate() {
                    if eigenvalue.abs() >= cutoff {
                        continue;
                    }
                    let component = evecs.column(k).dot(residual).abs();
                    if component > best_component {
                        best_component = component;
                        best_block_inf = ranges
                            .iter()
                            .map(|(start, end)| {
                                abs_max(evecs.slice(ndarray::s![*start..*end, k]).iter())
                            })
                            .collect();
                    }
                }
                hpen_null_gradient_inf = best_component;
                hpen_null_vector_block_inf = best_block_inf;
                hpen_null_vector_carrying_block = argmax_finite(&hpen_null_vector_block_inf);
            }
            hpen_eigenvalues_sorted_desc = sorted;
        }
    }

    let active_set_rows_total: usize = cached_active_sets
        .iter()
        .flatten()
        .map(|rows| rows.len())
        .sum();
    let any_block_has_constraints = block_constraints.iter().any(|c| c.is_some());
    let any_cached_active_set = cached_active_sets.iter().any(|rows| rows.is_some());

    let diagnosis = if hpen_nullity_at_rank_tol > 0 {
        KktRefusalDiagnosis::RankDeficientHPen
    } else if any_block_has_constraints
        && any_cached_active_set
        && projected_residual_inf > residual_tol
    {
        // H_pen has no numerical null space, constraints exist, an active set
        // is cached, and the projected residual still exceeds tolerance: the
        // projection captured part of the multiplier but not all of it, i.e.
        // the active set is missing a row.
        KktRefusalDiagnosis::ActiveSetIncomplete
    } else {
        KktRefusalDiagnosis::PhantomMultiplierWithWellConditionedH
    };

    let linearized_rel = math.map_or(f64::NAN, |m| m.linearized_rel());
    let scalar_model_relerr = math.map_or(f64::NAN, |m| m.scalar_model_relative_error());

    KktRefusalReport {
        block_names,
        block_widths,
        block_beta_inf,
        block_grad_inf,
        block_penalty_grad_inf,
        block_residual_inf,
        block_carrying_residual,
        hpen_eigenvalues_sorted_desc,
        hpen_min_abs_eigenvalue,
        hpen_max_abs_eigenvalue,
        hpen_condition_number,
        hpen_nullity_at_rank_tol,
        hpen_rank_tol: KKT_REFUSAL_RANK_TOL,
        hpen_null_gradient_inf,
        hpen_null_vector_block_inf,
        hpen_null_vector_carrying_block,
        active_set_rows_total,
        accepted_step_inf,
        proposal_step_inf,
        trust_radius,
        cycle,
        residual_tol,
        obj_tol,
        step_tol,
        linearized_rel,
        scalar_model_relerr,
        objective_change,
        projected_residual_inf,
        diagnosis,
    }
}
impl KktRefusalReport {
    /// One-line description of the block carrying the largest finite residual,
    /// or a placeholder when no block has a finite residual. Missing per-block
    /// entries render as `?` / `NaN` / width `0`.
    fn carrying_block_label(&self) -> String {
        let Some(idx) = self.block_carrying_residual else {
            return "<no block carries finite residual>".to_string();
        };
        let entry = |values: &[f64]| values.get(idx).copied().unwrap_or(f64::NAN);
        let name = self.block_names.get(idx).map_or("?", String::as_str);
        let width = self.block_widths.get(idx).copied().unwrap_or(0);
        format!(
            "{} (idx={}, |g|={:.3e}, |Sβ|={:.3e}, |∇L-Sβ|={:.3e}, |β|={:.3e}, width={})",
            name,
            idx,
            entry(&self.block_grad_inf),
            entry(&self.block_penalty_grad_inf),
            entry(&self.block_residual_inf),
            entry(&self.block_beta_inf),
            width,
        )
    }

    /// Largest per-block `|β|∞`, floored at `0.0`.
    fn beta_inf(&self) -> f64 {
        self.block_beta_inf
            .iter()
            .fold(0.0_f64, |running, &value| running.max(value))
    }

    /// Description of the block dominating the selected null eigenvector,
    /// together with that eigenvector's overlap with the projected residual.
    fn null_direction_label(&self) -> String {
        let Some(idx) = self.hpen_null_vector_carrying_block else {
            return format!("none (|uᵀg_proj|={:.3e})", self.hpen_null_gradient_inf);
        };
        let name = self.block_names.get(idx).map_or("?", String::as_str);
        let block_mass = self
            .hpen_null_vector_block_inf
            .get(idx)
            .copied()
            .unwrap_or(f64::NAN);
        format!(
            "{} (idx={}, |u_block|∞={:.3e}, |uᵀg_proj|={:.3e})",
            name, idx, block_mass, self.hpen_null_gradient_inf,
        )
    }

    /// Multi-line structured log for the cert REFUSED site. The per-block
    /// residual, eigen-spectrum and diagnosis breakdown make the failure
    /// actionable, unlike the legacy one-liner that only carried the aggregate
    /// residual and cert math. `four_tol` is printed as the tolerance the
    /// residual was compared against.
    fn format_structured_log(&self, four_tol: f64) -> String {
        let carrying = self.carrying_block_label();
        let null_direction = self.null_direction_label();
        let eigenvalue_count = self.hpen_eigenvalues_sorted_desc.len();
        let beta_inf = self.beta_inf();
        let diagnosis = self.diagnosis.as_str();
        format!(
            "[PIRLS/joint-Newton convergence] cycle {:>3} | cert REFUSED: residual={:.3e} > tol={:.3e} (cert)\n \
             carrying-block: {}\n \
             block_names={:?}, block_widths={:?}, block_grad_inf={:?}, block_penalty_grad_inf={:?}, block_residual_inf={:?}\n \
             H_pen spectrum: λ_max={:.3e}, λ_min={:.3e}, cond={:.3e}, nullity@{:.0e}={} (of {} eigenvalues)\n \
             free-null diagnostic: {}\n \
             cert math: linearized_rel={:.3e}, scalar_relerr={:.3e}, |Δobj|={:.3e} (tol={:.3e}), accepted_step_inf={:.3e} (tol={:.3e}), proposal_step_inf={:.3e}, trust_radius={:.3e}, |β|∞={:.3e}, active_set_rows_total={}\n \
             diagnosis: {}",
            self.cycle,
            self.projected_residual_inf,
            four_tol,
            carrying,
            self.block_names,
            self.block_widths,
            self.block_grad_inf,
            self.block_penalty_grad_inf,
            self.block_residual_inf,
            self.hpen_max_abs_eigenvalue,
            self.hpen_min_abs_eigenvalue,
            self.hpen_condition_number,
            self.hpen_rank_tol,
            self.hpen_nullity_at_rank_tol,
            eigenvalue_count,
            null_direction,
            self.linearized_rel,
            self.scalar_model_relerr,
            self.objective_change,
            self.obj_tol,
            self.accepted_step_inf,
            self.step_tol,
            self.proposal_step_inf,
            self.trust_radius,
            beta_inf,
            self.active_set_rows_total,
            diagnosis,
        )
    }

    /// Single-line rendering of the same data for the error bubbled out of the
    /// inner solver. The tolerance shown is `4 × residual_tol`, and the
    /// diagnosis guidance text is appended at the end.
    fn format_bubbled_error(&self) -> String {
        let carrying = self.carrying_block_label();
        let null_direction = self.null_direction_label();
        let eigenvalue_count = self.hpen_eigenvalues_sorted_desc.len();
        let beta_inf = self.beta_inf();
        let shown_tol = 4.0 * self.residual_tol;
        format!(
            "cycle={} cert REFUSED: residual={:.3e} > tol={:.3e}; \
             carrying-block: {}; block_names={:?}, block_widths={:?}, \
             block_grad_inf={:?}, block_penalty_grad_inf={:?}, block_residual_inf={:?}; \
             H_pen spectrum: λ_max={:.3e}, λ_min={:.3e}, cond={:.3e}, nullity@{:.0e}={}/{}; \
             free-null diagnostic: {}; \
             cert math: linearized_rel={:.3e}, scalar_relerr={:.3e}, |Δobj|={:.3e}, \
             accepted_step_inf={:.3e}, proposal_step_inf={:.3e}, trust_radius={:.3e}, \
             |β|∞={:.3e}, active_set_rows_total={}; diagnosis: {}; {}",
            self.cycle,
            self.projected_residual_inf,
            shown_tol,
            carrying,
            self.block_names,
            self.block_widths,
            self.block_grad_inf,
            self.block_penalty_grad_inf,
            self.block_residual_inf,
            self.hpen_max_abs_eigenvalue,
            self.hpen_min_abs_eigenvalue,
            self.hpen_condition_number,
            self.hpen_rank_tol,
            self.hpen_nullity_at_rank_tol,
            eigenvalue_count,
            null_direction,
            self.linearized_rel,
            self.scalar_model_relerr,
            self.objective_change,
            self.accepted_step_inf,
            self.proposal_step_inf,
            self.trust_radius,
            beta_inf,
            self.active_set_rows_total,
            self.diagnosis.as_str(),
            self.diagnosis.guidance(),
        )
    }
}
/// Fixed relative tolerance returned whenever the adaptive forcing term cannot
/// be formed from the supplied residual norms.
const JOINT_PCG_REL_TOL: f64 = 1e-8;
/// Upper clamp of the forcing term; also the value used on the first cycle.
const PCG_ETA_MAX: f64 = 1.0e-1;
/// Lower clamp of the forcing term.
const PCG_ETA_MIN: f64 = 1.0e-8;
/// Multiplier `γ` in the forcing term `γ·ratio^α`.
const PCG_GAMMA: f64 = 0.9;
/// Exponent `α` in the forcing term `γ·ratio^α`.
const PCG_ALPHA: f64 = 1.618_033_988_749_895;
/// Eisenstat–Walker adaptive forcing term for the inner PCG tolerance.
///
/// With a known previous outer KKT residual the next inner solve's relative
/// tolerance is `γ·(‖r_cur‖/‖r_prev‖)^α`, clamped to
/// `[PCG_ETA_MIN, PCG_ETA_MAX]`. On the first cycle (no previous residual) the
/// loose `PCG_ETA_MAX` is used so the solve is not over-tightened while the
/// iterate is still far from the optimum. A non-finite or negative current
/// norm, a non-finite or non-positive previous norm, or a non-finite ratio all
/// fall back to `JOINT_PCG_REL_TOL`.
fn joint_pcg_eisenstat_walker_forcing(prev_kkt_norm: Option<f64>, current_kkt_norm: f64) -> f64 {
    // The current norm is validated first, even when no previous norm exists.
    let current_usable = current_kkt_norm.is_finite() && current_kkt_norm >= 0.0;
    if !current_usable {
        return JOINT_PCG_REL_TOL;
    }
    match prev_kkt_norm {
        None => PCG_ETA_MAX,
        Some(previous) if !previous.is_finite() || previous <= 0.0 => JOINT_PCG_REL_TOL,
        Some(previous) => {
            let ratio = current_kkt_norm / previous;
            if !ratio.is_finite() || ratio < 0.0 {
                JOINT_PCG_REL_TOL
            } else {
                (PCG_GAMMA * ratio.powf(PCG_ALPHA)).clamp(PCG_ETA_MIN, PCG_ETA_MAX)
            }
        }
    }
}
/// Write the penalized joint Hessian product for `vector` into `out`.
///
/// Convenience front-end for
/// [`apply_joint_penalized_hessian_into_with_workspace`]: it allocates a fresh
/// zeroed penalty scratch of `vector.len()` entries on every call and forwards
/// all other arguments unchanged.
fn apply_joint_penalized_hessian_into(
    source: &JointHessianSource,
    ranges: &[(usize, usize)],
    s_lambdas: &[Array2<f64>],
    diagonal_ridge: f64,
    vector: &Array1<f64>,
    out: &mut Array1<f64>,
    joint_full_width: Option<&crate::families::joint_penalty::JointPenaltyBundle>,
) -> Result<(), String> {
    let scratch_len = vector.len();
    let mut one_shot_scratch: Array1<f64> = Array1::zeros(scratch_len);
    apply_joint_penalized_hessian_into_with_workspace(
        source,
        ranges,
        s_lambdas,
        diagonal_ridge,
        vector,
        out,
        &mut one_shot_scratch,
        joint_full_width,
    )
}
/// Workspace-reusing form of [`apply_joint_penalized_hessian_into`].
///
/// The penalty contribution is accumulated in the caller-owned
/// `penalty_scratch` instead of a per-call allocation, which suits hot loops
/// (e.g. the trust-region trial loop) that hoist both `penalty_scratch` and
/// `out` outside the loop.
///
/// `penalty_scratch` must have the same length as `vector`; it is zeroed and
/// overwritten on every call. An error from an operator-form Hessian source is
/// propagated before the penalty is applied.
fn apply_joint_penalized_hessian_into_with_workspace(
    source: &JointHessianSource,
    ranges: &[(usize, usize)],
    s_lambdas: &[Array2<f64>],
    diagonal_ridge: f64,
    vector: &Array1<f64>,
    out: &mut Array1<f64>,
    penalty_scratch: &mut Array1<f64>,
    joint_full_width: Option<&crate::families::joint_penalty::JointPenaltyBundle>,
) -> Result<(), String> {
    // Step 1: unpenalized Hessian product into `out`.
    match source {
        JointHessianSource::Operator { apply_into, .. } => {
            apply_into(vector, out)?;
        }
        JointHessianSource::Dense(h_joint) => {
            crate::faer_ndarray::fast_av_view_into(h_joint, vector, out.view_mut());
        }
    }

    // Step 2: penalty product, built from a clean scratch buffer.
    penalty_scratch.fill(0.0);
    apply_joint_block_penalty_into(
        ranges,
        s_lambdas,
        vector,
        diagonal_ridge,
        penalty_scratch,
        joint_full_width,
    );

    // Step 3: combine both contributions.
    *out += &*penalty_scratch;
    Ok(())
}
/// Diagonal ridge the joint solver should use: `base_diagonal_ridge` plus any
/// stabilizing shift computed for the dense penalized Hessian.
///
/// The base ridge is returned unchanged when the family requests the strict
/// SPD exact-Newton path or when the Hessian source is operator-form. For a
/// dense source the penalized matrix is assembled on a copy and passed to
/// `exact_newton_stabilizing_shift` with `ridge_floor`; a missing shift counts
/// as zero.
fn stabilized_joint_solver_diagonal_ridge<F: CustomFamily + ?Sized>(
    family: &F,
    source: &JointHessianSource,
    ranges: &[(usize, usize)],
    s_lambdas: &[Array2<f64>],
    base_diagonal_ridge: f64,
    ridge_floor: f64,
    joint_full_width: Option<&crate::families::joint_penalty::JointPenaltyBundle>,
) -> f64 {
    if use_exact_newton_strict_spd(family) {
        return base_diagonal_ridge;
    }
    let dense_hessian = match source {
        JointHessianSource::Dense(h_joint) => h_joint,
        JointHessianSource::Operator { .. } => return base_diagonal_ridge,
    };

    // Assemble H + S(λ) + ridge·I on a private copy.
    let mut penalized = dense_hessian.clone();
    add_joint_penalty_to_matrix(
        &mut penalized,
        ranges,
        s_lambdas,
        base_diagonal_ridge,
        joint_full_width,
    );

    let shift = exact_newton_stabilizing_shift(&penalized, ridge_floor).unwrap_or(0.0);
    if shift > 0.0 {
        log::debug!(
            "[PIRLS/joint-Newton] stabilized dense penalized Hessian with diagonal shift {:.3e}",
            shift
        );
    }
    base_diagonal_ridge + shift
}
/// Reduction predicted by the quadratic model for the step `delta`:
/// `rhsᵀδ − ½·δᵀ(H_pen δ)`, where `hpen_delta` is the precomputed product
/// `H_pen δ`.
fn joint_quadratic_predicted_reduction(
    rhs: &Array1<f64>,
    hpen_delta: &Array1<f64>,
    delta: &Array1<f64>,
) -> f64 {
    let linear_gain = rhs.dot(delta);
    let curvature = delta.dot(hpen_delta);
    linear_gain - 0.5 * curvature
}
/// Build a diagonally-preconditioned descent step for the joint system.
///
/// The candidate is `rhs ./ preconditioner`, where the preconditioner comes
/// from the Hessian diagonal plus the penalty diagonal. If that candidate has
/// a non-finite entry or does not satisfy `rhsᵀδ > 0`, `rhs` itself is used.
/// When the direction has positive finite slope and positive finite curvature
/// under the penalized Hessian, it is scaled by `slope / curvature` clamped to
/// `[1e-12, 1]`; otherwise it is returned unscaled.
fn joint_preconditioned_descent_delta(
    source: &JointHessianSource,
    ranges: &[(usize, usize)],
    s_lambdas: &[Array2<f64>],
    diagonal_ridge: f64,
    rhs: &Array1<f64>,
    joint_full_width: Option<&crate::families::joint_penalty::JointPenaltyBundle>,
) -> Result<Array1<f64>, String> {
    let hessian_diagonal = match source {
        JointHessianSource::Operator { diagonal, .. } => diagonal.clone(),
        JointHessianSource::Dense(h_joint) => h_joint.diag().to_owned(),
    };
    let preconditioner = joint_penalty_preconditioner_diag(
        &hessian_diagonal,
        ranges,
        s_lambdas,
        diagonal_ridge,
        joint_full_width,
    );

    let mut delta = rhs / &preconditioner;
    let unusable = !delta.iter().all(|v| v.is_finite()) || rhs.dot(&delta) <= 0.0;
    if unusable {
        // Fall back to the raw right-hand side as the direction.
        delta.assign(rhs);
    }

    let directional = rhs.dot(&delta);
    if !(directional.is_finite() && directional > 0.0) {
        return Ok(delta);
    }

    let mut hpen_delta = Array1::<f64>::zeros(rhs.len());
    apply_joint_penalized_hessian_into(
        source,
        ranges,
        s_lambdas,
        diagonal_ridge,
        &delta,
        &mut hpen_delta,
        joint_full_width,
    )?;
    let curvature = delta.dot(&hpen_delta);
    if curvature.is_finite() && curvature > 0.0 {
        // Exact minimizer along the direction, kept inside (0, 1].
        let alpha = (directional / curvature).clamp(1.0e-12, 1.0);
        delta.mapv_inplace(|v| alpha * v);
    }
    Ok(delta)
}
/// Evaluate only the log-likelihood at `states` for a line-search trial.
///
/// The second tuple slot (a joint-Hessian workspace) is always `None` here.
fn joint_line_search_log_likelihood<F: CustomFamily + Clone + Send + Sync + 'static>(
    family: &F,
    line_search_options: &BlockwiseFitOptions,
    states: &[ParameterBlockState],
) -> Result<(f64, Option<Arc<dyn ExactNewtonJointHessianWorkspace>>), String> {
    let log_likelihood = family.log_likelihood_only_with_options(states, line_search_options)?;
    Ok((log_likelihood, None))
}
/// Derive the options used for coefficient line-search trial evaluations.
///
/// Starts from a clone of `options`, disables automatic outer subsampling,
/// retags any outer evaluation context with `EvalScope::InnerCoefficient`
/// (keeping its `rho` and `eval_id`), and installs `early_exit_threshold`.
fn coefficient_line_search_options(
    options: &BlockwiseFitOptions,
    early_exit_threshold: f64,
) -> BlockwiseFitOptions {
    // `outer_score_subsample` is deliberately left as cloned: the trust-region
    // ratio ρ = [F(β) − F(β + δ)] / [−g·δ − ½·δᵀHδ] is only meaningful when
    // the trial objective and the Hessian/gradient use the same row measure.
    // Only *auto*-install is switched off, so no mask rebuild can happen
    // mid-iteration, and the scope is tagged InnerCoefficient so a sibling
    // auto-install path that is somehow reached bails out
    // (cf. `install_auto_outer_subsample_options`).
    let inner_scoped_context = options
        .outer_eval_context
        .as_ref()
        .map(|ctx| OuterEvalContext {
            rho: ctx.rho.clone(),
            eval_id: ctx.eval_id,
            scope: EvalScope::InnerCoefficient,
        });

    let mut tuned = options.clone();
    tuned.auto_outer_subsample = false;
    tuned.outer_eval_context = inner_scoped_context;
    tuned.early_exit_threshold = Some(early_exit_threshold);
    tuned
}
/// Result tuple of `load_joint_gradient_evaluation`, in order:
/// the log-likelihood, the joint gradient (if one was produced), the full
/// family evaluation (only populated on the `family.evaluate` fallback path),
/// and the joint-Hessian workspace that was used or passed through (if any).
type JointGradientLoad = (
f64,
Option<Array1<f64>>,
Option<FamilyEvaluation>,
Option<Arc<dyn ExactNewtonJointHessianWorkspace>>,
);
/// Obtain the log-likelihood and joint gradient at `states`, trying sources
/// in this order:
///
/// 1. a joint-Hessian workspace — the caller's `preferred_workspace`, or one
///    built from the family when `prefer_workspace` is set and the family
///    reports a workspace gradient is available;
/// 2. the family's direct joint gradient evaluation;
/// 3. a full `family.evaluate`, from which the gradient is extracted.
///
/// Only the last route returns the `FamilyEvaluation`. The workspace, when
/// there is one, is returned on every route.
fn load_joint_gradient_evaluation<F: CustomFamily + Clone + Send + Sync + 'static>(
    family: &F,
    specs: &[ParameterBlockSpec],
    options: &BlockwiseFitOptions,
    states: &[ParameterBlockState],
    prefer_workspace: bool,
    preferred_workspace: Option<Arc<dyn ExactNewtonJointHessianWorkspace>>,
) -> Result<JointGradientLoad, String> {
    let workspace = if preferred_workspace.is_some() {
        preferred_workspace
    } else if prefer_workspace && family.inner_joint_workspace_gradient_available(specs) {
        family.exact_newton_joint_hessian_workspace_with_options(states, specs, options)?
    } else {
        None
    };

    // Route 1: the workspace supplies the gradient itself.
    if let Some(active_workspace) = workspace.as_ref() {
        if let Some(from_workspace) = active_workspace.joint_gradient_evaluation()? {
            return Ok((
                from_workspace.log_likelihood,
                Some(from_workspace.gradient),
                None,
                Some(Arc::clone(active_workspace)),
            ));
        }
    }

    // Route 2: the family's dedicated joint gradient evaluation.
    if let Some(from_family) = family.exact_newton_joint_gradient_evaluation(states, specs)? {
        return Ok((
            from_family.log_likelihood,
            Some(from_family.gradient),
            None,
            workspace,
        ));
    }

    // Route 3: full evaluation, gradient extracted from it.
    let full_eval = family.evaluate(states)?;
    let log_likelihood = full_eval.log_likelihood;
    let gradient = exact_newton_joint_gradient_from_eval(&full_eval, specs, states)?;
    Ok((log_likelihood, gradient, Some(full_eval), workspace))
}
/// Unwrap the projected KKT residual, or fail with an
/// `UnsupportedConfiguration` error (rendered to `String`) prefixed by
/// `context` when the inner solve produced none.
fn require_projected_kkt_residual(
    residual: Option<ProjectedKktResidual>,
    context: &str,
) -> Result<ProjectedKktResidual, String> {
    residual.ok_or_else(|| {
        let refusal = CustomFamilyError::UnsupportedConfiguration {
            reason: format!(
                "{context}: converged joint-Newton exact inner solve did not produce a projected KKT \
                 residual; refusing to assemble REML/LAML derivatives without the IFT correction input"
            ),
        };
        let message: String = refusal.into();
        message
    })
}
/// Verdict returned by `constrained_stationary_certificate_decision`.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum ConstrainedStationaryCertificate {
/// The gating conditions (objective exhausted, step exhausted,
/// `linearized_rel >= 0.5`, scalar model error `<= 1e-3`) did not all hold.
NotCandidate,
/// Gating conditions held and the residual is finite and within the
/// accepted multiple of the residual tolerance.
Accept,
/// Gating conditions held but the residual is non-finite or above the
/// accepted band.
RefusePhantomMultiplier,
}
/// Scalar diagnostics of one joint-Newton step, consumed by the
/// convergence-certificate logic.
#[derive(Clone, Debug)]
struct JointNewtonMathDiagnostic {
    /// KKT residual inf-norm before the step; sets the scale in
    /// [`Self::linearized_rel`].
    old_kkt_inf: f64,
    /// Inf-norm of the linearized KKT residual predicted after the step.
    linearized_next_kkt_inf: f64,
    /// Reduction predicted by the quadratic model.
    predicted_reduction: f64,
    /// Reduction actually observed.
    actual_reduction: f64,
    /// Trust-region ratio recorded for the step.
    trust_ratio: f64,
    /// Inf-norm of the step (compared against the step tolerance by the
    /// certificate decision).
    step_inf: f64,
    /// Inf-norm of the proposed step.
    proposal_inf: f64,
}
impl JointNewtonMathDiagnostic {
    /// `|actual − predicted| / max(|predicted|, 1)`: the denominator is
    /// floored at one.
    fn scalar_model_relative_error(&self) -> f64 {
        let mismatch = (self.actual_reduction - self.predicted_reduction).abs();
        let scale = self.predicted_reduction.abs().max(1.0);
        mismatch / scale
    }

    /// Linearized next-step residual relative to `1 + old_kkt_inf`.
    fn linearized_rel(&self) -> f64 {
        let scale = 1.0 + self.old_kkt_inf;
        self.linearized_next_kkt_inf / scale
    }
}
/// Decide whether a stalled constrained iterate may be certified stationary.
///
/// The iterate is a *candidate* only when all four hold: the objective is
/// exhausted (`objective_change <= objective_tol`, or a supplied geometric
/// tail bound is `<= objective_tol`), the step is exhausted (finite
/// `step_inf <= step_tol`), `linearized_rel >= 0.5`, and the scalar model
/// relative error is `<= 1e-3`. Otherwise the result is `NotCandidate`.
///
/// A candidate is `Accept`ed when `residual` is finite and at most
/// `4 × residual_tol`, and refused as `RefusePhantomMultiplier` otherwise.
fn constrained_stationary_certificate_decision(
    math: &JointNewtonMathDiagnostic,
    objective_change: f64,
    objective_tol: f64,
    step_tol: f64,
    geometric_tail_bound: Option<f64>,
    residual: f64,
    residual_tol: f64,
) -> ConstrainedStationaryCertificate {
    let tail_exhausted = matches!(geometric_tail_bound, Some(tail) if tail <= objective_tol);
    let objective_exhausted = objective_change <= objective_tol || tail_exhausted;
    let step_exhausted =
        math.step_inf.is_finite() && step_tol.is_finite() && math.step_inf <= step_tol;
    let residual_is_multiplier_like = math.linearized_rel() >= 0.5;
    let model_is_exact = math.scalar_model_relative_error() <= 1e-3;

    let is_candidate =
        objective_exhausted && step_exhausted && residual_is_multiplier_like && model_is_exact;
    if !is_candidate {
        return ConstrainedStationaryCertificate::NotCandidate;
    }

    // A large linearized residual is ambiguous: it may be an honest
    // active-set multiplier, or an H-null / rank-deficient direction Newton
    // cannot move along. Only the projected KKT residual separates the two,
    // so certification is tied to the inner residual tolerance and granted
    // only when the active-set projection has really captured the multiplier.
    //
    // The band is a small MULTIPLE of `residual_tol` rather than exactly 1x.
    // By this point the iterate is already shown stationary (objective and
    // step exhausted, `linearized_rel >= 0.5` so the residual is
    // multiplier/null mass rather than a gradient defect, `scalar_relerr <=
    // 1e-3` so the quadratic model is exact). In that regime the
    // active-projected residual stalls at the conditioning/round-off floor;
    // for the survival baseline-hazard block (well-conditioned after the
    // data-seeded baseline, gam#797) that floor sits a hair above the
    // scale-relative `residual_tol`, so insisting on `<= tol` would leave a
    // fully stationary iterate uncertified. A 4x band certifies the genuinely
    // converged iterate while still rejecting a residual orders of magnitude
    // above tolerance — the real defect this guard exists to catch.
    let cert_residual_factor = 4.0;
    let acceptance_band = cert_residual_factor * residual_tol;
    let within_band = residual.is_finite() && residual <= acceptance_band;
    if within_band {
        ConstrainedStationaryCertificate::Accept
    } else {
        ConstrainedStationaryCertificate::RefusePhantomMultiplier
    }
}
/// Reports whether the recent KKT-residual tail (`history`, oldest→newest) is
/// in STEADY geometric descent: each entry must be strictly below
/// `(1 - min_drop)` times its predecessor, across the whole window.
///
/// The check separates a Newton direction that is still converging from a
/// genuine multiplier/null plateau at the certificate-refusal gate (gam#787
/// duchon centers≥20). The constrained-stationary refusal triggers on a flat
/// objective plus `linearized_rel ≥ 0.5`, yet both also hold for a logslope
/// block whose residual shrinks by a steady factor per cycle (objective
/// already at its Φ-bounded floor while the KKT residual is still being
/// polished); refusing there would discard the seed a few cycles short of
/// `residual_tol`. Demanding a steady drop over the full window, not a single
/// lucky decrease, stops a noisy near-plateau from being extended by mistake,
/// and the inner cycle cap still bounds the extra work.
///
/// Returns `false` for fewer than three entries, or if any compared entry is
/// non-finite or any predecessor is not strictly positive.
fn residual_in_steady_geometric_descent(history: &std::collections::VecDeque<f64>) -> bool {
    let window = history.len();
    if window < 3 {
        return false;
    }
    let min_drop = 0.1; // each cycle must cut the residual by ≥ 10%.
    let retained = 1.0 - min_drop;
    (1..window).all(|cycle| {
        let earlier = history[cycle - 1];
        let later = history[cycle];
        earlier.is_finite() && later.is_finite() && earlier > 0.0 && later < retained * earlier
    })
}
/// Inf-norm of the active-set-projected stationarity residual restricted to
/// the **range** of the joint penalized Hessian `H_pen = H + S(λ) + ridge·I`.
///
/// A penalized smooth whose penalty has a polynomial null space that the
/// censored / location-scale data does not pin down (TP / Bernstein trend
/// directions in a survival `time_transform` or `log_sigma` channel, gam#553)
/// leaves a residual lying entirely in `ker(H_pen)`. Along such a direction
/// the objective has neither curvature nor a constraint, so it is a genuinely
/// *free* gauge direction and not an unresolved KKT defect. The total residual
/// inf-norm then stays large indefinitely and the phantom-multiplier refusal
/// never clears, aborting the fit at REML startup although the iterate is
/// stationary on the whole identifiable (range) subspace.
///
/// The downstream outer IFT trace already strips the null-space component
/// through the projected pseudo-inverse `U_S·H_proj⁻¹·U_Sᵀ`, so only a
/// *range-space* residual component can bias the envelope gradient (see the
/// "do NOT soft-accept" investigation note at the certifier call site). The
/// range-space inf-norm returned here lets the certifier accept exactly when
/// that part is at tolerance, while a residual with mass in the curved
/// subspace still refuses.
///
/// Returns `None` when `total_p` is zero or the residual length differs from
/// it, when the penalized Hessian cannot be materialized or eigendecomposed,
/// when its spectral scale is zero or non-finite, or when it has no numerical
/// null space — in which case the caller keeps the strict total-residual
/// refusal (no null space ⇒ range = everything).
fn projected_residual_range_space_inf(
    projected_residual: &Array1<f64>,
    joint_hessian_source: &JointHessianSource,
    ranges: &[(usize, usize)],
    s_lambdas: &[Array2<f64>],
    ridge: f64,
    ridge_policy: RidgePolicy,
    total_p: usize,
) -> Option<f64> {
    if total_p == 0 || projected_residual.len() != total_p {
        return None;
    }

    // Dense penalized Hessian, symmetrized before the eigensolve.
    let mut penalized = materialize_joint_hessian_source(
        joint_hessian_source,
        total_p,
        "penalty-null-space certificate spectrum",
    )
    .ok()?;
    let ridge_in_model = ridge_policy.include_quadratic_penalty && ridge > 0.0;
    let model_diagonal_ridge = if ridge_in_model { ridge } else { 0.0 };
    add_joint_penalty_to_matrix(&mut penalized, ranges, s_lambdas, model_diagonal_ridge, None);
    symmetrize_dense_in_place(&mut penalized);

    let (evals, evecs) = FaerEigh::eigh(&penalized, Side::Lower).ok()?;
    let spectral_scale = evals
        .iter()
        .fold(0.0_f64, |running, lam| running.max(lam.abs()));
    if !(spectral_scale.is_finite() && spectral_scale > 0.0) {
        return None;
    }
    let cutoff = KKT_REFUSAL_RANK_TOL * spectral_scale;

    let has_null_space = evals.iter().any(|lam| lam.abs() < cutoff);
    if !has_null_space {
        // No data-unconstrained null space: the range is the whole space and
        // the strict total-residual refusal already governs. No relief.
        return None;
    }

    // Range-space residual = residual minus its projection onto ker(H_pen).
    // Equivalently, sum the residual's components along every eigenvector
    // with |λ| ≥ cutoff; the eigenbasis is orthonormal, so ‖P_range r‖∞ is
    // read directly off the reconstructed vector.
    let mut range_component = Array1::<f64>::zeros(total_p);
    for (k, lam) in evals.iter().enumerate() {
        if lam.abs() < cutoff {
            continue;
        }
        let mode = evecs.column(k);
        let coeff = mode.dot(projected_residual);
        range_component.scaled_add(coeff, &mode);
    }

    let range_inf = range_component
        .iter()
        .fold(0.0_f64, |running, entry| running.max(entry.abs()));
    Some(range_inf)
}
fn inner_blockwise_fit<F: CustomFamily + Clone + Send + Sync + 'static>(
family: &F,
specs: &[ParameterBlockSpec],
block_log_lambdas: &[Array1<f64>],
options: &BlockwiseFitOptions,
warm_start: Option<&ConstrainedWarmStart>,
) -> Result<BlockwiseInnerResult, String> {
// Inner-blockwise prelude waypoints. At large-scale n the cold-start
// path between function entry and the first PIRLS/JN cycle-summary
// log can run for many minutes (sometimes hours) silently while
// row-kernel workspace builds run. Emit a `[STAGE] PIRLS/inner`
// line at each transition so the next failed run pinpoints which
// named step holds time. Gated on large-scale n so small-fit
// tests stay quiet.
let inner_started = std::time::Instant::now();
let mut states = buildblock_states(family, specs)?;
refresh_all_block_etas(family, specs, &mut states)?;
let total_joint_p = specs.iter().map(|spec| spec.design.ncols()).sum::<usize>();
let total_joint_n = joint_observation_count(&states);
const INNER_PRELUDE_LOG_MIN_N: usize = 100_000;
let prelude_log = total_joint_n >= INNER_PRELUDE_LOG_MIN_N;
if prelude_log {
log::info!(
"[STAGE] PIRLS/inner step=buildblock_states+refresh_etas elapsed={:.3}s n={} p={} blocks={}",
inner_started.elapsed().as_secs_f64(),
total_joint_n,
total_joint_p,
specs.len(),
);
}
let matrix_free_joint_requested = use_joint_matrix_free_path(total_joint_p, total_joint_n)
|| family.prefers_matrix_free_inner_joint(specs, &states);
let has_workspace_source = family.inner_coefficient_hessian_hvp_available(specs);
// Probe the *spec-aware* joint Hessian: it is the canonical source of the
// coupled joint curvature. A family may override only
// `exact_newton_joint_hessian_with_specs` (the variant that has access to
// the realized block designs needed to assemble the cross-block
// `X_aᵀ diag(w_ab) X_b` blocks — e.g. the Dirichlet common-parameterization
// family, whose `evaluate` emits diagonal working sets so the spec-less
// default block assembler returns `None`). Routing the inner joint-Newton
// availability gate through the spec-less `exact_newton_joint_hessian`
// would then mis-classify such a family as "no joint Hessian" and drop it
// onto pure block-diagonal backfitting, which fails to reach KKT on small,
// concentrated coupled likelihoods. The `_with_specs` path subsumes the
// spec-less one for every family (single-block / uncoupled delegate
// identically), so it is the correct probe here.
let has_joint_exacthessian = if has_workspace_source {
true
} else {
family
.exact_newton_joint_hessian_with_specs(&states, specs)?
.is_some()
};
let coupled_exact_joint_required = specs.len() >= 2
&& !family.likelihood_blocks_uncoupled()
&& (family.has_explicit_joint_hessian() || has_workspace_source);
// Multi-block families have always taken the joint path when an exact
// joint Hessian is available. Single-block families also take it when a
// coefficient-Hessian workspace is wired; dense vs. operator form is a
// later representation choice, not a cache-construction gate.
let use_joint_newton = has_joint_exacthessian && (specs.len() >= 2 || has_workspace_source);
let joint_workspace_requested = use_joint_newton && has_workspace_source;
let inner_tol = options.inner_tol;
let inner_max_cycles_base = options.inner_max_cycles;
// Per-outer-call inner-cycle cap. The earlier "adaptive inner cycle
// cap" doubled this mid-loop on plateaus, but that turned out to be
// the wrong response to stalled descent (descent ratios pinned at
// ~0.999 paired with a sub-tolerance objective change is the
// no-descent signal, not a "give Newton more cycles" signal). The
// plateau-flat-objective convergence certificate in the inner-cycle
// body now handles that case directly, so the cap stays fixed at the
// baseline for the lifetime of this outer call.
let inner_max_cycles = capped_inner_max_cycles(options, inner_max_cycles_base);
// Each block's assembled penalty matrix depends only on that block's
// penalties and smoothing parameters. Build these setup matrices in
// parallel, but keep the coordinate-descent and line-search loops below
// strictly serial because each accepted block update changes the state seen
// by later blocks.
use rayon::iter::{IntoParallelIterator, ParallelIterator};
let s_lambdas_launch_started = std::time::Instant::now();
let s_lambdas_par_iter = (0..specs.len()).into_par_iter().map(|b| {
let spec = &specs[b];
let Some(block_log_lambda) = block_log_lambdas.get(b) else {
return Err(CustomFamilyError::UnsupportedConfiguration {
reason: format!("missing log-smoothing parameter vector for block {b}"),
}
.into());
};
if block_log_lambda.len() != spec.penalties.len() {
return Err(CustomFamilyError::DimensionMismatch {
reason: format!(
"block {b} log-smoothing parameter length {} does not match penalties {}",
block_log_lambda.len(),
spec.penalties.len()
),
}
.into());
}
let p = spec.design.ncols();
let lambdas = block_log_lambda.mapv(f64::exp);
let mut s_lambda = Array2::<f64>::zeros((p, p));
for (k, s) in spec.penalties.iter().enumerate() {
s.add_scaled_to(lambdas[k], &mut s_lambda);
}
Ok(s_lambda)
});
let s_lambdas_collect_started = std::time::Instant::now();
let s_lambdas_launch_elapsed = s_lambdas_launch_started.elapsed();
let s_lambdas = s_lambdas_par_iter.collect::<Result<Vec<_>, String>>()?;
if prelude_log {
log::info!(
"[STAGE] PIRLS/inner step=s_lambdas par_iter launch={:.3}s collect={:.3}s blocks={} (since inner-start={:.3}s)",
s_lambdas_launch_elapsed.as_secs_f64(),
s_lambdas_collect_started.elapsed().as_secs_f64(),
specs.len(),
inner_started.elapsed().as_secs_f64(),
);
}
let ridge = effective_solverridge(options.ridge_floor);
let joint_bundle: Option<&crate::families::joint_penalty::JointPenaltyBundle> =
options.joint_penalties.as_deref();
if let Some(bundle) = joint_bundle {
for (i, spec) in bundle.specs.iter().enumerate() {
if spec.dim() != total_joint_p {
return Err(format!(
"joint penalty {i}: dim {} != total compiled p {}",
spec.dim(),
total_joint_p,
));
}
}
if bundle.specs.len() != bundle.log_lambdas.len() {
return Err(format!(
"joint penalty bundle: {} specs vs {} log_lambdas",
bundle.specs.len(),
bundle.log_lambdas.len(),
));
}
}
let mut cached_active_sets: Vec<Option<Vec<usize>>> = vec![None; specs.len()];
if let Some(seed) = warm_start
&& seed.block_beta.len() == states.len()
&& seed.active_sets.len() == states.len()
{
if warm_start_matches_block_log_lambdas(seed, block_log_lambdas)
&& let Some(cached) = seed.cached_inner.as_ref()
&& cached.converged
&& seed
.block_beta
.iter()
.zip(&states)
.all(|(beta_seed, state)| beta_seed.len() == state.beta.len())
{
for (state, beta_seed) in states.iter_mut().zip(&seed.block_beta) {
state.beta.assign(beta_seed);
}
cached_active_sets = seed.active_sets.clone();
refresh_all_block_etas(family, specs, &mut states)?;
log::info!(
"[PIRLS/joint-Newton warm-start] reused cached same-rho inner mode | cycles={} logdet_h={:.6e} logdet_s={:.6e}",
cached.cycles,
cached.block_logdet_h,
cached.block_logdet_s,
);
return Ok(BlockwiseInnerResult {
block_states: states,
active_sets: normalize_active_sets(cached_active_sets),
log_likelihood: cached.log_likelihood,
penalty_value: cached.penalty_value,
cycles: cached.cycles,
converged: cached.converged,
block_logdet_h: cached.block_logdet_h,
block_logdet_s: cached.block_logdet_s,
s_lambdas,
joint_workspace: cached.joint_workspace.clone(),
kkt_residual: cached.kkt_residual.clone(),
active_constraints: cached.active_constraints.clone(),
});
}
// Cold-start path: copy prior β where dimensions match
// (best-effort; mismatched blocks keep the freshly-built
// initial state).
for (b, beta_seed) in seed.block_beta.iter().enumerate() {
if beta_seed.len() == states[b].beta.len() {
let beta_projected =
family.post_update_block_beta(&states, b, &specs[b], beta_seed.clone())?;
states[b].beta.assign(&beta_projected);
}
}
cached_active_sets = seed.active_sets.clone();
refresh_all_block_etas(family, specs, &mut states)?;
}
let load_joint_started = std::time::Instant::now();
if prelude_log {
log::info!(
"[STAGE] PIRLS/inner step=load_joint_gradient_evaluation begin use_joint_newton={} joint_workspace_requested={} (since inner-start={:.3}s)",
use_joint_newton,
joint_workspace_requested,
inner_started.elapsed().as_secs_f64(),
);
}
let (
mut current_log_likelihood,
mut cached_eval,
mut cached_joint_gradient,
mut cached_joint_workspace,
) = if use_joint_newton {
let (log_likelihood, gradient, eval, workspace) = load_joint_gradient_evaluation(
family,
specs,
options,
&states,
joint_workspace_requested,
None,
)?;
(log_likelihood, eval, gradient, workspace)
} else {
let eval = family.evaluate(&states)?;
let log_likelihood = eval.log_likelihood;
(log_likelihood, Some(eval), None, None)
};
if prelude_log {
log::info!(
"[STAGE] PIRLS/inner step=load_joint_gradient_evaluation end elapsed={:.3}s log_likelihood={:.6e} has_gradient={} has_workspace={}",
load_joint_started.elapsed().as_secs_f64(),
current_log_likelihood,
cached_joint_gradient.is_some(),
cached_joint_workspace.is_some(),
);
}
// Validate exact-Newton block Hessians at the family-evaluation
// boundary. A non-finite entry is a contract violation against the
// family's analytic second derivative; refuse to iterate before
// any factorization rather than letting it slip through to a
// downstream logdet check that may be gated off by the outer
// optimizer's flags.
let validate_started = std::time::Instant::now();
if let Some(eval) = cached_eval.as_ref() {
validate_block_hessians_finite(eval)?;
}
if prelude_log {
log::info!(
"[STAGE] PIRLS/inner step=validate_block_hessians_finite elapsed={:.3}s checked={}",
validate_started.elapsed().as_secs_f64(),
cached_eval.is_some(),
);
}
let penalty_started = std::time::Instant::now();
let mut current_penalty = total_quadratic_penalty(
&states,
&s_lambdas,
ridge,
options.ridge_policy,
joint_bundle,
Some(specs),
);
if prelude_log {
log::info!(
"[STAGE] PIRLS/inner step=total_quadratic_penalty elapsed={:.3}s penalty={:.6e} (prelude_total={:.3}s)",
penalty_started.elapsed().as_secs_f64(),
current_penalty,
inner_started.elapsed().as_secs_f64(),
);
}
let mut lastobjective = -current_log_likelihood + current_penalty;
let mut converged = false;
let mut cycles_done = 0usize;
// Pre-allocate per-block eta backup buffers to avoid O(n) allocation
// per block per cycle in the backtracking line search.
let mut eta_backups: Vec<Array1<f64>> =
states.iter().map(|s| Array1::zeros(s.eta.len())).collect();
// ── Joint Newton fast path ──
//
// When the family provides an exact joint Hessian (GAMLSS location-scale),
// solve the full (p_mu + p_ls) × (p_mu + p_ls) system in one Newton step
// per cycle instead of iterating between blocks. This converges quadratically
// (5-10 steps) instead of linearly (20-100+ blockwise cycles).
//
// Generic block-diagonal surrogate families may still fall back to
// blockwise iteration if the joint surrogate is unavailable. Families that
// advertise a real coupled joint Hessian must not: the blockwise loop only
// sees principal blocks, so it drops the cross-block curvature that makes
// the joint problem well conditioned near saturated optima.
// `last_residual_tol` mirrors the per-cycle KKT tolerance computed inside
// the joint-Newton loop (`inner_tol · (1 + max(‖∇L‖∞, ‖Sβ‖∞))`). It must
// live at function scope so both the post-converged exit block inside
// `if use_joint_newton` AND the post-block-fit IFT residual builder
// outside that branch can thread the same tolerance into the
// `ProjectedKktResidual::with_metadata(...)` builder. Seed at `inner_tol`
// so a path that skips the loop entirely (no joint-Newton, or zero
// cycles) still records a finite, non-NaN tolerance on the residual
// carrier rather than NaN.
let mut last_residual_tol: f64 = inner_tol;
if use_joint_newton {
// Build block ranges for the joint system.
let ranges: Vec<(usize, usize)> = {
let mut offset = 0;
specs
.iter()
.map(|s| {
let start = offset;
offset += s.design.ncols();
(start, offset)
})
.collect()
};
let total_p: usize = ranges.last().map_or(0, |r| r.1);
// Universal full-span Jeffreys/Firth robustness. Build `Z_J` once and
// use the same term in the coupled Newton step, objective value, and
// stationarity checks so a near-separating coefficient is bounded by
// the likelihood's own Fisher geometry instead of an ad-hoc ridge.
// `None` (empty coefficient system) leaves every step and objective at
// the un-augmented inner Newton.
//
// Continuous-response families (the canonical example: transformation-
// normal h(Y|x) ~ N(0,1)) opt out via
// `joint_jeffreys_term_required() = false`. They have no separation
// regime, the Fisher information is `O(n)` on every identified
// direction by construction, and each Jeffreys evaluation costs
// `p` directional-derivative calls into the family's exact joint
// Hessian — at large scale (CTN duchon16d, p=144, n=20000) that
// is the dominant per-cycle cost (~200 s/cycle on three calls per
// cycle), exhausting the inner budget before the algorithm converges
// while contributing essentially zero to the gradient/curvature.
let joint_jeffreys_subspace = if family.joint_jeffreys_term_required() {
build_joint_jeffreys_subspace(specs, &ranges)?
} else {
None
};
// FIRTH MERIT BOOKKEEPING (gam#826/#872 — per-cycle Φ fold, not a carried
// value). `current_penalty` / `lastobjective` hold ONLY the quadratic
// penalty `½βᵀSβ` (NO Φ). The Firth value `−Φ` is folded into the
// accept/reject comparison FRESH at each β under the same
// `jeffreys_skippable_this_cycle` gate the step and KKT residual use, so
// `old_objective` (old β) and `trialobjective` (trial β) are always on the
// same objective `−ℓ + ½βᵀSβ − Φ` regardless of whether a cycle skips the
// term. Carrying Φ in `current_penalty` (the previous design) desynced
// old-vs-trial by ±Φ whenever the per-cycle skippable decision flipped —
// and the cycle-0 baseline folded Φ UNCONDITIONALLY while the trial folded
// it gated, so a skippable cycle 0 saw a spurious `Δobj = ±Φ`, rejected
// every backtrack, and refused as a `phantom_multiplier` at a zero step
// (the binomial location-scale coupled non-convergence). SIGN: Firth ADDS
// ½log|I| to the log-likelihood ⇒ the NLL objective SUBTRACTS Φ, matching
// the Newton step rhs / KKT residual which ADD `∇Φ` to `∇L − Sβ`.
let joint_mode_diagonal_ridge =
if ridge > 0.0 && options.ridge_policy.include_quadratic_penalty {
ridge
} else {
0.0
};
// Exact joint Newton steps are guarded by two independent mechanisms:
// family-owned feasibility (`max_feasible_step_size`) and the adaptive
// trust region below. There is intentionally no family hook for a
// hard per-attempt coefficient-space clamp; keeping the policy local
// avoids stale no-op configuration and makes the trust-region behavior
// explicit at the only place it is used.
// Cross-cycle convergence carry-over: set at the end of every
// accepted cycle so the next cycle can distinguish a true KKT
// optimum on a rank-deficient null mode (objective stuck
// because every direction is along the null space) from
// genuine non-convergence. The residual signal does not need
// a carry-over — `residual <= residual_tol` is the canonical
// KKT certificate and the end-of-cycle test consumes it
// directly when it fires.
// Predicted-reduction tracker for the principled trust-region
// stopping criterion (Conn-Gould-Toint, *Trust-Region Methods*,
// Theorem 6.4.6). The Newton model at the accepted step has a
// predicted decrease `m(0) − m(δ) = −g·δ − 0.5·δ·H·δ`. For an
// unclipped Newton step (H·δ = −g) this is `0.5·g·H⁻¹·g`, the
// Newton decrement squared / 2. When the model itself predicts
// a decrease smaller than the objective tolerance, no descent
// direction the Hessian can resolve will lower the objective
// by more than `objective_tol`, and continuing is wall-clock
// waste regardless of whether the raw gradient residual or
// step-norm gates have closed.
//
// Cross-cycle convergence carry-over: set at the end of every
// accepted cycle so the next cycle's line-search-failure path
// can distinguish a true KKT optimum on a rank-deficient
// Hessian (no meaningful trial step, even though step_inf is
// O(1) along the null mode) from genuine non-convergence.
let mut last_cycle_residual_below_tol = false;
let mut last_cycle_obj_change_below_tol = false;
let mut joint_trust_radius = 1.0_f64;
let mut joint_block_trust_radii = vec![1.0_f64; ranges.len()];
let mut last_accepted_hit_joint_trust_boundary = false;
// Hard upper bound for the for-loop's range. The cap is fixed at
// `inner_max_cycles` for the lifetime of this outer call (the
// earlier mid-loop cap extension was removed in favor of the
// plateau-flat-objective convergence certificate), but the
// sentinel pattern is retained — the `.max(200)` floor is a
// harmless safety pad and the explicit `cycle >= inner_max_cycles`
// break keeps the existing `continue` statements in the body
// working (they advance `cycle` via the iterator), unlike a
// `while` + manual-counter rewrite.
let inner_loop_hard_ceiling = inner_max_cycles.max(200);
// Verbose cadence for the inner joint-Newton log block. Boring cycles
// (first-attempt accepts with no convergence event) emit ONE compact
// one-liner instead of the 4-line pre-cycle/TR/cycle-summary/convergence
// block. Verbose cycles (first, last, every JOINT_LOG_VERBOSE_PERIOD-th,
// all rejections, convergence events) keep the full detail. With the
// period at 50, a 200-cycle inner solve hits the periodic cadence only
// 4 times, so the log stays close to 1 compact line per cycle instead
// of the ~800 lines of the full 4-line block on every cycle.
const JOINT_LOG_VERBOSE_PERIOD: usize = 50;
// Residual-stall detector for joint Newton. Distinct from the
// blockwise loglik-frozen divergence detector lower in the file:
// that one requires the log-likelihood to be unchanged for K
// cycles AND the per-block Newton step pinned at the cap.
//
// Large-scale survival marginal-slope hits a different pattern —
// the joint objective decreases monotonically by O(1) per cycle
// (so loglik is NOT frozen), the TR repeatedly clamps proposals
// with |prop|∞ >> trust_radius, and the post-step KKT residual
// oscillates in a band orders of magnitude above residual_tol
// without trending down. Burning the rest of the cycle budget on
// this pattern reaches inner_max_cycles "non-converged", which
// then drops the outer optimizer into the first-order bridge
// fallback with a stale-mode gradient whose norm (‖g‖ ≈ 10⁷) kills
// the BFGS line search at iter 0.
//
// Track the best residual seen and the number of cycles since
// any meaningful improvement (≥10% drop). Once we've burned at
// least RESIDUAL_STALL_MIN_CYCLES with no improvement AND the
// TR has been clamping aggressively, exit `converged=false` so
// the outer optimizer sees a non-converged signal while we still
// have a finite, in-range β to return (instead of running to the
// hard ceiling and then handing BFGS a junk gradient).
const RESIDUAL_STALL_NO_IMPROVE_CYCLES: usize = 30;
const RESIDUAL_STALL_MIN_CYCLES: usize = 40;
const RESIDUAL_STALL_IMPROVEMENT_FACTOR: f64 = 0.9;
const RESIDUAL_STALL_BLOCK_GRADIENT_FACTOR: f64 = 50.0;
let mut best_residual_seen: f64 = f64::INFINITY;
let mut cycles_since_residual_improved: usize = 0;
// Number of consecutive non-improving cycles after which the
// conditioning-based self-vanishing Levenberg–Marquardt damping is
// ARMED inside the spectral-range Newton solve, for EVERY family
// (#826/#808). The undamped range-restricted Newton step oscillates on a
// full-rank-but-ill-conditioned penalized Hessian at the oversmoothed-ρ
// operating point: the tiny-but-above-cutoff curvature of the lightly
// identified mean/threshold/wiggle block takes an enormous `component/λ`
// proposal that the trust region clips every cycle, so the residual on
// that block freezes while its β stays ≈0 (the exact #826 signature).
// The conditioning-gated `μ = c·‖∇L − Sβ‖∞` caps that component into a
// bounded descent step. It is SELF-VANISHING (μ → 0 as the residual → 0)
// so the converged β and the KKT certificate are byte-identical to the
// undamped solve — zero REML/LAML bias. Arming it on OBSERVED non-
// progress rather than a static per-family flag keeps the AFT /
// constant-scale endgame (which converges quadratically and never
// stalls) byte-identical: a quadratically-converging solve reaches
// tolerance in a handful of cycles and never trips this threshold, so μ
// is never engaged there. Only a genuinely oscillating ill-conditioned
// solve crosses it, which is exactly when the damping is sound. Set a
// few cycles below the stall-exit window so the damping gets a chance to
// rescue the solve well before the early-exit / budget tripwire fires.
// (The conditioning-gated self-vanishing μ this armed now lives ONLY in the
// test-retained `solve_joint_newton_step_on_spectral_range`; the production
// joint step takes the exact trust-region multiplier λ instead — gam#979.)
// Recent KKT-residual values (oldest→newest) used to detect STEADY
// geometric descent at the certificate-refusal gate. A still-converging
// Newton direction (residual dropping by a steady factor < 1 each cycle)
// must not be misclassified as a multiplier/null plateau and exited
// early (gam#787 duchon centers≥20: the logslope block converges
// geometrically — residual ~0.33×/cycle — but `linearized_rel ≥ 0.5`
// routed it into the plateau-refusal break a few cycles short of tol).
const RESIDUAL_DESCENT_WINDOW: usize = 3;
let mut residual_descent_history: std::collections::VecDeque<f64> =
std::collections::VecDeque::with_capacity(RESIDUAL_DESCENT_WINDOW);
let mut tr_clamped_during_stall: bool = false;
// Fully-rejected stall guard. The residual-stall guard below
// (post-grad-reload) only fires on cycles that produced an accepted
// step, because every termination check it gates lives after the
// `if !accepted { continue; }` exit at the bottom of the trust-region
// attempt loop. When every cycle in a row is fully rejected — all
// JOINT_TRUST_MAX_ATTEMPTS trial steps fail the line-search check —
// none of those guards ever see the iterate, the cycle loop spins
// up to `inner_loop_hard_ceiling` cycles, and the inner solver burns
// ~120 s of wall-clock per outer ρ-evaluation that the outer
// optimizer will reject anyway. The signature is exact and local:
// (i) every trust attempt this cycle was rejected on the actual
// objective check (`objective_rejects == JOINT_TRUST_MAX_ATTEMPTS`,
// `model_rejects == 0`, `likelihood_rejects == 0`), AND (ii) the joint
// trust radius has NOT shrunk relative to the previous fully-rejected
// cycle. Condition (ii) is what proves no progress is possible: β is
// reverted to its pre-cycle value on every fully-rejected cycle, so
// with an identical Newton system AND an identical trust radius the
// next cycle's trust-region search is byte-deterministically the
// same as this one's. The radius can stall above the 1e-12 floor
// when `shrink_active_joint_block_trust_radii` only shrinks blocks
// that hit their per-block boundary — an interior block keeps its
// radius forever, so `max(block_radii)` is held by that block while
// the boundary block's radius collapses to 1e-12 without changing
// the max. After `FULLY_REJECTED_STALL_MAX_CYCLES` consecutive cycles
// with both conditions, exit non-converged so the outer optimizer
// rejects this ρ cleanly instead of waiting for the cycle cap.
const FULLY_REJECTED_STALL_MAX_CYCLES: usize = 8;
let mut prev_rejected_trust_radius: Option<f64> = None;
let mut consecutive_held_rejected_cycles: usize = 0;
let mut last_joint_math: Option<JointNewtonMathDiagnostic> = None;
// Cross-cycle cache of the joint Jeffreys/Firth triple `(β_key, ∇Φ, H_Φ)`
// (gam#729/#826/#808). Computing `(∇Φ, H_Φ)` costs `p` family
// directional-derivative calls plus the `½ S Sᵀ` GEMM; for a K-block
// coupled family that is the dominant per-inner-cycle cost. The post-step
// KKT residual recomputes the triple at the just-accepted β; the NEXT
// cycle's head needs the SAME triple at that SAME β. Carry it forward
// keyed on the flattened β so the head reuses the post-step result instead
// of recomputing — collapsing two O(p)-directional-derivative evaluations
// per accepted cycle to one. The key is an exact-equality check on the
// flattened β (β is byte-identical between an accepted post-step residual
// and the next head), so the reused term is the exact term at the current
// iterate — no staleness, no tolerance fudge.
let mut jeffreys_triple_cache: Option<(Array1<f64>, Array1<f64>, Array2<f64>)> = None;
// Stash for the structured cert-REFUSED report computed inside the
// cycle loop, so the post-loop bubbled error (`coupled exact-joint
// inner solve exited the joint Newton path …`) can emit the same
// per-block + spectrum breakdown without re-materializing H_pen.
let mut last_kkt_refusal_report: Option<KktRefusalReport> = None;
let mut prev_kkt_norm: Option<f64> = None;
// Plateau streak on |Δobj| ≤ objective_tol. The scale-aware
// flatness predicate stays local to this loop; the streak/window
// discipline (grow on flat, reset on recovery) is the shared
// loop_guard::FlatStreak so it cannot drift from the other
// stagnation detectors in the tree (#968).
let mut obj_flat_streak = crate::solver::loop_guard::FlatStreak::new(
crate::solver::loop_guard::PLATEAU_DEFAULT_WINDOW,
);
// Total descent budget across the joint-Newton loop, used by
// the end-of-loop summary to report `descent_total`.
let initial_joint_objective: f64 = lastobjective;
// Per-cycle |Δobjective| history for the geometric-tail trigger of
// the constrained-stationary certificate below. When the cycles
// settle into a linear-rate plateau (|Δobj_next| / |Δobj_prev|
// approaching 1 monotonically over the window), the total
// *remaining* objective descent is rigorously bounded above by the
// geometric series sum |Δobj_now| / (1 − max_ratio). When that
// bound is below `objective_tol` the cert can fire many cycles
// earlier than waiting for any single |Δobj| to individually
// cross obj_tol — the bound is mathematically the same precision
// contract, applied to the asymptotic tail rather than one step.
const GEOMETRIC_TAIL_WINDOW: usize = 5;
let mut geometric_tail_history: std::collections::VecDeque<f64> =
std::collections::VecDeque::with_capacity(GEOMETRIC_TAIL_WINDOW);
// The exact joint-Hessian route solves the penalized Newton system
// directly. Extra damping must be wired through an accepted/rejected
// step policy before it belongs here; keep the matvec faithful to the
// objective until then.
for cycle in 0..inner_loop_hard_ceiling {
if cycle >= inner_max_cycles {
break;
}
let verbose_cycle = cycle == 0
|| cycle + 1 == inner_max_cycles
|| (cycle + 1) % JOINT_LOG_VERBOSE_PERIOD == 0;
// Pre-cycle header line removed: the post-cycle one-liner below
// carries cycle/objective/Δobj/step/residual/time and on verbose
// cadence the expanded convergence line additionally carries
// -loglik and penalty. Suppressing this avoids emitting a second
// info-level line per cycle just to repeat numbers we already
// log at end of cycle.
// Per-cycle phase-timing accumulators. Surface where the inner
// joint-Newton spends time so an 18-min silent cycle 0 (the
// bernoulli marginal-slope FLEX large-scale failure mode) becomes a
// logged timeline at the end of the cycle. Phases:
// * hessian: joint Hessian source build (matrix-free workspace
// OR dense fallback assembly)
// * pcg: matrix-free QP solve via solve_spd_pcg_with_info_into
// (already logs its own diagnostics; we accumulate
// here for the end-of-cycle summary)
// * line_search: backtracking step-size search (up to 8 attempts)
// * grad_reload: post-accept joint gradient + workspace refresh
let cycle_started = std::time::Instant::now();
// Top-of-cycle row-measure capture. The trust-region ratio
// ρ = [F(β) − F(β + δ)] / [−g·δ − ½·δᵀHδ] is only meaningful when
// every input (Hessian, gradient, objective at β, trial objective
// at β + δ) is evaluated against the same row measure. We freeze
// the measure here and re-read it at each of the four sites later
// in the cycle, then hard-fail (Err) just before ρ if any of them
// diverged. Cf. `src/solver/row_measure.rs`.
let tr_row_measure_top =
crate::solver::row_measure::RowMeasure::from_options(options, total_joint_n);
let hessian_started = std::time::Instant::now();
log::info!(
"[joint-newton-tr] phase=hessian_qp cycle={} r={:.3e}",
cycle,
joint_trust_radius,
);
let cycle_log = prelude_log;
let constraints_started = std::time::Instant::now();
let block_constraints = collect_block_linear_constraints(family, &states, specs)?;
let joint_constraints =
assemble_joint_linear_constraints(&block_constraints, &ranges, total_p)?;
if cycle_log && cycle == 0 {
log::info!(
"[STAGE] PIRLS/inner step=cycle0 block+joint constraints elapsed={:.3}s n={} p={}",
constraints_started.elapsed().as_secs_f64(),
total_joint_n,
total_p,
);
}
let workspace_build_started = std::time::Instant::now();
// Get joint Hessian and block gradients from the current evaluation.
let hessian_workspace_for_cycle: Option<Arc<dyn ExactNewtonJointHessianWorkspace>> =
None;
let joint_hessian_source = if joint_workspace_requested {
let cached_hit = cached_joint_workspace.is_some();
let workspace = match cached_joint_workspace.take() {
Some(workspace) => Some(workspace),
None => family.exact_newton_joint_hessian_workspace_with_options(
&states, specs, options,
)?,
};
if cycle_log && cycle == 0 {
log::info!(
"[STAGE] PIRLS/inner step=cycle0 hessian-workspace cached_hit={} elapsed={:.3}s n={} p={}",
cached_hit,
workspace_build_started.elapsed().as_secs_f64(),
total_joint_n,
total_p,
);
}
workspace
.as_ref()
.map(|workspace| {
exact_newton_joint_hessian_source_from_workspace(
workspace,
total_p,
MaterializationIntent::InnerSolve,
"joint Newton inner exact-newton operator mismatch",
)
})
.transpose()?
.flatten()
} else {
None
};
// Row measure observed by the Hessian build above.
let tr_row_measure_hessian =
crate::solver::row_measure::RowMeasure::from_options(options, total_joint_n);
let joint_hessian_source = match joint_hessian_source {
Some(source) => source,
None => {
// Spec-aware joint Hessian: canonical coupled-curvature
// source (see the availability gate above). Families that
// only override `_with_specs` (Dirichlet common-parameter)
// would otherwise hand back `None` from the spec-less
// default and silently drop off the joint-Newton path.
let h_joint_opt =
family.exact_newton_joint_hessian_with_specs(&states, specs)?;
let Some(h_joint) = h_joint_opt else {
break; // Fall back to blockwise if joint Hessian unavailable
};
match symmetrized_square_matrix(
h_joint,
total_p,
"joint Newton inner exact-newton Hessian shape mismatch",
) {
Ok(matrix) => JointHessianSource::Dense(matrix),
Err(_) => break,
}
}
};
// Concatenate block gradients and betas.
let Some(grad_joint) = cached_joint_gradient.clone() else {
break;
};
// Row measure observed by the gradient at β. `cached_joint_gradient`
// was loaded earlier under `options`; if the auto-subsample
// installer or any sibling path swapped the mask between then and
// now, the id captured here will diverge from the rest and the
// pre-ρ check below will Err. Cf. `src/solver/row_measure.rs`.
let tr_row_measure_gradient =
crate::solver::row_measure::RowMeasure::from_options(options, total_joint_n);
if grad_joint.len() != total_p {
break;
}
let mut beta_joint = Array1::<f64>::zeros(total_p);
for b in 0..specs.len() {
let (start, end) = ranges[b];
beta_joint
.slice_mut(ndarray::s![start..end])
.assign(&states[b].beta);
}
let trace_diagonal_ridge = joint_mode_diagonal_ridge + JOINT_TRACE_STABILITY_RIDGE;
let joint_hessian_is_dense =
matches!(&joint_hessian_source, JointHessianSource::Dense(_));
let joint_solver_diagonal_ridge = stabilized_joint_solver_diagonal_ridge(
family,
&joint_hessian_source,
&ranges,
&s_lambdas,
trace_diagonal_ridge,
options.ridge_floor,
joint_bundle,
);
// CHEAP CONDITIONING PRE-CHECK (always-on robustness, zero-cost on
// easy/large fits). Before paying for the dense joint-Hessian
// materialization + `O(p³)` reduced eigendecomposition inside the
// Jeffreys term, ask whether the term is PROVABLY skippable from a few
// matrix-free Hessian-vector products against the source we just built.
// When `true`, the exact conditioning gate is certain to return the
// zero term, so every Jeffreys call this cycle short-circuits to the
// exact-zero contribution WITHOUT forming anything dense — byte-
// identical to the gated-off path, and preserving the matrix-free path
// on wide well-conditioned fits. Only runs the estimate when a Jeffreys
// subspace exists and `total_p` is wide enough that the dense eigh is
// the cost we want to avoid (the helper itself gates on the size
// threshold and conservatively returns `false` if unsure). Computed
// once per inner cycle and reused across the cycle's head-KKT, step,
// and trial-value calls; the conditioning changes slowly across cycles
// so re-estimating per cycle (one `O(p·k)` burst) is already cheap
// against the work it guards.
let jeffreys_skippable_this_cycle: bool = if options.seed_screening {
// Seed screening only ranks seeds: skip the O(p · per-axis-Hdot)
// full Jeffreys gradient/curvature loop. The value-only Jeffreys
// term (folded into the objective baseline / trial penalties via
// `custom_family_joint_jeffreys_value`, gated independently on
// `joint_jeffreys_subspace.is_some()`) still bounds the screening
// score on separating directions; only the per-axis step curvature
// — the wrong cost class for ranking on a K-block coupled family —
// is dropped here (gam#729/#808).
true
} else if joint_jeffreys_subspace.is_some() {
jeffreys_term_skippable_for_source(&joint_hessian_source, total_p).unwrap_or(false)
} else {
false
};
let joint_trust_metric_diag = match &joint_hessian_source {
JointHessianSource::Dense(h_joint) => joint_penalty_preconditioner_diag(
&h_joint.diag().to_owned(),
&ranges,
&s_lambdas,
joint_solver_diagonal_ridge,
joint_bundle,
),
JointHessianSource::Operator { diagonal, .. } => joint_penalty_preconditioner_diag(
diagonal,
&ranges,
&s_lambdas,
joint_solver_diagonal_ridge,
joint_bundle,
),
};
// HEAD-β JEFFREYS CACHE (gam#729/#808). The full Jeffreys/Firth triple
// `(Φ, ∇Φ, H_Φ)` costs `p` family directional-derivative calls (the
// `for k in 0..p` loop in `joint_jeffreys_term`); for a K-block coupled
// family (Dirichlet/multinomial) that is the dominant per-cycle cost.
// The head-of-cycle KKT residual, the constrained-QP step, and the
// spectral/dense Newton step are ALL built at the SAME cycle-start β
// (`&states`, before any step is accepted), so they need the SAME
// triple. Compute it ONCE here and reuse, instead of three independent
// O(p)-directional-derivative evaluations per cycle. The post-step
// residual below is at the accepted β, so it correctly recomputes.
// `None` when the term is condition-gated/skippable (∇Φ=0, H_Φ=0).
let head_beta_key: Array1<f64> = flatten_state_betas(&states, specs);
let head_jeffreys_term: Option<(Array1<f64>, Array2<f64>)> =
if jeffreys_skippable_this_cycle {
None
} else if let Some((_, grad_phi, hphi)) = jeffreys_triple_cache
.as_ref()
.filter(|(key, _, _)| *key == head_beta_key)
{
// Cross-cycle cache hit: the previous cycle's post-step KKT
// residual already computed the exact triple at this β. Reuse.
Some((grad_phi.clone(), hphi.clone()))
} else if let Some(z_joint) = joint_jeffreys_subspace.as_ref() {
let term = match custom_family_joint_jeffreys_term(
family, &states, specs, &ranges, z_joint,
)? {
Some((_phi, grad_phi, hphi))
if grad_phi.len() == grad_joint.len()
&& hphi.nrows() == total_p
&& hphi.ncols() == total_p =>
{
Some((grad_phi, hphi))
}
_ => None,
};
if let Some((grad_phi, hphi)) = term.as_ref() {
jeffreys_triple_cache =
Some((head_beta_key.clone(), grad_phi.clone(), hphi.clone()));
}
term
} else {
None
};
// Fold the Firth/Jeffreys score `∇Φ` into the head-of-cycle KKT
// residual when the term is armed, for the same reason as the
// post-step residual below: the inner objective is `−ℓ + ½βᵀSβ − Φ`,
// so the certifiable stationarity is `∇L − Sβ + ∇Φ = 0`. Without
// this the head-of-cycle KKT exit (`current_stationarity_residual ≤
// residual_tol`) can never fire on the near-separating span, even
// when the iterate is the Firth optimum. No-op when the Jeffreys
// term is unavailable or condition-gated to zero.
let head_kkt_gradient: Option<Array1<f64>> = head_jeffreys_term
.as_ref()
.map(|(grad_phi, _hphi)| &grad_joint + grad_phi);
let current_kkt_norm = exact_newton_joint_stationarity_inf_norm_from_gradient(
head_kkt_gradient.as_ref().unwrap_or(&grad_joint),
&states,
specs,
&s_lambdas,
ridge,
options.ridge_policy,
&block_constraints,
Some(cached_active_sets.as_slice()),
)?;
let pcg_rel_tol = joint_pcg_eisenstat_walker_forcing(prev_kkt_norm, current_kkt_norm);
let solve_joint_constraints_dense = joint_constraints.is_some()
|| !matrix_free_joint_requested
|| joint_hessian_is_dense;
// Exact trust-region subproblem factorization (gam#979). Populated on
// the unconstrained dense-spectral path with the metric-whitened
// eigendecomposition of the penalized Hessian, so the trust loop below
// re-solves the *exact* Moré–Sorensen subproblem at each trust radius
// from one factorization — replacing the dogleg/Cauchy/box-truncation
// globalization with the single object they all approximate. `None` on
// the constrained-QP and matrix-free PCG paths, which keep their
// existing globalization untouched.
let mut joint_spectrum: Option<whitened_spectrum::WhitenedHessianSpectrum> = None;
let (candidate_beta, joint_active_set, joint_step_spectral_nullity) =
if solve_joint_constraints_dense
&& let Some(constraints) = joint_constraints.as_ref()
{
let mut lhs = match materialize_joint_hessian_source(
&joint_hessian_source,
total_p,
"joint Newton inner constrained Hessian materialization",
) {
Ok(matrix) => matrix,
Err(_) => break,
};
add_joint_penalty_to_matrix(
&mut lhs,
&ranges,
&s_lambdas,
trace_diagonal_ridge,
joint_bundle,
);
if joint_solver_diagonal_ridge != trace_diagonal_ridge {
for d in 0..lhs.nrows() {
lhs[[d, d]] += joint_solver_diagonal_ridge - trace_diagonal_ridge;
}
}
check_linear_feasibility(&beta_joint, constraints, 1e-8)
.map_err(|e| format!("joint Newton constrained solve: {e}"))?;
let warm_joint_active =
flatten_joint_active_set(&cached_active_sets, &block_constraints);
let lower_bounds = match extract_simple_lower_bounds(constraints, total_p) {
Ok(bounds) => bounds,
Err(_) => break,
};
// Newton IRLS step in absolute-β space:
//
// β_new = H_pen⁻¹ (H_L β + ∇ℓ)
//
// where H_pen = H_L + S, derived from Newton's update
// β_new = β + H_pen⁻¹(∇ℓ − Sβ)
// = H_pen⁻¹(H_pen β + ∇ℓ − Sβ)
// = H_pen⁻¹(H_L β + ∇ℓ).
//
// The QP `min 0.5 β' H_pen β − rhs_beta' β` has unconstrained
// optimum β = H_pen⁻¹ rhs_beta, so rhs_beta = H_pen β + (∇ℓ − Sβ)
// gives the correct Newton update. Passing raw grad_joint (=∇ℓ)
// would collapse to β = H_pen⁻¹ ∇ℓ, which at the true optimum
// (∇ℓ = Sβ̂) gives H_pen⁻¹ Sβ̂ ≠ β̂ — wrong fixed point.
let penalty_beta_joint = apply_joint_block_penalty(
&ranges,
&s_lambdas,
&beta_joint,
joint_mode_diagonal_ridge,
joint_bundle,
);
let mut rhs_step = &grad_joint - &penalty_beta_joint;
// Reuse the head-β Jeffreys triple (consistently attenuated in
// `head_jeffreys_term` — both ∇Φ and H_Φ scaled by one scalar,
// gam#826/#872/#715). Skipped when the cheap pre-check certifies
// well-conditioning: ∇Φ = 0 and H_Φ = 0 there, so neither
// rhs_step nor lhs change.
// PSD PROJECTION (gam#979). The exact divided-difference H_Φ is
// indefinite exactly where Φ is (mixed-sign reduced spectrum at
// off-mode trial points). The unconstrained dense-spectral path
// consumes it exactly — the Moré–Sorensen subproblem handles
// indefiniteness rigorously — but THIS active-set QP requires a
// convex model (an indefinite QP cycles its active set and the
// inner grinds the budget). Use the PSD part of H_Φ here: honest
// magnitudes (unlike the old `K²` vec-Gram phantom), guaranteed
// solvable QP, and the exact ∇Φ in the rhs keeps the fixed point
// unchanged — only the convergence rate on indefinite stretches
// degrades to the damped-Newton rate the constrained path always
// had.
if let Some((grad_phi, hphi)) = head_jeffreys_term.as_ref()
&& grad_phi.len() == rhs_step.len()
{
rhs_step += grad_phi;
lhs += &symmetric_psd_projection(hphi);
}
// Self-vanishing Levenberg–Marquardt damping for the
// CONSTRAINED active-set QP, mirroring the spectral-range
// branch below (μ = JOINT_SPECTRAL_LEVENBERG_FACTOR·‖rhs‖∞).
//
// When the joint design carries inequality constraints
// (the monotone I-spline time-warp of a survival
// location-scale / AFT fit) the spectral range step that
// drops ker(H_pen) is NOT taken — this dense active-set QP
// runs instead. On a constant-scale AFT the 12-col monotone
// time-warp's non-affine deviation is statistically
// UNIDENTIFIED, so H_pen is rank-deficient along that gauge
// direction. An undamped QP then has a continuum of optima
// differing only by the free gauge component, and the
// active set slides along the monotone constraint face
// taking an O(1) proposal step in that direction every
// cycle. The proposal `step_inf` never exhausts, so the
// identified-subspace KKT certificate (gated on
// `step_inf ≤ step_tol`) never fires and the inner
// joint-Newton grinds the full `inner_max_cycles` on EVERY
// outer ρ-eval — the survival-LS AFT "hang" (#736/#735/#721).
//
// Adding μ·I to the QP Hessian gives ker(H_pen) a tiny
// positive curvature, so the constrained minimizer is unique
// and its gauge component is driven toward zero; the proposal
// step then exhausts at the identified-subspace optimum and
// the certificate fires in a handful of cycles. Because
// μ ∝ ‖∇L − Sβ‖∞ → 0 at the KKT fixed point, the converged β
// and the well-identified flexible-scale fast path (where the
// time-warp IS identified and H_pen is non-singular) are
// unchanged — a genuinely flexible survival-LS fit still
// performs its full search.
//
// CRITICAL: the floor is only correct on a genuinely
// rank-deficient `H_pen`. Gate it strictly on
// `nullity > 0`. On a FULLY IDENTIFIED constrained fit
// (e.g. the post-reduction constant-scale loglogistic AFT,
// #736/#735/#721/#733/#734 — a 3-parameter model with
// block_widths = [1,1,1] and an empty `ker(H_pen)`) the QP
// minimizer is already unique, so the floor adds nothing it
// is needed for but everything it costs: with residual r and
// factor 1e-3 the floor is μ≈1e-3·r, and on an unpenalized
// location intercept whose likelihood curvature H is small
// at n=23 the damped Newton component shrinks the residual
// only by the GEOMETRIC ratio H/(H+μ) per cycle instead of
// quadratically. With μ≈1e-6 and a small H that ratio is far
// from 1, so the threshold-block stationarity residual
// plateaus at ~1e-3–1e-4 and the inner solve burns its whole
// cycle budget without ever reaching `residual_tol`. The
// self-vanishing μ→0 is too slow because it vanishes only as
// fast as the residual it is throttling. Disabling the floor
// when `nullity == 0` makes the constrained QP solve the
// EXACT undamped Newton/KKT system, recovering quadratic
// convergence to `residual_tol` in a handful of cycles. The
// rank-deficient case (`nullity > 0`, the pre-reduction
// unidentified time-warp gauge) keeps the floor and its hang
// fix unchanged. `None` (eigensolve failed / zero Hessian)
// falls back to the damped path conservatively.
let hpen_nullity = symmetric_penalized_hessian_nullity(&lhs);
let apply_constrained_floor = hpen_nullity.map(|n| n > 0).unwrap_or(true);
let rhs_inf = rhs_step.iter().map(|v| v.abs()).fold(0.0_f64, f64::max);
let constrained_levenberg_mu = JOINT_SPECTRAL_LEVENBERG_FACTOR * rhs_inf;
if apply_constrained_floor
&& constrained_levenberg_mu > 0.0
&& constrained_levenberg_mu.is_finite()
{
for d in 0..lhs.nrows() {
lhs[[d, d]] += constrained_levenberg_mu;
}
}
let rhs_beta = &lhs.dot(&beta_joint) + &rhs_step;
let solve_result = if let Some(bounds) = lower_bounds.as_ref() {
solve_quadratic_with_simple_lower_bounds(
&lhs,
&rhs_beta,
&beta_joint,
bounds,
warm_joint_active.as_deref(),
)
} else {
solve_quadratic_with_linear_constraints(
&lhs,
&rhs_beta,
&beta_joint,
constraints,
warm_joint_active.as_deref(),
)
.map_err(|e| e.to_string())
};
match solve_result {
Ok((beta_new, active_set)) => (beta_new, Some(active_set), 0usize),
Err(_) => break,
}
} else {
// Newton right-hand side: rhs = gradient - S*beta, i.e. the negated
// stationarity residual r = S*beta - gradient of the penalized NLL.
let penalty_beta = apply_joint_block_penalty(
&ranges,
&s_lambdas,
&beta_joint,
joint_mode_diagonal_ridge,
joint_bundle,
);
let mut rhs = &grad_joint - &penalty_beta;
// Universal robustness: fold the family-general
// Jeffreys/Firth curvature `H_Φ` and score `∇Φ` into BOTH the
// matrix-free PCG step AND the dense spectral fallback below,
// scoped to the full-span basis `Z_J`. Computed ONCE here
// so the matvec closure and the RHS share the SAME term and the
// fallback does not recompute it. The inner objective is
// `−ℓ + ½βᵀSβ − Φ`, so the Newton system the step must solve is
// (H + S_λ + H_Φ) δ = (∇ℓ − S_λβ) + ∇Φ.
// Previously the PCG matvec applied only `H + S_λ` and its RHS
// omitted `∇Φ`, so on the matrix-free path (large p / large n)
// Firth was a SILENT NO-OP: the proper-prior never reached the
// step that actually moves β, leaving separation/under-
// identification uncured exactly where the dense route is not
// taken. The dense route (small p, e.g. BMS p≈51) was already
// correct. `H_Φ` is the full-span Gauss-Newton surrogate
// `½ J H_id⁻¹ Jᵀ` (Z_J = identity ⇒ p×p, not low-rank), but the
// conditioning gate in `joint_jeffreys_term` returns the zero
// term on every well-conditioned fit, so this only arms on the
// near-separating span
// — and `hphi` is materialized once per cycle regardless, so the
// matvec adds only one O(p²) HVP, preserving the matrix-free
// path's asymptotics where Firth is negligible (term = `None`).
// Cheap pre-check certified well-conditioned ⇒ the exact term
// is the zero contribution (∇Φ = 0, H_Φ = 0). Short-circuit to
// `None` WITHOUT materializing the dense joint Hessian or running
// the O(p³) reduced eigendecomposition — this is the matrix-free
// PCG hot path, where forming a dense p×p H_Φ every cycle was the
// regression. Byte-identical to the gated-off dense path: `rhs`
// is left as `∇ℓ − S_λβ` and no H_Φ is folded into the matvec.
// Reuse the head-β Jeffreys triple (computed once this cycle);
// this Newton step is built at the same cycle-start β.
let inner_jeffreys_term: Option<(Array1<f64>, Array2<f64>)> =
match head_jeffreys_term.as_ref() {
Some((grad_phi, hphi)) if grad_phi.len() == rhs.len() => {
rhs += grad_phi;
Some((grad_phi.clone(), hphi.clone()))
}
_ => None,
};
// PSD PROJECTION for the SPD-PCG matvec (gam#979): the exact
// divided-difference H_Φ can be indefinite at off-mode trial
// points, which breaks the SPD-CG contract. The matvec uses its
// PSD part; the dense spectral fallback below keeps the EXACT
// (possibly indefinite) H_Φ — the Moré–Sorensen subproblem
// handles it rigorously.
let inner_jeffreys_hphi: Option<Arc<Array2<f64>>> = inner_jeffreys_term
.as_ref()
.map(|(_grad_phi, hphi)| Arc::new(symmetric_psd_projection(hphi)));
let pcg_started = std::time::Instant::now();
let pcg_requested = matrix_free_joint_requested && !joint_hessian_is_dense;
let mut spectral_nullity_for_step = 0usize;
let mut delta = if pcg_requested {
let preconditioner_diag = match &joint_hessian_source {
JointHessianSource::Dense(h_joint) => {
joint_penalty_preconditioner_diag(
&h_joint.diag().to_owned(),
&ranges,
&s_lambdas,
joint_solver_diagonal_ridge,
joint_bundle,
)
}
JointHessianSource::Operator { diagonal, .. } => {
joint_penalty_preconditioner_diag(
diagonal,
&ranges,
&s_lambdas,
joint_solver_diagonal_ridge,
joint_bundle,
)
}
};
// Pre-allocate the penalty workspace ONCE outside the
// PCG closure so each CG iter (called hundreds-to-
// thousands of times per outer iter at large scale)
// reuses the buffer instead of allocating per call.
// RefCell because solve_spd_pcg* expects `Fn` (immutable
// borrow of captures) and we need interior mutability
// to write into the workspace.
let penalty_workspace = RefCell::new(Array1::<f64>::zeros(total_p));
// Capture the Jeffreys/Firth curvature for the matvec. When
// armed (and nonzero past the conditioning gate) the PCG
// operator becomes `H + S_λ + H_Φ`, matching the augmented
// RHS `(∇ℓ − S_λβ) + ∇Φ` set above and the dense spectral
// fallback. `None` keeps the unaugmented matvec.
let pcg_hphi_dense = inner_jeffreys_hphi.clone();
let pcg_hphi_op = inner_jeffreys_hphi.clone();
match &joint_hessian_source {
JointHessianSource::Dense(h_joint) => {
crate::linalg::utils::solve_spd_pcg_with_info_into(
|v, out| {
// h_joint * v -> out (faer-backed, no alloc)
crate::faer_ndarray::fast_av_view_into(
h_joint,
v,
out.view_mut(),
);
let mut pen = penalty_workspace.borrow_mut();
apply_joint_block_penalty_into(
&ranges,
&s_lambdas,
v,
joint_solver_diagonal_ridge,
&mut pen,
joint_bundle,
);
*out += &*pen;
if let Some(hphi) = pcg_hphi_dense.as_ref() {
*out += &hphi.dot(v);
}
},
&rhs,
&preconditioner_diag,
pcg_rel_tol,
JOINT_PCG_MAX_ITER_MULTIPLIER * total_p.max(1),
)
.map(|(solution, info)| {
log_joint_pcg_diagnostics(
cycle,
total_p,
total_joint_n,
&preconditioner_diag,
&info,
);
solution
})
}
JointHessianSource::Operator { apply_into, .. } => {
let apply_h_into = Arc::clone(apply_into);
crate::linalg::utils::solve_spd_pcg_with_info_into(
|v, out| {
if let Err(error) = apply_h_into(v, out) {
log::warn!(
"joint Newton inner operator matvec failed: {error}"
);
out.fill(0.0);
}
let mut pen = penalty_workspace.borrow_mut();
apply_joint_block_penalty_into(
&ranges,
&s_lambdas,
v,
joint_solver_diagonal_ridge,
&mut pen,
joint_bundle,
);
*out += &*pen;
if let Some(hphi) = pcg_hphi_op.as_ref() {
*out += &hphi.dot(v);
}
},
&rhs,
&preconditioner_diag,
pcg_rel_tol,
JOINT_PCG_MAX_ITER_MULTIPLIER * total_p.max(1),
)
.map(|(solution, info)| {
log_joint_pcg_diagnostics(
cycle,
total_p,
total_joint_n,
&preconditioner_diag,
&info,
);
solution
})
}
}
} else {
None
};
if pcg_requested {
log::info!(
"[PIRLS/joint-PCG] cycle {:>3} | n={} p={} solved={} elapsed={:.3}s",
cycle,
total_joint_n,
total_p,
delta.is_some(),
pcg_started.elapsed().as_secs_f64()
);
}
if delta.is_none() {
if pcg_requested {
break;
}
let mut lhs_true = match materialize_joint_hessian_source(
&joint_hessian_source,
total_p,
"joint Newton inner dense fallback Hessian materialization",
) {
Ok(matrix) => matrix,
Err(_) => break,
};
add_joint_penalty_to_matrix(
&mut lhs_true,
&ranges,
&s_lambdas,
joint_mode_diagonal_ridge,
joint_bundle,
);
// Universal robustness: add the
// family-general Jeffreys curvature `H_Phi` to the
// penalized Hessian. This is the Tier-B coupled-Newton form
// of Firth: the reduced Fisher information `Z_J^T H Z_J`
// supplies the missing O(n) curvature that bounds a
// near-separating coefficient to O(1). When the Jeffreys
// term is unavailable, the step stays unaugmented.
//
// `∇Φ` is NOT re-added here: `rhs` (and thus `spectral_rhs`)
// already carries `+∇Φ` from the single shared computation
// above, and we REUSE that same `H_Φ` here rather than
// recomputing the (O(p) directional-derivative) term — the
// dense fallback and the matrix-free PCG step now solve the
// SAME Jeffreys-augmented Newton system.
let spectral_rhs = rhs.clone();
if let Some((_grad_phi, hphi)) = inner_jeffreys_term.as_ref() {
lhs_true += hphi;
}
// Single metric-whitened eigendecomposition drives BOTH the
// seed step and every trust-region re-solve this cycle
// (gam#979). The prior code ran a SECOND O(p³)
// eigendecomposition of the raw Hessian here purely to form
// the seed step — doubling the dominant per-cycle cost on the
// ~5 s/cycle ill-conditioned survival marginal-slope inner.
// The exact trust-region multiplier λ (chosen so ‖δ‖_D = r)
// subsumes the old self-vanishing Levenberg-μ seed: `decompose`
// whitens by the trust metric so the penalty (λ~e²⁴) and the
// likelihood scales are throttled uniformly — the scale
// invariance the multiplicative μ approximated. `lhs_true`
// already carries the penalty and the Firth/Jeffreys curvature
// H_Φ and `spectral_rhs` the augmented stationarity RHS, so the
// subproblem model matches the predicted-reduction model and the
// accept/reject gain ratio exactly.
let spectrum = whitened_spectrum::WhitenedHessianSpectrum::decompose(
&lhs_true,
&spectral_rhs,
&joint_trust_metric_diag,
KKT_REFUSAL_RANK_TOL,
)?;
// Seed = the unconstrained (Moore–Penrose, range-restricted)
// exact step, so cycle 0 can take the full Newton step on a
// well-conditioned model (the cycle-0 radius bump below relies
// on this); the trust loop re-solves at finite radius for every
// subsequent attempt. An indefinite model reflects negative
// curvature to |λ|, exactly as the prior spectral solve did.
let spectral_step = spectrum.trust_region_step(f64::INFINITY);
spectral_nullity_for_step = spectral_step.nullity;
if spectral_step.reflected_negative_modes > 0 {
log::info!(
"[PIRLS/joint-Newton] cycle {cycle:>3} | indefinite inner \
Hessian: reflected {}/{} negative-curvature modes to |λ| \
(λ_min={:.3e}); proceeding with modified-Newton descent step \
under trust-region globalization",
spectral_step.reflected_negative_modes,
total_p,
spectral_step.most_negative_eigenvalue,
);
}
if spectral_step.nullity > 0 {
log::debug!(
"[PIRLS/joint-Newton] spectral reduced solve: nullity@{:.0e}={}/{} \
|P0 rhs|∞={:.3e} |P+ rhs|∞={:.3e} λ_min+={:.3e} λ_max={:.3e}",
spectral_step.rank_tol,
spectral_step.nullity,
total_p,
spectral_step.null_rhs_inf,
spectral_step.range_rhs_inf,
spectral_step.lambda_min_positive,
spectral_step.lambda_max_abs,
);
}
delta = Some(spectral_step.delta);
// The same factorization powers every trust-radius re-solve
// in the loop below (gam#979) — no second eigendecomposition.
joint_spectrum = Some(spectrum);
}
let Some(delta) = delta else {
break; // Fall back to blockwise
};
if !delta.iter().all(|v| v.is_finite()) {
break; // Fall back to blockwise
}
(beta_joint.clone() + &delta, None, spectral_nullity_for_step)
};
// Hessian-source build (and any QP solve immediately above) are
// done by the time we reach `delta`. Capture the wall-clock
// before the line-search phase so the end-of-cycle summary can
// attribute time correctly between the Hessian/QP and the
// backtracking step search.
let hessian_and_qp_elapsed = hessian_started.elapsed();
let line_search_started = std::time::Instant::now();
log::info!(
"[joint-newton-tr] phase=line_search cycle={} r={:.3e} hessian_qp_elapsed={:.3}s",
cycle,
joint_trust_radius,
hessian_and_qp_elapsed.as_secs_f64(),
);
let delta = &candidate_beta - &beta_joint;
// Trust-region globalization for the joint Newton proposal. The
// previous implementation used up to eight backtracking likelihood
// evaluations (each can build the exact joint workspace at large scale).
// Here the step is truncated before evaluation and the
// single trial objective is accepted only when the actual decrease
// is positive relative to the local quadratic model.
let step_inf = delta.iter().copied().map(f64::abs).fold(0.0_f64, f64::max);
let old_beta: Vec<Array1<f64>> = states.iter().map(|s| s.beta.clone()).collect();
// Firth value Φ at the OLD (start-of-cycle) β, folded under the SAME
// skippable gate the trial uses below — so `actual_reduction =
// old_objective − trialobjective` compares two points on one objective
// `−ℓ + ½βᵀSβ − Φ` (gam#826/#872). `lastobjective` is the pure
// quadratic-penalized objective; subtract the gated old-β Φ here.
let old_phi = if !jeffreys_skippable_this_cycle {
joint_jeffreys_subspace
.as_ref()
.map(|z_joint| {
custom_family_joint_jeffreys_value(family, &states, specs, &ranges, z_joint)
})
.unwrap_or(0.0)
} else {
0.0
};
let old_objective = lastobjective - old_phi;
// Row measure observed by the objective at β. `lastobjective` was
// set on the previous cycle (or at function entry) under `options`;
// see top-of-cycle capture for rationale.
let tr_row_measure_old_objective =
crate::solver::row_measure::RowMeasure::from_options(options, total_joint_n);
let mut accepted = false;
let mut accepted_joint_workspace: Option<Arc<dyn ExactNewtonJointHessianWorkspace>> =
None;
let mut line_search_attempts = 0usize;
// Pure Newton must take a full step on the first cycle of an
// exact quadratic problem (i.e. converge in one cycle when the
// model is exact). The trust-region globalization above must not
// truncate the very first proposal merely because the hard-coded
// initial radius (1.0) is smaller than the natural Newton-step
// 2-norm. Bumping the radius up to the post-barrier Newton-step
// norm on cycle 0 preserves quadratic convergence on
// well-conditioned problems while leaving the standard adaptive
// shrink/expand for subsequent cycles. Family feasibility
// constraints and the adaptive trust radius remain the safeguards
// against runaway proposals.
if cycle == 0 && joint_step_spectral_nullity == 0 {
let initial_block_norms = joint_trust_region_block_metric_norms(
&delta,
&ranges,
&joint_trust_metric_diag,
);
for (radius, norm) in joint_block_trust_radii.iter_mut().zip(initial_block_norms) {
if norm.is_finite() && norm > *radius {
*radius = norm;
}
}
joint_trust_radius = joint_block_trust_radii
.iter()
.copied()
.fold(0.0_f64, f64::max);
if !joint_trust_radius.is_finite() || joint_trust_radius <= 0.0 {
joint_trust_radius = 1.0;
}
}
let penalty_beta = apply_joint_block_penalty(
&ranges,
&s_lambdas,
&beta_joint,
joint_mode_diagonal_ridge,
joint_bundle,
);
// Stationarity RHS for the trust-region quadratic model. When the
// Jeffreys/Firth term is armed the inner objective is `−ℓ + ½βᵀSβ − Φ`, so
// the model RHS is `∇L − Sβ + ∇Φ` — the SAME augmented RHS the Newton
// step solves and the H_Φ-augmented `hpen_delta` below pairs with. Using
// the bare `∇L − Sβ` here desyncs `predicted_reduction` from the
// augmented step + the Φ-augmented `actual_reduction`, which is what
// froze the coupled K-block line search (gam#729/#715). No-op when the
// term is condition-gated/unavailable (∇Φ=0).
let mut rhs = &grad_joint - &penalty_beta;
if let Some((grad_phi, _hphi)) = head_jeffreys_term.as_ref()
&& grad_phi.len() == rhs.len()
{
rhs += grad_phi;
}
let beta_inf = states
.iter()
.flat_map(|s| s.beta.iter().copied())
.map(f64::abs)
.fold(0.0_f64, f64::max);
let step_tol = inner_tol * (1.0 + beta_inf);
let objective_tol = inner_tol * (1.0 + old_objective.abs());
// Scale the KKT residual tolerance against the natural magnitude
// of ‖Sβ − ∇L‖∞ (i.e. max(‖∇L‖∞, ‖Sβ‖∞)), not the objective. The
// gradient and Sβ scale independently of the likelihood — at
// large scale with |β|∞ ~ 10²–10³ and non-trivial smoothing,
// ‖Sβ‖∞ can sit orders of magnitude above |obj| and FP noise
// alone keeps the residual above any obj-scaled tol, so KKT is
// never certified even when the iterate is the true optimum.
let grad_inf = grad_joint
.iter()
.map(|x: &f64| x.abs())
.fold(0.0_f64, f64::max);
let penalty_inf = penalty_beta
.iter()
.map(|x: &f64| x.abs())
.fold(0.0_f64, f64::max);
let residual_tol = inner_tol * (1.0 + grad_inf.max(penalty_inf));
last_residual_tol = residual_tol;
let current_stationarity_residual = current_kkt_norm;
// KKT certificate: ‖∇L − Sβ‖_∞ ≤ residual_tol together with
// ‖δ‖_∞ ≤ step_tol is sufficient first-order optimality of the
// penalized objective; no descent direction exists from the
// current point. Conditioning that exit on additional evidence
// of objective progress in the previous cycle would refuse to
// recognize convergence at a starting point that already sits
// at the optimum (e.g. balanced data with an intercept-only
// fit, where ∇ℓ vanishes by symmetry from cycle 0 and the
// Newton step is identically zero so the trust-region search
// can never produce a strictly negative actual reduction).
if current_stationarity_residual <= residual_tol && step_inf <= step_tol {
log::info!(
"[PIRLS/joint-Newton convergence] cycle {:>3} | pre-line-search converged: proposal_inf={:.3e} (tol={:.3e}) | residual={:.3e} (tol={:.3e})",
cycle,
step_inf,
step_tol,
current_stationarity_residual,
residual_tol,
);
cached_joint_workspace = hessian_workspace_for_cycle;
cycles_done = cycle;
converged = true;
break;
}
// Trust-region retries preserve the objective-decrease guarantee
// when the initial radius is too optimistic. If the Newton proposal
// is not a descent direction for the penalized quadratic model,
// switch once to a diagonally preconditioned gradient step and keep
// the same exact full-objective accept/reject test.
const JOINT_TRUST_MAX_ATTEMPTS: usize = 24;
let mut search_delta = delta.clone();
let search_joint_active_set: Option<Vec<usize>> = joint_active_set.clone();
let mut tried_preconditioned_descent = false;
// Dogleg Cauchy leg (gam#826/#808). Compute the unconstrained Cauchy
// point of the penalized (Firth-augmented) quadratic model ONCE per
// cycle: the M-metric steepest-descent direction `p_sd = M⁻¹·rhs`
// and its curvature `p_sd·H·p_sd` (a coupled Hessian-vector product,
// so it must be hoisted out of the radius-shrink loop). When the
// Newton step exceeds a block's trust radius the dogleg blends
// toward this Cauchy leg, guaranteeing at least the Cauchy decrease
// even when the spectral Newton step is numerically frozen at the
// oversmoothed seed (the high-curvature log_sigma block's Newton
// component is `O(g/λ) ≈ 5e-21`). `joint_active_set` is `None` on the
// unconstrained joint Newton path; the constrained-QP path keeps its
// own globalization, so the dogleg is only built (and used) when no
// active set is in force.
let dogleg_cauchy: Option<Array1<f64>> = if search_joint_active_set.is_none() {
let mut p_sd = Array1::<f64>::zeros(total_p);
for (i, (r, w)) in rhs.iter().zip(joint_trust_metric_diag.iter()).enumerate() {
p_sd[i] = r / positive_joint_diagonal_entry(*w);
}
let mut h_psd = Array1::<f64>::zeros(total_p);
let mut cauchy_penalty_scratch = Array1::<f64>::zeros(total_p);
match apply_joint_penalized_hessian_into_with_workspace(
&joint_hessian_source,
&ranges,
&s_lambdas,
joint_mode_diagonal_ridge,
&p_sd,
&mut h_psd,
&mut cauchy_penalty_scratch,
joint_bundle,
) {
Ok(()) => {
if let Some((_grad_phi, hphi)) = head_jeffreys_term.as_ref() {
h_psd += &hphi.dot(&p_sd);
}
let cauchy = joint_cauchy_step(&rhs, &p_sd, &h_psd);
if cauchy.iter().all(|v| v.is_finite()) {
Some(cauchy)
} else {
None
}
}
Err(_) => None,
}
} else {
None
};
let mut model_rejects = 0usize;
let mut likelihood_rejects = 0usize;
let mut objective_rejects = 0usize;
let mut first_likelihood_reject: Option<String> = None;
// Coalesce consecutive trust-region attempts whose accept/reject
// outcome and numeric signature round to the same values, so a long
// run of identical retries collapses into a single "attempts a..b
// (×N)" line at flush time instead of spamming one line per try.
let mut tr_log_sig: Option<String> = None;
let mut tr_log_first: usize = 0;
let mut tr_log_last: usize = 0;
// Hoist the two full-size scratch buffers used in the predicted-
// reduction computation outside the trust-region attempt loop.
// The loop runs up to JOINT_TRUST_MAX_ATTEMPTS times per outer
// Newton step, so allocating these per-attempt would add O(total_p)
// heap traffic on every radius shrink/expand iteration.
let mut hpen_delta = Array1::<f64>::zeros(total_p);
let mut tr_penalty_scratch = Array1::<f64>::zeros(total_p);
for trust_attempt in 0..JOINT_TRUST_MAX_ATTEMPTS {
line_search_attempts = trust_attempt + 1;
accepted_joint_workspace = None;
// Dogleg globalization (gam#826/#808): when the unconstrained
// Newton path is in force and a finite Cauchy leg was built,
// construct the dogleg blend of the Cauchy and Newton points at
// the current per-block radii. Otherwise (constrained-QP path,
// or after the preconditioned-descent fallback replaced
// `search_delta`) fall back to box-truncating the search step.
let mut trial_delta;
let mut block_step_norms = if let Some(spectrum) = joint_spectrum.as_ref() {
// Exact Moré–Sorensen trust-region step at the current radius
// (gam#979). The step already lies in the `D`-metric ball, so
// no dogleg blend or box-truncation is applied: on a shrink the
// direction is RE-SOLVED (bending toward the gradient), the
// property the dogleg/truncation lacked. Re-solving reuses the
// cached factorization at O(p) cost.
trial_delta = spectrum.trust_region_step(joint_trust_radius).delta;
joint_trust_region_block_metric_norms(
&trial_delta,
&ranges,
&joint_trust_metric_diag,
)
} else if let Some(cauchy) = dogleg_cauchy.as_ref()
&& !tried_preconditioned_descent
{
trial_delta = Array1::<f64>::zeros(total_p);
joint_dogleg_step_to_block_metric_radii(
&search_delta,
cauchy,
&ranges,
&joint_trust_metric_diag,
&joint_block_trust_radii,
&mut trial_delta,
)
} else {
trial_delta = search_delta.clone();
truncate_joint_step_to_block_metric_radii(
&mut trial_delta,
&ranges,
&joint_trust_metric_diag,
&joint_block_trust_radii,
)
};
if apply_joint_feasibility_limit(family, &states, &ranges, &mut trial_delta)
.is_err()
{
joint_trust_radius = shrink_active_joint_block_trust_radii(
&mut joint_block_trust_radii,
&block_step_norms,
0.25,
);
continue;
}
block_step_norms = joint_trust_region_block_metric_norms(
&trial_delta,
&ranges,
&joint_trust_metric_diag,
);
let step_norm = block_step_norms.iter().copied().fold(0.0_f64, f64::max);
let trial_step_inf = trial_delta
.iter()
.copied()
.map(f64::abs)
.fold(0.0_f64, f64::max);
let step_hit_trust_boundary = block_step_norms
.iter()
.zip(&joint_block_trust_radii)
.any(|(step_norm, radius)| {
joint_block_step_hit_trust_boundary(*step_norm, *radius)
});
// Predicted reduction must use the TRUE penalized Hessian
// (the one that appears in `f(β) = -ℓ + ½βᵀSβ + ½·joint_mode_diagonal_ridge·‖β‖²`),
// NOT the SPD-stabilized version. The stabilizing shift
// in `joint_solver_diagonal_ridge` is purely a solver-side
// tool to make the Newton system invertible when H_NLL
// has negative eigenvalues; it is not part of the true
// objective the trial-likelihood evaluator computes.
//
// If we use `joint_solver_diagonal_ridge` here, then for
// any Newton step lying in null(H_true) (e.g. the
// marginal-block cancellation direction in the saturated
// probit regime — see
// `marginal_block_hessian_cancels_in_saturated_regime`),
// predicted = ½·rhs·δ while actual = rhs·δ, giving ρ = 2
// exactly. The trust-region loop then accepts the step
// (ρ > 0.75 expands the radius), and the same regime
// repeats every cycle — exactly the large-scale-saturated
// failure trace. Pinned by
// `ridge_stabilization_gap_produces_exact_rho_two_in_null_direction`.
//
// `hpen_delta` and `tr_penalty_scratch` are hoisted outside
// this loop; the workspace variant reuses them without
// allocating per attempt.
hpen_delta.fill(0.0);
if apply_joint_penalized_hessian_into_with_workspace(
&joint_hessian_source,
&ranges,
&s_lambdas,
joint_mode_diagonal_ridge,
&trial_delta,
&mut hpen_delta,
&mut tr_penalty_scratch,
joint_bundle,
)
.is_err()
{
break;
}
// JEFFREYS/FIRTH CURVATURE IN THE TRUST-REGION MODEL (gam#729/#715).
// When the Jeffreys term is armed, the inner objective the merit
// (`trialobjective = −ℓ + ½βᵀSβ − Φ`) measures and the Newton step
// (`(H+Sλ+H_Φ)δ = ∇L−Sβ+∇Φ`) target both include the Firth term, so
// the trust-region quadratic model's curvature MUST include `H_Φδ`
// too. Omitting it (bare `(H+Sλ)δ`) makes `predicted_reduction`
// inconsistent with the H_Φ-augmented `rhs` and the Φ-augmented
// `actual_reduction`: for a coupled K-block family near the Firth
// optimum (residual floored at ‖∇Φ‖) the resulting trust_ratio is
// wrong, the line search rejects the genuine descent step (accepts
// ~0), and β freezes with the residual stalled at a constant ≫ tol
// — the unbounded-cycle non-convergence the inner solve exhibits on
// the Dirichlet/multinomial fits. Adding `H_Φδ` makes the model
// curvature match the augmented system the step solves and the
// merit the accept test uses, so the step is accepted and the
// residual descends. No-op when the term is condition-gated (∇Φ=0,
// H_Φ=0) or unavailable.
if let Some((_grad_phi, hphi)) = head_jeffreys_term.as_ref() {
let hphi_delta = hphi.dot(&trial_delta);
hpen_delta += &hphi_delta;
}
let predicted_reduction =
joint_quadratic_predicted_reduction(&rhs, &hpen_delta, &trial_delta);
let linearized_next_kkt_inf = hpen_delta
.iter()
.zip(rhs.iter())
.map(|(hpen, rhs)| (hpen - rhs).abs())
.fold(0.0_f64, f64::max);
// Reject only non-descent directions on the quadratic model.
// A small-but-positive predicted reduction is what Newton
// *should* produce near the optimum of a large-magnitude
// objective: ½δᵀHδ scales with curvature×step², so it can be
// far below the (relative) objective_tol = inner_tol·(1+|obj|)
// while still being a correct Newton step. Trust-region ρ
// shrink/expand handles small-but-valid Newton steps; the
// preconditioned branch below is only for model-invalid
// directions, and preserves linear constraints when present.
//
// NEAR-FLOOR CARVE-OUT (gam#787 binary matern centers=12). When
// the Newton proposal is already at the step-tolerance floor —
// `step_inf ≤ 4·step_tol`, the same round-off band the cert path
// uses — the iterate is doing KKT polishing on a flat objective,
// not global descent: there `predicted_reduction = rhs·δ − ½δᵀHδ`
// is two near-equal O(step²) quantities and its SIGN is round-off
// noise (a true Newton step gives +½δᵀHδ but the damped/range-
// restricted spectral solve leaves rhs·δ a hair below ½δᵀHδ). The
// `predicted_reduction ≤ 0` branch then mistook this for a model-
// invalid direction and substituted `joint_preconditioned_descent_delta`,
// a step sized for OBJECTIVE descent (diagonal-preconditioned
// gradient, O(900×) larger than the polishing proposal). That step
// bought a round-off-level objective gain but catapulted the KKT
// residual off a near-converged iterate (‖∇L−Sβ‖ 1.7e-4 → 4.7e-1),
// which then never recovered — every later cycle re-triggered the
// same substitution (proposal stays pred≤0), pinning the residual
// far above tol until the cycle budget exhausted → seed rejected →
// hard raise. At the step floor we instead take the tiny proposal
// as-is and let the trust-region noise-floor guard accept it at
// ρ=1 (it neither helps nor hurts the objective beyond round-off),
// so the inner keeps polishing the KKT residual to tol.
let proposal_at_step_floor = joint_proposal_at_step_floor(step_inf, step_tol);
if (!predicted_reduction.is_finite() || predicted_reduction <= 0.0)
&& !proposal_at_step_floor
{
model_rejects += 1;
if !tried_preconditioned_descent {
match joint_preconditioned_descent_delta(
&joint_hessian_source,
&ranges,
&s_lambdas,
joint_solver_diagonal_ridge,
&rhs,
joint_bundle,
) {
Ok(descent_delta) => {
search_delta = descent_delta;
}
Err(_) => {
joint_trust_radius = shrink_active_joint_block_trust_radii(
&mut joint_block_trust_radii,
&block_step_norms,
0.25,
);
}
}
tried_preconditioned_descent = true;
} else {
joint_trust_radius = shrink_active_joint_block_trust_radii(
&mut joint_block_trust_radii,
&block_step_norms,
0.25,
);
}
continue;
}
for b in 0..specs.len() {
let (start, end) = ranges[b];
let mut trial_beta = old_beta[b].clone();
trial_beta += &trial_delta.slice(ndarray::s![start..end]);
let projected =
family.post_update_block_beta(&states, b, &specs[b], trial_beta.clone())?;
reject_constrained_post_update_repair(
b,
&specs[b],
&trial_beta,
&projected,
block_constraints[b].as_ref(),
)?;
states[b].beta.assign(&projected);
}
refresh_all_block_etas(family, specs, &mut states)?;
let mut trial_penalty = total_quadratic_penalty(
&states,
&s_lambdas,
ridge,
options.ridge_policy,
joint_bundle,
Some(specs),
);
// Jeffreys objective contribution at the trial point keeps the
// accept/reject objective consistent with the Jeffreys-modified
// Newton step. `states` already holds the trial coefficients
// (assigned + eta-refreshed above). No-op when the Jeffreys term
// is unavailable or condition-gated to zero. When the cheap pre-
// check certified this cycle well-conditioned, the step used H_Φ=0
// / ∇Φ=0, so the consistent accept/reject objective also uses Φ=0:
// skipping here keeps value and step on the SAME objective (the
// value/step consistency the term exists to enforce) and avoids the
// dense H/eigh at the trial point. The 8× conditioning margin makes
// a single damped Newton step incapable of crossing the gate.
// SUBTRACT Φ: the inner NLL objective is `−ℓ + ½βᵀSβ − Φ` (Firth
// adds ½log|I| to the log-likelihood). Must match the cycle-0
// baseline, the Newton step, and the KKT residual — INCLUDING the
// `jeffreys_skippable_this_cycle` gate, so that on a well-conditioned
// cycle the trial, the step (H_Φ=0/∇Φ=0), and the residual all sit
// on the SAME Φ=0 objective (gam#729/#715 sign fix; the baseline and
// post-accept folds carry the matching skippable gate).
if !jeffreys_skippable_this_cycle
&& let Some(z_joint) = joint_jeffreys_subspace.as_ref()
{
trial_penalty -= custom_family_joint_jeffreys_value(
family, &states, specs, &ranges, z_joint,
);
}
// Cheap-LL line-search path: rejected backtracking attempts
// discard the exact-Newton workspace they build, so we evaluate
// just the scalar full-data log-likelihood for the accept/reject
// decision and only build the full state once the step is
// accepted (via the gradient reload below).
//
// EARLY-EXIT THRESHOLD MUST BOUND THE NLL, NOT THE FULL OBJECTIVE
// (was a stall — gam#787/#785, duchon centers≥20). The family's
// `bernoulli_margslope_line_search_ll_with_early_exit` short-
// circuits the row sweep when the accumulated `-Σ wᵢ log CDF` (the
// NLL ALONE — no penalty, no Jeffreys Φ) exceeds the threshold; its
// monotone-lower-bound proof is valid only for the NLL term. But the
// accept test is on the FULL augmented objective
// `F = -ℓ + ½βᵀSβ − Φ_trial`, accepted iff `F ≤ old_objective + slack`,
// i.e. iff `-ℓ_trial ≤ old_objective + slack − penalty_trial`. Passing
// the full `old_objective` as the NLL threshold therefore over-rejects
// by exactly `penalty_trial`: where the trial penalty is NEGATIVE
// (the Jeffreys term subtracts Φ, and `½βᵀSβ` can be net-negative
// under the reparam) the NLL threshold sits BELOW the true accept
// bound, so the early exit kills net-descent steps the trust region
// would accept — every backtracking attempt false-rejects, the radius
// collapses, and the inner exits non-converged at cycle ~2 (seed
// rejected pre-solver → hard raise, β pinned). Subtract the trial
// penalty so the threshold is the NLL the trial must beat.
let line_search_options =
coefficient_line_search_options(options, old_objective + 1e-10 - trial_penalty);
let trial_ll =
match joint_line_search_log_likelihood(family, &line_search_options, &states) {
Ok((value, workspace)) => {
accepted_joint_workspace = workspace;
value
}
Err(e) => {
likelihood_rejects += 1;
if first_likelihood_reject.is_none() {
first_likelihood_reject = Some(e);
}
for (b, old) in old_beta.iter().enumerate() {
states[b].beta.assign(old);
}
refresh_all_block_etas(family, specs, &mut states)?;
joint_trust_radius = shrink_active_joint_block_trust_radii(
&mut joint_block_trust_radii,
&block_step_norms,
0.25,
);
continue;
}
};
let trialobjective = -trial_ll + trial_penalty;
// Row measure observed by the trial objective at β + δ. The
// line-search helper above runs under `coefficient_line_search_options`,
// which now preserves `outer_score_subsample` and disables
// any further auto-install; if either contract is broken the
// id will diverge from `tr_row_measure_top` and we Err below.
let tr_row_measure_trial =
crate::solver::row_measure::RowMeasure::from_options(options, total_joint_n);
// Hard invariant: the trust-region ratio numerator (objective
// at β minus trial at β+δ) and denominator (rhs·δ − ½δᵀH δ)
// MUST share a row measure with the Hessian/gradient build.
// Bubble out via `Err` rather than panic; this function
// already returns `Result<_, String>`.
let top_id = tr_row_measure_top.id;
if tr_row_measure_hessian.id != top_id {
return Err(format!(
"trust-region row-measure invariant violated: \
Hessian id 0x{:016x} differs from top-of-cycle id 0x{:016x} \
(cycle {}); the joint Hessian was built against a different \
row mask than the trust-region globalization captured at the \
top of the cycle. ρ would compare ½δᵀHδ on one measure to \
F(β)−F(β+δ) on another.",
tr_row_measure_hessian.id, top_id, cycle
));
}
if tr_row_measure_gradient.id != top_id {
return Err(format!(
"trust-region row-measure invariant violated: \
gradient id 0x{:016x} differs from top-of-cycle id 0x{:016x} \
(cycle {}); `cached_joint_gradient` was loaded against a \
different row mask than the trust-region globalization \
captured at the top of the cycle. rhs·δ in the predicted \
reduction would not match the rest of the ρ inputs.",
tr_row_measure_gradient.id, top_id, cycle
));
}
if tr_row_measure_old_objective.id != top_id {
return Err(format!(
"trust-region row-measure invariant violated: \
objective-at-β id 0x{:016x} differs from top-of-cycle id \
0x{:016x} (cycle {}); `lastobjective` was computed against \
a different row mask than the trust-region globalization \
captured at the top of the cycle.",
tr_row_measure_old_objective.id, top_id, cycle
));
}
if tr_row_measure_trial.id != top_id {
return Err(format!(
"trust-region row-measure invariant violated: \
trial-objective id 0x{:016x} differs from top-of-cycle id \
0x{:016x} (cycle {}, attempt {}); the line-search trial \
likelihood evaluated against a different row mask than the \
Hessian/gradient/old-objective build. Cf. \
`coefficient_line_search_options` and \
`install_auto_outer_subsample_options`.",
tr_row_measure_trial.id, top_id, cycle, trust_attempt
));
}
let actual_reduction = old_objective - trialobjective;
let trust_update = update_joint_trust_region_radius(
joint_trust_radius,
step_norm,
actual_reduction,
predicted_reduction,
old_objective,
);
let old_radius = joint_trust_radius;
// Classify the outcome of this attempt so the diagnostic line
// says *why* the step was taken or rejected rather than just
// dumping numbers. The four phases partition the post-log
// branches below; computing them up front lets the log line
// and the dispatch agree.
let floor_reached = trust_update.accepted
&& current_stationarity_residual <= residual_tol
&& joint_objective_floor_reached(
old_objective,
trialobjective,
actual_reduction,
predicted_reduction,
objective_tol,
);
let roundoff_slack = joint_objective_roundoff_slack(old_objective, trialobjective);
let secondary_ok = !floor_reached
&& trialobjective.is_finite()
&& trust_update.accepted
&& trialobjective <= old_objective + roundoff_slack;
let phase: &'static str = if floor_reached {
"converged"
} else if secondary_ok {
"accepted"
} else if trust_update.accepted {
"stall"
} else {
"reject"
};
if floor_reached || secondary_ok {
for (block_radius, block_step_norm) in joint_block_trust_radii
.iter_mut()
.zip(block_step_norms.iter())
{
let block_update = update_joint_trust_region_radius(
*block_radius,
*block_step_norm,
actual_reduction,
predicted_reduction,
old_objective,
);
if block_update.radius >= *block_radius
|| joint_block_step_hit_trust_boundary(*block_step_norm, *block_radius)
{
*block_radius = block_update.radius;
}
}
joint_trust_radius = joint_block_trust_radii
.iter()
.copied()
.fold(0.0_f64, f64::max);
} else {
joint_trust_radius = shrink_active_joint_block_trust_radii(
&mut joint_block_trust_radii,
&block_step_norms,
0.25,
);
}
let radius_held =
(joint_trust_radius - old_radius).abs() <= 1e-12 * old_radius.abs().max(1.0);
let joint_math = JointNewtonMathDiagnostic {
old_kkt_inf: current_kkt_norm,
linearized_next_kkt_inf,
predicted_reduction,
actual_reduction,
trust_ratio: trust_update.rho,
step_inf: trial_step_inf,
proposal_inf: step_inf,
};
let radius_field = if radius_held {
format!("r={:.3e} (held)", old_radius)
} else {
format!("r={:.3e}->{:.3e}", old_radius, joint_trust_radius)
};
// Surface the TR-policy decision so future failures
// distinguish "TR is throttling Newton" from "TR is not
// the bottleneck — Newton itself finds short steps".
// For the large-scale linear-convergence pattern the policy
// is consistently `hold_inside` (ρ≈1, |δ| ≪ radius),
// which proves the TR is not what is keeping the step
// small — that came up before via "(held)" alone but
// the explicit decision label makes the inference
// immediate instead of requiring step/radius arithmetic
// in the reader's head.
let tr_attempt_sig = format!(
"{:<9} ρ={:+.3e} Δobj={:+.3e} pred={:+.3e} {} decision={:<22} |δ|={:.3e} |δ|∞={:.3e} |prop|∞={:.3e}",
phase,
trust_update.rho,
actual_reduction,
predicted_reduction,
radius_field,
trust_update.decision.label(),
step_norm,
trial_step_inf,
step_inf,
);
match tr_log_sig.as_deref() {
Some(prev) if prev == tr_attempt_sig.as_str() => {
tr_log_last = line_search_attempts;
}
Some(prev) => {
if tr_log_first == tr_log_last {
log::info!(
"[PIRLS/joint-Newton/TR cycle={} attempt={}] {}",
cycle,
tr_log_first,
prev,
);
} else {
log::info!(
"[PIRLS/joint-Newton/TR cycle={} attempts={}..{} ×{}] {}",
cycle,
tr_log_first,
tr_log_last,
tr_log_last - tr_log_first + 1,
prev,
);
}
tr_log_sig = Some(tr_attempt_sig);
tr_log_first = line_search_attempts;
tr_log_last = line_search_attempts;
}
None => {
tr_log_sig = Some(tr_attempt_sig);
tr_log_first = line_search_attempts;
tr_log_last = line_search_attempts;
}
}
if floor_reached {
if let Some(sig) = tr_log_sig.take() {
if tr_log_first == tr_log_last {
log::info!(
"[PIRLS/joint-Newton/TR cycle={} attempt={}] {}",
cycle,
tr_log_first,
sig,
);
} else {
log::info!(
"[PIRLS/joint-Newton/TR cycle={} attempts={}..{} ×{}] {}",
cycle,
tr_log_first,
tr_log_last,
tr_log_last - tr_log_first + 1,
sig,
);
}
}
for (b, old) in old_beta.iter().enumerate() {
states[b].beta.assign(old);
}
refresh_all_block_etas(family, specs, &mut states)?;
last_joint_math = Some(joint_math);
accepted = true;
converged = true;
break;
}
if secondary_ok {
if let Some(sig) = tr_log_sig.take() {
if tr_log_first == tr_log_last {
log::info!(
"[PIRLS/joint-Newton/TR cycle={} attempt={}] {}",
cycle,
tr_log_first,
sig,
);
} else {
log::info!(
"[PIRLS/joint-Newton/TR cycle={} attempts={}..{} ×{}] {}",
cycle,
tr_log_first,
tr_log_last,
tr_log_last - tr_log_first + 1,
sig,
);
}
}
current_penalty = trial_penalty;
if let Some(joint_active_set) = search_joint_active_set.as_ref() {
cached_active_sets =
scatter_joint_active_set(joint_active_set, &block_constraints);
}
last_joint_math = Some(joint_math);
last_accepted_hit_joint_trust_boundary = step_hit_trust_boundary;
accepted = true;
break;
}
for (b, old) in old_beta.iter().enumerate() {
states[b].beta.assign(old);
}
refresh_all_block_etas(family, specs, &mut states)?;
objective_rejects += 1;
}
if let Some(sig) = tr_log_sig.take() {
if tr_log_first == tr_log_last {
log::info!(
"[PIRLS/joint-Newton/TR cycle={} attempt={}] {}",
cycle,
tr_log_first,
sig,
);
} else {
log::info!(
"[PIRLS/joint-Newton/TR cycle={} attempts={}..{} ×{}] {}",
cycle,
tr_log_first,
tr_log_last,
tr_log_last - tr_log_first + 1,
sig,
);
}
}
let line_search_elapsed = line_search_started.elapsed();
if accepted && converged {
log::info!(
"[PIRLS/joint-Newton/cycle-summary] cycle={} accepted=true hessian_qp={:.3}s line_search={:.3}s line_search_attempts={} reject_model={} reject_likelihood={} reject_objective={} first_likelihood_reject={} grad_reload=0.000s total={:.3}s",
cycle,
hessian_and_qp_elapsed.as_secs_f64(),
line_search_elapsed.as_secs_f64(),
line_search_attempts,
model_rejects,
likelihood_rejects,
objective_rejects,
first_likelihood_reject.as_deref().unwrap_or("none"),
cycle_started.elapsed().as_secs_f64(),
);
cached_joint_workspace = hessian_workspace_for_cycle;
cycles_done = cycle + 1;
break;
}
if !accepted {
// Retry the joint Newton loop from the same state after a
// failed trust-region search. Falling through into blockwise
// would switch a coupled exact-Hessian problem onto a
// principal-block surrogate, which is the ridge-drift failure
// mode this path is meant to avoid. The trust-region radius
// already collapsed via the attempt loop's shrink rules, so
// the next cycle's Newton proposal will be evaluated under
// a tighter L2 bound without any parallel adaptation here.
log::info!(
"[PIRLS/joint-Newton/cycle-summary] cycle={} accepted=false hessian_qp={:.3}s line_search={:.3}s line_search_attempts={} reject_model={} reject_likelihood={} reject_objective={} first_likelihood_reject={} grad_reload=0.000s total={:.3}s",
cycle,
hessian_and_qp_elapsed.as_secs_f64(),
line_search_elapsed.as_secs_f64(),
line_search_attempts,
model_rejects,
likelihood_rejects,
objective_rejects,
first_likelihood_reject.as_deref().unwrap_or("none"),
cycle_started.elapsed().as_secs_f64(),
);
// Restore original betas
for (b, old) in old_beta.iter().enumerate() {
states[b].beta.assign(old);
}
refresh_all_block_etas(family, specs, &mut states)?;
// If the previous cycle's bookkeeping certified KKT
// stationarity (residual ≤ tol and objective change ≤
// tol), the line-search failure here is round-off on a
// rank-deficient null mode rather than non-convergence:
// the proposed `H⁻¹ g` step stays O(1) along the null
// direction at the optimum, every trial moves β along
// it without changing the objective, and round-off
// flips the sign of `actual − predicted` so the
// sufficient-decrease check rejects every trial. The
// iterate ALREADY satisfies the first-order optimality
// conditions; we accept that as convergence rather
// than tripping the outer "inner solve did not converge"
// panic on a fully resolved fit.
if last_cycle_residual_below_tol && last_cycle_obj_change_below_tol {
converged = true;
break;
}
// Fully-rejected stall guard. See the constant declaration
// at the top of this function for the full rationale. The
// condition is: every trust attempt this cycle failed the
// *actual-objective* line search (model_rejects ==
// likelihood_rejects == 0, objective_rejects ==
// JOINT_TRUST_MAX_ATTEMPTS) AND the joint trust radius did
// not shrink relative to the previous fully-rejected cycle.
// Both together prove the next cycle's Newton system,
// trust radius, and trust-region search are bytewise
// identical to this cycle's — there is no descent direction
// the local quadratic model can reconcile at this β. After
// FULLY_REJECTED_STALL_MAX_CYCLES such cycles, exit
// non-converged so the outer optimizer rejects this ρ.
let all_attempts_objective_rejected = objective_rejects == JOINT_TRUST_MAX_ATTEMPTS
&& model_rejects == 0
&& likelihood_rejects == 0;
let radius_held_since_last_reject = match prev_rejected_trust_radius {
Some(prev) => {
joint_trust_radius.is_finite()
&& prev.is_finite()
&& joint_trust_radius >= prev * (1.0 - 1e-12)
}
None => false,
};
if all_attempts_objective_rejected && radius_held_since_last_reject {
consecutive_held_rejected_cycles =
consecutive_held_rejected_cycles.saturating_add(1);
} else {
consecutive_held_rejected_cycles = 0;
}
prev_rejected_trust_radius = Some(joint_trust_radius);
if consecutive_held_rejected_cycles >= FULLY_REJECTED_STALL_MAX_CYCLES {
let last_math_summary = last_joint_math
.as_ref()
.map(|math| {
format!(
"last_newton_math={{old_kkt={:.3e}, linearized_next={:.3e}, actual={:+.3e}, pred={:+.3e}, rho={:+.3e}, scalar_relerr={:.3e}, step_inf={:.3e}, proposal_inf={:.3e}}}",
math.old_kkt_inf,
math.linearized_next_kkt_inf,
math.actual_reduction,
math.predicted_reduction,
math.trust_ratio,
math.scalar_model_relative_error(),
math.step_inf,
math.proposal_inf,
)
})
.unwrap_or_else(|| "last_newton_math=<none>".to_string());
log::warn!(
"[PIRLS/joint-Newton convergence] cycle {:>3} | fully-rejected stall \
early-exit: every trust-region attempt rejected on the actual-objective \
check for {} consecutive cycles with joint trust radius held at {:.3e} \
throughout. Reverted β + held trust radius mean the next cycle's Newton \
step is byte-identical to this one's; no descent direction is reachable \
from this iterate under the current local model. {}. Returning \
unconverged with finite β so the outer optimizer rejects this ρ \
evaluation before inner_max_cycles.",
cycle,
consecutive_held_rejected_cycles,
joint_trust_radius,
last_math_summary,
);
converged = false;
break;
}
// CONTINUE rather than break (gam#826/#872/#715). The comment
// above documents the intent — "retry the joint Newton loop from
// the same state after a failed trust-region search" — but the old
// code BROKE instead, giving up after a SINGLE cycle of failed line
// search. On a severely near-separating coupled fit (matern
// binomial location-scale, quasi-separating multinomial, flexible
// linkwiggle) the cycle-0 Newton proposal is huge (the separation
// gradient ÷ the Firth-bounded curvature), the trust region clamps
// it, and the clamped step does not yet reduce the merit — so the
// FIRST cycle's backtracking exhausts without acceptance. The
// attempt loop already shrank `joint_trust_radius` /
// `joint_block_trust_radii` (carried across cycles), so the NEXT
// cycle re-proposes under the tighter radius and eventually accepts
// a productive step — standard trust-region globalization. Breaking
// at cycle 0 aborted the coupled solve ("exited the joint Newton
// path before convergence — no math snapshot") before the trust
// region could adapt. The inner cycle cap and the residual-stall /
// trust-region-floor guards above still bound the loop, so a
// genuinely stuck fit exits with a diagnosed non-convergence rather
// than spinning. Falling through to blockwise (the old `break`)
// would switch the coupled exact-Hessian problem onto a
// principal-block surrogate (the ridge-drift mode this path avoids).
continue;
}
let grad_reload_started = std::time::Instant::now();
log::info!(
"[joint-newton-tr] phase=gradient_reload cycle={} attempts={} r={:.3e}",
cycle,
line_search_attempts,
joint_trust_radius,
);
let (log_likelihood, gradient, eval, workspace) = load_joint_gradient_evaluation(
family,
specs,
options,
&states,
joint_workspace_requested,
accepted_joint_workspace.take(),
)?;
let grad_reload_elapsed = grad_reload_started.elapsed();
// Reset the fully-rejected stall guard's bookkeeping: an accepted
// cycle moved β and may have grown the trust radius, so the next
// rejected-cycle comparison must start fresh rather than carry
// forward a stale radius snapshot from the previous reject streak.
prev_rejected_trust_radius = None;
consecutive_held_rejected_cycles = 0;
// Accepted-cycle timing breakdown is debug-only. The per-cycle
// info line below already includes total cycle time; emitting a
// four-phase split on every verbose cycle adds a redundant info
// line. Rejected cycles still keep the detailed phase log since
// the reject reason and per-phase split is the diagnostic.
log::debug!(
"[PIRLS/joint-Newton/cycle-summary] cycle={} accepted=true hessian_qp={:.3}s line_search={:.3}s line_search_attempts={} grad_reload={:.3}s total={:.3}s",
cycle,
hessian_and_qp_elapsed.as_secs_f64(),
line_search_elapsed.as_secs_f64(),
line_search_attempts,
grad_reload_elapsed.as_secs_f64(),
cycle_started.elapsed().as_secs_f64(),
);
current_log_likelihood = log_likelihood;
cached_joint_gradient = gradient;
cached_eval = eval;
cached_joint_workspace = workspace;
current_penalty = total_quadratic_penalty(
&states,
&s_lambdas,
ridge,
options.ridge_policy,
joint_bundle,
Some(specs),
);
// `current_penalty` / `lastobjective` stay the pure quadratic-penalized
// objective (NO Φ folded in) — the Firth value is applied per cycle at
// each β (see `old_objective` and `trialobjective` above). The
// gated Φ at the accepted β is captured separately so the convergence
// `objective_change` compares the augmented objective at the new vs old
// β consistently (gam#826/#872).
lastobjective = -current_log_likelihood + current_penalty;
let new_phi = if !jeffreys_skippable_this_cycle {
joint_jeffreys_subspace
.as_ref()
.map(|z_joint| {
custom_family_joint_jeffreys_value(family, &states, specs, &ranges, z_joint)
})
.unwrap_or(0.0)
} else {
0.0
};
let accepted_step_inf = states
.iter()
.zip(old_beta.iter())
.flat_map(|(state, old)| {
state
.beta
.iter()
.zip(old.iter())
.map(|(new, old)| (new - old).abs())
})
.fold(0.0_f64, f64::max);
cycles_done = cycle + 1;
// Check convergence via joint stationarity. When the family-general
// Firth/Jeffreys term is armed, the penalized objective the inner
// Newton actually optimizes is `−ℓ + ½βᵀSβ − Φ`, so its KKT
// stationarity is `∇L − Sβ + ∇Φ = 0`. The Newton STEP already folds
// `∇Φ` into its RHS (`spectral_rhs += grad_phi`), but the bare
// `exact_newton_joint_stationarity_*` residual omits it — at the
// Firth fixed point `∇L − Sβ = −∇Φ`, so the certificate floors at
// `‖∇Φ‖∞` and never certifies, stalling the inner solve on exactly
// the near-separating span Firth is meant to bound (the residual the
// outer REML then rejects). Fold `∇Φ` into the gradient used for the
// KKT residual so the convergence criterion matches the augmented
// objective the step descends. No-op when the Jeffreys term is
// unavailable or condition-gated to zero.
let Some(gradient) = cached_joint_gradient.as_ref() else {
break;
};
let jeffreys_augmented_gradient: Option<Array1<f64>> = if jeffreys_skippable_this_cycle
{
// Well-conditioned ⇒ ∇Φ = 0, so the KKT residual is the bare
// stationarity (and floors at 0, not ‖∇Φ‖) — matching the step,
// which folded H_Φ=0/∇Φ=0 this cycle. Avoids the dense H/eigh.
None
} else if let Some(z_joint) = joint_jeffreys_subspace.as_ref() {
match custom_family_joint_jeffreys_term(family, &states, specs, &ranges, z_joint)? {
Some((_phi, grad_phi, hphi))
if grad_phi.len() == gradient.len()
&& hphi.nrows() == total_p
&& hphi.ncols() == total_p =>
{
let augmented = gradient + &grad_phi;
// Cache the exact triple at the just-accepted β so the next
// cycle's head reuses it instead of recomputing the
// O(p)-directional-derivative + GEMM term (gam#729).
let post_beta_key = flatten_state_betas(&states, specs);
jeffreys_triple_cache = Some((post_beta_key, grad_phi, hphi));
Some(augmented)
}
_ => None,
}
} else {
None
};
let residual_gradient = jeffreys_augmented_gradient.as_ref().unwrap_or(gradient);
let residual = exact_newton_joint_stationarity_inf_norm_from_gradient(
residual_gradient,
&states,
specs,
&s_lambdas,
ridge,
options.ridge_policy,
&block_constraints,
Some(cached_active_sets.as_slice()),
)?;
prev_kkt_norm = Some(residual);
// Record this cycle's KKT residual for the steady-geometric-descent
// test at the certificate-refusal gate below (gam#787 centers≥20).
if residual.is_finite() {
residual_descent_history.push_back(residual);
while residual_descent_history.len() > RESIDUAL_DESCENT_WINDOW {
residual_descent_history.pop_front();
}
}
// Scale-aware tolerances. The objective check was already
// relative (`inner_tol * (1 + |obj|)`), but the step and
// residual checks were absolute against the bare `inner_tol`
// — at large scale (n ≈ 320k), β iterates can keep moving
// by ~1e-5 per cycle along the monotonicity-feasible
// manifold even after the likelihood has gone flat, and the
// joint gradient ‖·‖_∞ is O(|obj|), not O(1). Running
// 50-100 cycles past objective convergence is the
// dominant inner-PIRLS cost at large scale. Switching to
// relative scaling (`inner_tol * (1 + ‖β‖_∞)` for steps,
// `inner_tol * (1 + |obj|)` for the gradient residual)
// exits PIRLS as soon as the optimum is statistically
// resolved, without loosening behavior at small n where
// ‖β‖_∞ ≈ 1 and |obj| ≈ 1 give tolerances within 2× of
// the historical absolute 1e-6.
let beta_inf = states
.iter()
.flat_map(|s| s.beta.iter().copied())
.map(f64::abs)
.fold(0.0_f64, f64::max);
let step_tol = inner_tol * (1.0 + beta_inf);
let objective_tol = inner_tol * (1.0 + lastobjective.abs());
// KKT residual tolerance must scale with the natural magnitude of
// ‖Sβ − ∇L‖∞ (i.e. max(‖∇L‖∞, ‖Sβ‖∞)), not the objective. At
// large scale with |β|∞ in the 10²–10³ range the gradient and
// penalty norms can sit orders of magnitude above |obj| and FP
// noise alone keeps the residual above any obj-scaled tol. The
// pre-line-search check at the head of the cycle already uses
// `inner_tol * (1 + max(grad_inf, pen_inf))`; using only grad_inf
// here created an asymmetry where the same convergence criterion
// would accept at one site and reject at the other, and on
// marginal-slope models where Sβ is the larger term it shrank
// the post-accept tolerance below the achievable FP floor.
let mut block_gradient_norms = Vec::with_capacity(states.len());
let mut block_penalty_norms = Vec::with_capacity(states.len());
for (block_idx, (start, end)) in ranges.iter().copied().enumerate() {
block_gradient_norms.push(
gradient
.slice(s![start..end])
.iter()
.map(|x: &f64| x.abs())
.fold(0.0_f64, f64::max),
);
let mut penalty_block = s_lambdas[block_idx].dot(&states[block_idx].beta);
if options.ridge_policy.include_quadratic_penalty && ridge > 0.0 {
penalty_block += &states[block_idx].beta.mapv(|v| ridge * v);
}
block_penalty_norms.push(
penalty_block
.iter()
.map(|x: &f64| x.abs())
.fold(0.0_f64, f64::max),
);
}
let grad_inf = block_gradient_norms.iter().copied().fold(0.0_f64, f64::max);
let pen_inf = block_penalty_norms.iter().copied().fold(0.0_f64, f64::max);
// Firth/Jeffreys score magnitude. The convergence residual is the
// AUGMENTED stationarity `∇L − Sβ + ∇Φ`, so `∇Φ` is a first-class term
// whose own numerical scale sets the achievable KKT floor: `∇Φ` is a
// trace `½ tr(H_id⁻¹ Z_Jᵀ Ḣ Z_J)` formed from a FLOORED reduced-info
// pseudo-inverse, so its components carry O(‖∇Φ‖·ε_floor) round-off
// that the augmented residual cannot polish below. Scaling the KKT
// tolerance by `max(grad, pen, ‖∇Φ‖)` (not just grad/pen) makes the
// certificate reachable for coupled K-block Firth fits whose data
// gradient is small but whose Firth score is O(1): otherwise the
// augmented residual plateaus a few × above an unattainably tight
// `inner_tol·(1+grad)` tol and the solve refuses just short of
// convergence (gam#729/#715 — the residual stalled at ~8.8e-6 against a
// ~1e-6 tol). No-op when the term is condition-gated (∇Φ=0).
let firth_score_inf = head_jeffreys_term
.as_ref()
.map(|(grad_phi, _hphi)| grad_phi.iter().map(|v| v.abs()).fold(0.0_f64, f64::max))
.unwrap_or(0.0);
let residual_tol = inner_tol * (1.0 + grad_inf.max(pen_inf).max(firth_score_inf));
let block_stationarity_tolerances = block_gradient_norms
.iter()
.zip(&block_penalty_norms)
.map(|(grad_norm, penalty_norm)| inner_tol * (1.0 + grad_norm.max(*penalty_norm)))
.collect::<Vec<_>>();
// Active-set-projected stationarity residual vector (multiplier
// mass of every pinned bound row already subtracted). Lifted out of
// the per-block norm reduction so the constrained-stationary
// certificate below can also test its component in the *range* of
// the penalized Hessian (gam#553 penalty-null-space acceptance).
let projected_residual_vec =
exact_newton_joint_projected_stationarity_vector_from_gradient(
gradient,
&states,
specs,
&s_lambdas,
ridge,
options.ridge_policy,
&block_constraints,
Some(cached_active_sets.as_slice()),
)?;
let block_stationarity_norms = {
let mut offset = 0usize;
states
.iter()
.map(|state| {
let start = offset;
let end = start + state.beta.len();
offset = end;
projected_residual_vec
.slice(ndarray::s![start..end])
.iter()
.map(|x: &f64| x.abs())
.fold(0.0_f64, f64::max)
})
.collect::<Vec<_>>()
};
let all_block_stationarity_small = block_stationarity_norms
.iter()
.zip(&block_stationarity_tolerances)
.all(|(norm, tol)| {
norm.is_finite()
&& tol.is_finite()
&& *norm <= RESIDUAL_STALL_BLOCK_GRADIENT_FACTOR * *tol
});
let near_convergence = residual <= 10.0 * residual_tol;
// Augmented-objective change: `(quad(new) − Φ_gated(new)) −
// (quad(old) − Φ_gated(old))`. `lastobjective` is quadratic-only and
// `old_objective` already carries `−old_phi`, so subtract the accepted
// β's `new_phi` here to keep both endpoints on the Φ-augmented merit
// (gam#826/#872). On a skippable cycle both phis are 0 ⇒ identical to
// the bare quadratic change.
let signed_obj_change = (lastobjective - new_phi) - old_objective;
let objective_change = signed_obj_change.abs();
// Per-cycle observability for the convergence test. Surfaces
// WHICH criterion is binding (proposed step, accepted step,
// residual, objective change) at every iteration so CI logs
// distinguish "Newton hasn't proposed a small step yet"
// (algorithm still working) from "step is small but residual
// won't drop below tol" (tolerance scaling problem). Without
// this, the only visible signal is the objective itself,
// which is insufficient to choose the right algorithmic
// remedy.
//
// gam#979 discriminator: the PER-BLOCK projected stationarity
// breakdown. The aggregate `residual` alone cannot distinguish a
// genuinely-coupled stall from one block dragging the others — for
// the survival marginal↔logslope grind the question "is the total
// residual dominated by a single block (the multiplicative
// z·exp(logslope) coupling channel), or spread evenly (global
// conditioning)?" is answerable only from the split.
// `block_stationarity_norms` (logged below as `per_block_resid`) is
// already computed above for the convergence test, so surfacing it
// per cycle is free; reading it across a 75 s repro under
// RUST_LOG=info tells whether the slowdown is a single stuck block
// (curvature/coupling channel) or an evenly slow descent
// (conditioning) — without it the four #979 candidates are not
// separable from the timeline.
let block_resid_sig = block_stationarity_norms
.iter()
.map(|n| format!("{n:.3e}"))
.collect::<Vec<_>>()
.join(",");
log::info!(
"[PIRLS/joint-Newton convergence] cycle {:>3} | step_inf={:.3e} (tol={:.3e}) | accepted_step_inf={:.3e} | residual={:.3e} (tol={:.3e}) | per_block_resid=[{}] | obj_change={:.3e} (tol={:.3e}) | beta_inf={:.3e}",
cycle,
step_inf,
step_tol,
accepted_step_inf,
residual,
residual_tol,
block_resid_sig,
objective_change,
objective_tol,
beta_inf,
);
if verbose_cycle || near_convergence {
log::info!(
"[PIRLS/JN] cyc={:>3}/{} obj={:.6e} -loglik={:.6e} pen={:.3e} Δobj={:+.3e} |δ|∞={:.3e} accepted_|δ|∞={:.3e} resid={:.3e} (tol={:.3e}) obj_tol={:.3e} step_tol={:.3e} |β|∞={:.3e} attempts={} t={:.3}s",
cycle,
inner_max_cycles,
lastobjective,
-current_log_likelihood,
current_penalty,
signed_obj_change,
step_inf,
accepted_step_inf,
residual,
residual_tol,
objective_tol,
step_tol,
beta_inf,
line_search_attempts,
cycle_started.elapsed().as_secs_f64(),
);
} else {
log::info!(
"[PIRLS/JN] cyc={:>3}/{} obj={:.6e} Δobj={:+.3e} |δ|∞={:.3e} resid={:.3e} attempts={} t={:.3}s",
cycle,
inner_max_cycles,
lastobjective,
signed_obj_change,
accepted_step_inf,
residual,
line_search_attempts,
cycle_started.elapsed().as_secs_f64(),
);
}
// Divergence guard: a non-finite KKT residual, objective, or
// log-likelihood means the inner joint Newton has diverged (NaN
// mass propagating from a near-unidentified penalized block — the
// binomial location-scale shared-basis log-σ deviation channel is
// the canonical trigger, gam#554). Every convergence and
// residual-stall exit below is gated on finite `<=` comparisons,
// which a NaN residual silently defeats; left unguarded the loop
// then grinds the full `inner_loop_hard_ceiling` on every outer
// ρ-eval and every startup seed, which is the multi-hour "hang".
// Treat it as immediate non-convergence so the outer optimizer
// rejects this point cleanly instead of burning the budget.
if !residual.is_finite()
|| !lastobjective.is_finite()
|| !current_log_likelihood.is_finite()
{
log::warn!(
"[PIRLS/joint-Newton convergence] cycle {:>3} | divergence guard: non-finite inner state (residual={:.3e}, objective={:.3e}, -loglik={:.3e}); returning unconverged so the outer optimizer rejects this ρ evaluation instead of running to inner_max_cycles.",
cycle,
residual,
lastobjective,
-current_log_likelihood,
);
converged = false;
break;
}
// KKT convergence: a small post-step residual is the
// canonical optimality certificate for the penalized
// objective. ‖∇L(β) − Sβ‖∞ ≤ residual_tol means the
// iterate is at a KKT point to numerical precision and
// further iteration cannot reduce it; the step magnitude
// is irrelevant once the residual signal has fired.
//
// Tying convergence to a small step instead would refuse
// to recognise quadratic-rate single-shot convergence:
// exact Newton on an exact quadratic produces one full
// step that lands at the optimum, so ‖delta‖∞ equals the
// initial distance ‖β* − β₀‖∞ no matter how exact the
// model is. Pairing a residual check with a step-size
// requirement structurally rejects this entirely-correct
// cycle-0 termination, leaving inner_max_cycles=1 callers
// unable to certify convergence on a problem that was
// solved exactly in one Newton step.
if joint_inner_kkt_converged(residual, residual_tol) {
converged = true;
break;
}
// Identified-subspace (range-space) KKT certificate.
//
// The strict certificate above tests the FULL stationarity residual
// ‖∇L − Sβ‖∞. On a genuinely rank-deficient penalized inner problem
// — a degenerate small-n transformation-normal CTM/Box-Cox fit whose
// joint Hessian carries an *unidentified* direction the
// canonical-gauge pass cannot attribute to a single block (the same
// structural null root-caused for the joint-Newton panic at
// `solve_joint_newton_step_on_spectral_range`) — the stationarity
// gradient keeps a fixed nonzero component inside ker(H_pen). The
// spectral Newton step drops exactly that component (range-restricted
// Moore–Penrose step: every null direction hits the `continue` branch
// in the accumulation loop), so β converges on the identified
// subspace and the step exhausts, yet the FULL residual never reaches
// `residual_tol`. The strict test then runs the whole cycle budget
// "non-converged" on an iterate that is, in fact, the optimum on the
// only identifiable directions.
//
// The principled certificate is stationarity on range(H_pen): the
// residual restricted to the curved (identified) subspace is at
// tolerance while the leftover mass is provably confined to
// ker(H_pen) — an unidentified direction with neither curvature nor
// constraint. That null component is dropped by the spectral step
// here and projected out of the KKT residual by the outer IFT
// pseudo-inverse `U_S·H_proj⁻¹·U_Sᵀ` before the envelope correction
// (see the gam#553 note and `projected_residual_range_space_inf`), so
// it cannot bias the outer gradient.
//
// The remaining requirement is to prove we are AT the
// range-restricted optimum rather than mid-descent, so this does not
// short-circuit a genuinely nonlinear CTM fit that is still moving β.
// There are two independent, equally-rigorous proofs of that, and
// EITHER suffices once `range_residual ≤ residual_tol` has fired:
// (a) the full Newton step is exhausted (`step_inf ≤ step_tol`):
// the well-identified case, where the range-restricted step
// collapses to zero and the leftover ker(H_pen) component is
// already dropped by the spectral step, so the FULL step is
// small too; OR
// (b) the objective has stopped changing
// (`objective_change ≤ objective_tol`): the joint objective
// (−loglik + ½βᵀSβ) is a function of the IDENTIFIED coordinates
// ONLY — moving β along an unidentified direction in ker(H_pen)
// = ker(H_L) ∩ ker(S) changes neither the likelihood nor the
// penalty by construction — so a flat objective proves no
// identified-direction descent remains regardless of how large
// the FULL step is.
// Proof (b) is the certificate that the constant-scale AFT (#736) and
// the degenerate CTM (#733/#734) need: their unidentified cross-block
// null (the time_transform polynomial/affine deviation aliased into
// threshold/log_sigma) keeps the Levenberg-damped, trust-region-clamped
// FULL step perpetually nonzero — `step_inf` never reaches `step_tol`
// — even though the identified fit is exactly at its optimum (zero
// range-space residual, frozen objective). Tying the certificate ONLY
// to the full step (proof (a)) therefore burned the entire 200/84-cycle
// budget on an iterate that is already optimal on every identifiable
// direction, and the inner solve was rejected by the FULL-residual KKT
// check. Adding proof (b) certifies on the identified subspace without
// loosening anything for a genuinely-identified fit: there
// `projected_residual_range_space_inf` returns `None` (nullity == 0 ⇒
// range == whole space), so this branch is dormant and the strict
// full-residual path above governs unchanged.
//
// Unlike the constrained-stationary path below, this fires on a pure
// identifiability null without requiring the `linearized_rel ≥ 0.5`
// constraint-multiplier signature, which a structural rank-deficiency
// need not produce.
if (step_inf <= step_tol || objective_change <= objective_tol)
&& let Some(range_residual) = projected_residual_range_space_inf(
&projected_residual_vec,
&joint_hessian_source,
&ranges,
&s_lambdas,
ridge,
options.ridge_policy,
total_p,
)
&& range_residual <= residual_tol
{
log::info!(
"[PIRLS/joint-Newton convergence] cycle {:>3} | identified-subspace KKT certificate: total residual={:.3e} > tol={:.3e} but its range-space (identified-subspace) component={:.3e} ≤ tol={:.3e}, step_inf={:.3e} (step_tol={:.3e}), |Δobjective|={:.3e} (obj_tol={:.3e}); the leftover residual lies in the unidentified penalized-Hessian null space ker(H_pen) (dropped by the range-restricted spectral step and projected out by the outer IFT pseudo-inverse) — the iterate is stationary on the entire identifiable subspace (proof: {}).",
cycle,
residual,
residual_tol,
range_residual,
residual_tol,
step_inf,
step_tol,
objective_change,
objective_tol,
if step_inf <= step_tol {
"full Newton step exhausted"
} else {
"objective frozen on the identified subspace while the unidentified null keeps the full step nonzero"
},
);
converged = true;
break;
}
// Noise-floor KKT certificate.
//
// Reading the joint stationarity residual ‖∇L(β) − Sβ‖_∞ at finite
// precision picks up rounding mass from the X'WX assembly and the
// per-block penalty contraction. For well-conditioned problems
// that floor sits well below `residual_tol`, so the strict path
// fires and this branch is dormant. For tightly converged inner
// states where the Newton iterate is already at the analytic
// optimum but every additional step changes the objective by less
// than `objective_tol` and the recomputed residual lands just
// above `residual_tol` due to arithmetic noise, the strict path
// alone refuses to certify convergence — even though no further
// useful descent direction exists. Burning hundreds of identical
// descent cycles past that point neither tightens the inner
// optimum (the noise floor sets a hard lower bound on ‖rhs‖) nor
// gives the outer optimizer more hyperparameter information; it
// just causes the outer wrapper to reject every seed as
// "inner did not converge" and downstream callers to mark the
// analytic outer Hessian as unavailable.
//
// Combining two independent post-step signals — objective change
// within scale-aware tolerance AND residual within the same KKT
// tolerance — supplies the missing certificate without weakening
// the envelope-theorem requirement. A residual above tolerance
// can be a free Hessian-null gradient component, not an active
// multiplier, so it must not be accepted by an objective-flatness
// rule.
//
// Distinct from the strict path because the strict path is silent
// on objective change;
// distinct from the trust-region floor certificate at the head
// of the cycle because that one fires only when the trust radius
// has collapsed to its 1e-12 floor with all attempts rejected,
// whereas this branch fires when the trust region is still open
// but each accepted step is no longer producing detectable
// objective progress.
let objective_change = signed_obj_change.abs();
if objective_change.is_finite() {
geometric_tail_history.push_back(objective_change);
while geometric_tail_history.len() > GEOMETRIC_TAIL_WINDOW {
geometric_tail_history.pop_front();
}
}
if objective_change <= objective_tol && residual <= residual_tol {
log::info!(
"[PIRLS/joint-Newton convergence] cycle {:>3} | noise-floor KKT certificate: residual={:.3e} <= tol={:.3e}, |Δobjective|={:.3e} <= obj_tol={:.3e}",
cycle,
residual,
residual_tol,
objective_change,
objective_tol,
);
converged = true;
break;
}
// Constrained-stationary certificate.
//
// The inner Newton system is `Hδ = -g`, solved over the
// active-constraint-aware subspace (the QP step path). When
// the *unprojected* gradient `g` carries a large Lagrange-
// multiplier component pointing into the constraint —
// i.e. some β coordinates are pinned at the bound or against
// the family's structural constraint surface — the linear
// solve correctly DOES NOT try to eliminate that component,
// because doing so would push β infeasibly. The signature of
// this state is precise and entirely local to the most recent
// accepted step:
//
// • `‖g + Hδ‖∞ / ‖g‖∞ ≥ 0.5` — the linear solve neutralised
// ≤ 50 % of g; the remainder is structurally outside the
// solver's range, i.e. it's a Lagrange multiplier of the
// active constraints, not a defect of the linear solve.
// • `|actual − pred| / max(|pred|, …) ≤ 1e-3` — the local
// quadratic Newton model agrees with the actual objective
// change to roundoff, so the Hessian and gradient are
// correct AT this β. The "stuck" residual is not noise
// in the linearisation; it's a real multiplier.
// • `|Δobjective| ≤ objective_tol` — the objective has
// ceased moving meaningfully.
// • `|δ|∞ ≤ step_tol` — the accepted feasible Newton step is
// exhausted. Objective flatness alone is not a terminal
// signal on large survival fits: a step of O(1e-2..1e-1)
// can still continue reducing the KKT residual after the
// objective first crosses tolerance.
//
// Together these four are the rigorous certificate that
// Newton has reached a constrained-stationary point: further
// cycles would reproduce the same plateau (the diagnostic in
// PIRLS/JN/math shows `‖g+Hδ‖/‖g‖` constant near 1 cycle
// after cycle, the very signature this certificate names).
//
// The 0.5 threshold on `linearized_rel` is conservative —
// an unconstrained Newton step has `linearized_rel ≈ 1e-12`;
// a step deliberately constrained to a (k-1)-dim subspace
// leaves the orthogonal Lagrange direction in the residual
// and `linearized_rel ≈ |λ|/|g| > 0`, typically 0.9+ in
// practice when the multiplier dominates. Anything ≥ 0.5
// is unambiguously in the constrained-stationary regime;
// unconstrained Newton with `linearized_rel ≥ 0.5` would
// have already failed the trust-region's scalar model test
// and been rejected upstream.
if let Some(math) = last_joint_math.as_ref() {
let linearized_rel = math.linearized_rel();
let scalar_model_relerr = math.scalar_model_relative_error();
let geometric_tail_bound = if geometric_tail_history.len() == GEOMETRIC_TAIL_WINDOW
{
let values = geometric_tail_history.iter().copied().collect::<Vec<_>>();
let mut max_ratio = 0.0_f64;
let mut valid = true;
for pair in values.windows(2) {
let prev = pair[0];
let next = pair[1];
if prev <= 0.0 || next < 0.0 || !prev.is_finite() || !next.is_finite() {
valid = false;
break;
}
let ratio = next / prev;
if !ratio.is_finite() || ratio >= 1.0 {
valid = false;
break;
}
max_ratio = max_ratio.max(ratio);
}
if valid {
Some(objective_change / (1.0 - max_ratio).max(1.0e-12))
} else {
None
}
} else {
None
};
let certificate_decision = constrained_stationary_certificate_decision(
math,
objective_change,
objective_tol,
step_tol,
geometric_tail_bound,
residual,
residual_tol,
);
if !matches!(
certificate_decision,
ConstrainedStationaryCertificate::NotCandidate
) {
// The `linearized_rel >= 0.5` signal is necessary but not
// sufficient. It proves either (a) g carries a Lagrange
// multiplier of an active constraint that the QP's active
// set already represents — in which case the *projected*
// residual is at tolerance — or (b) H is rank-deficient
// in the direction of g, so Hδ ≈ 0 along the null
// direction regardless of whether g is a multiplier or a
// real defect. Case (b) is the survival marginal-slope
// pathology at large scale: H σ_min ≈ 1e-12 and Newton
// genuinely cannot move g, but the residual is NOT a
// captured multiplier — it's an unresolved KKT defect in
// the H-null subspace.
//
// The active-set-projected residual computed earlier in
// this cycle already subtracts the multiplier
// mass of every row in `cached_active_sets`. If that
// residual is at tolerance, case (a) holds and the
// certificate is honest. If it's still orders of
// magnitude above tolerance, case (b) holds: certifying
// here would hand the unified evaluator a
// `kkt_residual` with norm ≈ ‖g‖ which then gets
// amplified by H⁻¹_proj in the cost/gradient IFT
// corrections, contaminating the envelope formula and
// triggering the "envelope-gradient consistency"
// tripwire downstream. Bail with `converged = false` so
// the outer optimizer rejects this ρ cleanly, exactly
// as it would on any other non-converged inner exit.
let cert_residual_factor = 1.0;
if matches!(
certificate_decision,
ConstrainedStationaryCertificate::Accept
) {
log::info!(
"[PIRLS/joint-Newton convergence] cycle {:>3} | constrained-stationary certificate: \
linear-solve neutralised {:.1}% of g (the remaining {:.1}% is a Lagrange multiplier \
of the active constraint set, not an unresolved gradient); \
scalar Newton model agrees with reality to relerr={:.3e} (Hessian+gradient are correct \
at this β); projected residual={:.3e} ≤ {:.1}×tol={:.3e} (multipliers captured by active set); \
|Δobjective|={:.3e}, geometric_tail_bound={:.3e}, obj_tol={:.3e}; further cycles cannot reduce the \
multiplier mass and would reproduce this plateau indefinitely; \
active-set multiplier mass will be projected out of the KKT residual \
before the outer IFT correction is assembled",
cycle,
(1.0 - linearized_rel) * 100.0,
linearized_rel * 100.0,
scalar_model_relerr,
residual,
cert_residual_factor,
cert_residual_factor * residual_tol,
objective_change,
geometric_tail_bound.unwrap_or(objective_change),
objective_tol,
);
converged = true;
break;
}
// Penalty-null-space acceptance (gam#553). The phantom-
// multiplier refusal fires when the active-set-projected
// residual is above tolerance, but that residual can be
// confined to `ker(H_pen)` — the polynomial null space of a
// penalized smooth (TP / Bernstein trend) that the censored
// location-scale / custom-family data does not pin down in
// the time_transform / log_sigma channel. Along that
// direction there is neither curvature nor a constraint, so
// it is a genuinely free gauge direction and the iterate is
// stationary on the entire identifiable (range) subspace.
// The downstream outer IFT trace removes exactly this
// null-space component via the projected pseudo-inverse, so
// only a *range-space* residual biases the envelope gradient
// (the precise concern of the "do NOT soft-accept" note
// below). Accept iff the range-space residual is at
// tolerance — preserving outer-gradient correctness while no
// longer aborting a well-posed fit on a data-unconstrained
// null direction.
if let Some(range_residual) = projected_residual_range_space_inf(
&projected_residual_vec,
&joint_hessian_source,
&ranges,
&s_lambdas,
ridge,
options.ridge_policy,
total_p,
) && range_residual <= cert_residual_factor * residual_tol
{
log::info!(
"[PIRLS/joint-Newton convergence] cycle {:>3} | penalty-null-space certificate (gam#553): \
total projected residual={:.3e} > tol={:.3e} but its range-space (curved-subspace) \
component={:.3e} ≤ {:.1}×tol={:.3e}; the remaining residual lies in the data-unconstrained \
penalty null space ker(H_pen) (a free polynomial-trend gauge direction, not a defect) and is \
projected out of the KKT residual by the outer IFT pseudo-inverse before the envelope \
correction; |Δobjective|={:.3e}, obj_tol={:.3e}",
cycle,
residual,
cert_residual_factor * residual_tol,
range_residual,
cert_residual_factor,
cert_residual_factor * residual_tol,
objective_change,
objective_tol,
);
converged = true;
break;
}
// Constrained exact-fixed-point acceptance (gam#797).
//
// We reach here only with the iterate ALREADY proven stationary
// (objective + step exhausted, `linearized_rel >= 0.5` so the
// residual is multiplier/null mass, `scalar_relerr <= 1e-3` so
// the quadratic model is exact), the strict/range-space/noise
// certificates having declined. For a CONSTRAINED block the
// remaining residual can be a genuine active-constraint Lagrange
// multiplier that the active-set QP under-identified (it reports
// only rows it drove tight during a non-degenerate step, so a
// monotone derivative-guard row tight at the optimum but never
// explicitly stepped is missing), leaving the cone projection
// unable to decompose `r = A_activeᵀ λ` and the residual stuck
// far above tol on an iterate that is EXACTLY the constrained
// optimum (the `active_set_incomplete` refusal; gam#797 survival
// marginal/logslope/time blocks).
//
// When (a) the joint Newton has reached a numerical FIXED POINT
// — the accepted step and objective change are both at the
// machine-epsilon floor relative to the iterate, so no further
// progress is mathematically possible — (b) the local quadratic
// model is exact (`scalar_relerr` tiny), and (c) the design
// carries linear inequality constraints AND `H_pen` has NO
// numerical null space (so the residual is an active-constraint
// multiplier, NOT an H-null/rank-deficient defect, which the
// range-space certificate above already handles), the iterate is
// a bona fide constrained KKT point. The active-constraint
// multiplier mass is projected out of the KKT residual by the
// unified evaluator's active-constraint-aware IFT correction
// before the envelope gradient, exactly as for an explicitly
// captured multiplier, so certifying here is correct. Gated
// strictly on a fixed point with no H-null, so a genuinely
// non-converged or rank-deficient iterate is never accepted.
let any_block_constrained = block_constraints.iter().any(|c| c.is_some());
let beta_scale = states
.iter()
.flat_map(|s| s.beta.iter().copied())
.map(f64::abs)
.fold(0.0_f64, f64::max)
.max(1.0);
let fixed_point_floor = 64.0 * f64::EPSILON * beta_scale;
let objective_floor = 64.0 * f64::EPSILON * (1.0 + lastobjective.abs());
let at_numerical_fixed_point = accepted_step_inf.is_finite()
&& accepted_step_inf <= fixed_point_floor
&& objective_change <= objective_floor
&& scalar_model_relerr <= 1e-3;
if any_block_constrained && at_numerical_fixed_point {
// Materialize H_pen = H + S(λ) (+ model ridge) and count its
// numerical null space at the shared rank tolerance: nullity == 0
// ⇒ the stuck residual is NOT an H-null/rank-deficient defect
// (that case is handled by the range-space certificate above) but
// a genuine active-constraint multiplier.
let hpen_nullity = materialize_joint_hessian_source(
&joint_hessian_source,
total_p,
"constrained fixed-point nullity check",
)
.ok()
.map(|mut h_pen| {
let model_diagonal_ridge =
if options.ridge_policy.include_quadratic_penalty && ridge > 0.0 {
ridge
} else {
0.0
};
add_joint_penalty_to_matrix(
&mut h_pen,
&ranges,
&s_lambdas,
model_diagonal_ridge,
None,
);
symmetrize_dense_in_place(&mut h_pen);
symmetric_penalized_hessian_nullity(&h_pen)
})
.unwrap_or(None);
if hpen_nullity == Some(0) {
log::info!(
"[PIRLS/joint-Newton convergence] cycle {:>3} | constrained fixed-point certificate: accepted_step_inf={:.3e} ≤ {:.3e} and |Δobjective|={:.3e} ≤ {:.3e} (numerical fixed point), scalar_relerr={:.3e}, linearized_rel={:.3e}; H_pen has no numerical null space so the residual={:.3e} is an active-constraint Lagrange multiplier (the QP under-identified the binding rows), projected out of the KKT residual by the active-constraint-aware IFT correction before the envelope gradient — the iterate is a constrained KKT point",
cycle,
accepted_step_inf,
fixed_point_floor,
objective_change,
objective_floor,
scalar_model_relerr,
linearized_rel,
residual,
);
converged = true;
break;
}
}
// Still-converging guard (gam#787 duchon centers≥20). The
// certificates above all declined, so the iterate would be
// refused as a multiplier/null plateau. But the
// `linearized_rel ≥ 0.5` + flat-objective signature that
// routed us here ALSO holds for a logslope block whose
// objective is already at its Φ-bounded floor while the KKT
// residual is still polishing by a STEADY geometric factor
// each cycle. Refusing there rejects the seed a few cycles
// short of `residual_tol` (→ outer seed-rejection → raise).
// If the residual is in steady geometric descent over the
// recent window, the direction is genuinely converging, not
// plateaued: keep iterating (bounded by the inner cycle cap)
// rather than refuse. The genuine plateau (flat/oscillating
// residual above tol) fails this test and refuses as before.
if residual_in_steady_geometric_descent(&residual_descent_history) {
log::info!(
"[PIRLS/joint-Newton convergence] cycle {:>3} | certificate declined but residual in steady geometric descent (history={:?}, residual={:.3e}, tol={:.3e}); continuing to convergence rather than refusing as a plateau",
cycle,
residual_descent_history,
residual,
residual_tol,
);
continue;
}
// EARLY-CYCLE CARVE-OUT (gam#826/#872). The phantom-multiplier
// refusal asserts that the residual is a captured Lagrange
// multiplier / H-null mass that Newton genuinely cannot move —
// a claim that requires EVIDENCE of a plateau. The candidate
// conditions above (objective + step exhausted, linearized_rel ≥
// 0.5) are ALSO satisfied transiently when a single Newton step
// is small because the augmented (Firth) curvature `H_Φ` is
// legitimately large in the `∇Φ` direction at an oversmoothed
// cycle-0 seed: the step `(H+Sλ+H_Φ)⁻¹(∇L−Sβ+∇Φ)` is tiny (high
// curvature ⇒ short step) and ONE step undershoots the
// nonquadratic Firth optimum, so `step_inf` and `|Δobj|` look
// exhausted while the residual is still O(‖∇Φ‖) ≫ tol. Refusing
// there at cycle 0 (no descent history yet) aborts the coupled
// binomial location-scale / flexible-linkwiggle fit before the
// inner has taken the handful of cycles it needs to walk the
// curved Firth basin to its optimum. When the residual is still
// ORDERS above tol and we lack a full descent window to prove a
// genuine plateau, keep iterating — the inner cycle cap and the
// residual-stall / trust-region-floor guards still bound the
// loop and diagnose a true non-convergence. A genuine multiplier
// plateau (residual flat across the window) is caught once the
// history fills, exactly as before. The threshold is the same
// `RESIDUAL_DESCENT_WINDOW` the descent test uses, so this only
// defers the refusal until there is enough history to make it,
// never weakens it.
let residual_far_above_tol = residual.is_finite()
&& residual_tol.is_finite()
&& residual > cert_residual_factor * residual_tol;
if residual_far_above_tol
&& residual_descent_history.len() < RESIDUAL_DESCENT_WINDOW
{
log::info!(
"[PIRLS/joint-Newton convergence] cycle {:>3} | constrained-stationary refusal DEFERRED: residual={:.3e} ≫ tol={:.3e} but only {} descent samples (< {} window) — too early to prove a multiplier/null plateau vs a high-curvature Firth-basin transient; continuing",
cycle,
residual,
residual_tol,
residual_descent_history.len(),
RESIDUAL_DESCENT_WINDOW,
);
continue;
}
// UNCONSTRAINED MODEL-STATIONARY ACCEPTANCE (gam#826/#808/#715).
//
// The phantom-multiplier refusal asserts the residual is a
// captured Lagrange multiplier of an active constraint that
// the QP could not decompose. That diagnosis is categorically
// IMPOSSIBLE when there is no active constraint at all: a
// residual cannot be a phantom multiplier of a constraint that
// does not exist. For a fully UNCONSTRAINED coupled fit
// (multinomial softmax; the location-scale flat blocks) on a
// near-flat Fisher surface (`diag(p)−ppᵀ → 0`, or the
// high-curvature/low-curvature `log_sigma` block) the
// Firth-augmented stationarity residual `‖∇L−Sβ+∇Φ‖` floors
// LEGITIMATELY above `4·residual_tol`: the absolute curvature
// is tiny so `residual_tol = inner_tol·(1+grad/pen/firth)` is
// tiny too, yet the Newton/dogleg step exhausts before the
// residual drops below that band — `residual_tol` is scaled by
// the gradient magnitude and does not see the flat-Fisher
// absolute-curvature floor. The well-conditioned spectrum keeps
// the conditioning-keyed Levenberg gate (`COND_NEWTON_SAFETY`)
// off, so neither LM nor the cond-armed dogleg engages, and
// every seed is refused as
// `phantom_multiplier_with_well_conditioned_H`.
//
// When the model itself certifies stationarity — the standard
// trust-region "predicted decrease ≈ 0" criterion, here the
// certificate-candidate conditions listed at the end of this
// comment (step at the `step_tol` floor, |Δobj| exhausted,
// scalar model exact to relerr ≤ 1e-3; NOT the stricter
// machine-eps `at_numerical_fixed_point` flag) — AND no further
// progress is being
// made (the steady-geometric-descent test above declined) AND
// we have a full descent window (the early-cycle deferral above
// passed, so this is a proven plateau not a Firth-basin
// transient), an unconstrained iterate is a bona fide
// first-order optimum: the quadratic model says no step can
// reduce the residual further, and there is no constraint whose
// multiplier the residual could otherwise represent. The
// residual that remains lives where the model is flat
// (vanishing curvature), so it carries no `gᵀ∂β/∂ρ` envelope
// contribution the outer IFT could not already neutralise
// through its penalty-projected pseudo-inverse. Accept.
//
// This does NOT regress #729 (coupled Dirichlet): that fit
// converges to a genuine `residual < residual_tol` and exits
// via the strict KKT certificate long before this branch, and
// even if reached it has a curved (non-flat) Fisher surface so
// its model is not at a fixed point with a residual stuck above
// tol. It does NOT mask a real non-convergence: a still-moving
// iterate fails the certificate-candidate conditions (its step
// is above `step_tol` or its |Δobj| is not exhausted), and a
// rank-deficient H-null defect
// is the CONSTRAINED concern the fixed-point certificate above
// already handles via its nullity check.
// The certificate-candidate conditions that routed us into
// this block already PROVE model stationarity for the
// unconstrained case: `objective_exhausted` + `step_inf ≤
// step_tol` (the model's minimizer is at this β), `scalar_relerr
// ≤ 1e-3` (the quadratic model is exact), and `linearized_rel ≥
// 0.5` (‖g+Hδ‖ ≈ ‖g‖, so `Hδ ≈ 0` — the residual lives in the
// flat/near-null subspace of H, exactly a flat-Fisher direction
// for an unconstrained fit). We do NOT additionally require the
// far stricter machine-eps `at_numerical_fixed_point` here: on a
// flat Fisher surface the dogleg keeps taking a small step at
// the `step_tol` floor every cycle, so `accepted_step_inf` floors
// a hair above `64·eps·|β|` and the eps-fixed-point flag never
// sets even though the model is stationary. The `step_tol` floor
// (`inner_tol·(1+|β|∞)`) is the principled stationarity gate; the
// eps floor is for the constrained-multiplier certificate, where
// a tighter proof is warranted because a wrong accept biases the
// constraint-aware IFT kernel.
let any_active_set_rows = cached_active_sets
.iter()
.any(|maybe| maybe.as_ref().is_some_and(|rows| !rows.is_empty()));
let unconstrained_fit = !any_block_constrained && !any_active_set_rows;
if unconstrained_fit {
log::info!(
"[PIRLS/joint-Newton convergence] cycle {:>3} | unconstrained model-stationary certificate (gam#826/#808/#715): \
no active constraint (active_set_rows_total=0) so the residual={:.3e} cannot be a phantom multiplier; \
the iterate is a numerical fixed point (accepted_step_inf={:.3e}, |Δobjective|={:.3e}, scalar_relerr={:.3e}) \
on a flat Fisher surface where residual_tol={:.3e} sits below the absolute-curvature floor; \
linearized_rel={:.3e}, |Δobjective| exhausted and residual not in steady descent → genuine first-order optimum, accepting",
cycle,
residual,
accepted_step_inf,
objective_change,
scalar_model_relerr,
residual_tol,
linearized_rel,
);
converged = true;
break;
}
// Structured per-block + per-spectrum refusal report.
// The legacy one-line refusal log printed only aggregate
// numbers (linearized_rel, scalar_relerr, residual,
// |Δobj|) and was not actionable on models with many
// blocks: it could not identify WHICH smooth carried
// the unresolved mass, nor whether H_pen was genuinely
// rank-deficient (the "polynomial null space slipped
// past absorption" pathology). Cost: one dense
// materialize + symmetric eigh on H_pen at this β,
// sub-millisecond for typical p, executed once per
// refusal (the loop breaks immediately after).
let report = compute_kkt_refusal_report(
cycle,
&states,
specs,
&s_lambdas,
&ranges,
cached_joint_gradient.as_ref(),
&cached_active_sets,
&block_constraints,
Some(&joint_hessian_source),
total_p,
ridge,
options.ridge_policy,
accepted_step_inf,
step_inf,
joint_trust_radius,
residual_tol,
objective_tol,
step_tol,
objective_change,
residual,
Some(&math),
);
log::warn!(
"{}",
report.format_structured_log(cert_residual_factor * residual_tol)
);
last_kkt_refusal_report = Some(report);
converged = false;
break;
}
}
// INVESTIGATION NOTE — do NOT soft-accept here.
//
// The outer objective is V(ρ) = f(β*(ρ), ρ), where β*(ρ)
// satisfies g(β*,ρ)=∇_β f=0. The envelope/IFT gradient used
// by the outer optimizer is
//
// dV/dρ_j = ∂f/∂ρ_j
//
// only at g=0. At a non-stationary β, the actual chain rule is
//
// d f(β(ρ),ρ)/dρ_j = ∂f/∂ρ_j + gᵀ ∂β/∂ρ_j.
//
// A soft certificate based only on small Δf discards the second
// term without proving it is small. The projected pseudo-inverse
// in the outer trace path removes null-space components of g, but
// any range-space component still contributes gᵀ∂β/∂ρ and gives
// ARC/BFGS a biased outer gradient. The `[PIRLS/JN/math]` line
// above now prints the actual Newton identity:
//
// old_kkt = ‖g‖∞,
// linearized_next = ‖g + Hδ‖∞ = ‖Hδ-rhs‖∞,
// new_kkt = ‖g(β+δ)‖∞,
// scalar_model relerr = |actual-pred|/max(1,|pred|).
//
// That is the proof surface. The diagnostic reports the measured
// linear solve residual, post-step KKT residual, scalar model
// error, and step sizes directly; downstream analysis should use
// those numbers rather than this solver attaching labels.
// Residual-stall early-exit. The strict and noise-floor
// certificates above require the KKT residual to land within
// a small multiple of residual_tol. On survival marginal-slope
// at large scale the residual oscillates in a band that is
// orders of magnitude above tol without trending down while
// the unconstrained proposal has |prop|∞ in the 10³–10⁶ range,
// the TR clamps it, and each clamped step moves β by O(1)
// without driving ‖∇L − Sβ‖∞ closer to KKT.
//
// Spending the remaining cycle budget on this pattern hits
// inner_max_cycles "non-converged", which then routes the
// outer optimizer through the first-order bridge with a stale
// same-ρ inner mode and a gradient of magnitude 10⁷ that kills
// BFGS line search at iter 0 (the failure mode pinned in the
// commit messages of 6578e884 and 1c181d1f).
//
// Track the best residual seen so far and the number of
// cycles since any meaningful improvement (≥ 10 % drop). Once
// at least RESIDUAL_STALL_MIN_CYCLES cycles have run, the
// residual has gone RESIDUAL_STALL_NO_IMPROVE_CYCLES cycles
// without such an improvement, an accepted step hit the
// trust-region boundary at least once during that stall, AND
// every block is already inside a loose stationarity band,
// return `converged = false` with the current finite β. The
// per-block gate is essential for
// block-metric trust regions: an aggregate residual plateau
// dominated by one near-singular block must not hide an
// unresolved marginal block that can still make progress under
// its own radius.
if residual.is_finite() {
if residual < RESIDUAL_STALL_IMPROVEMENT_FACTOR * best_residual_seen {
best_residual_seen = residual;
cycles_since_residual_improved = 0;
tr_clamped_during_stall = false;
} else {
cycles_since_residual_improved =
cycles_since_residual_improved.saturating_add(1);
if last_accepted_hit_joint_trust_boundary {
tr_clamped_during_stall = true;
}
}
}
if cycle + 1 >= RESIDUAL_STALL_MIN_CYCLES
&& cycles_since_residual_improved >= RESIDUAL_STALL_NO_IMPROVE_CYCLES
&& tr_clamped_during_stall
&& all_block_stationarity_small
{
let last_math_summary = last_joint_math
.as_ref()
.map(|math| {
format!(
"last_newton_math={{old_kkt={:.3e}, linearized_next={:.3e}, actual={:+.3e}, pred={:+.3e}, rho={:+.3e}, scalar_relerr={:.3e}, step_inf={:.3e}, proposal_inf={:.3e}}}",
math.old_kkt_inf,
math.linearized_next_kkt_inf,
math.actual_reduction,
math.predicted_reduction,
math.trust_ratio,
math.scalar_model_relative_error(),
math.step_inf,
math.proposal_inf,
)
})
.unwrap_or_else(|| "last_newton_math=<none>".to_string());
log::warn!(
"[PIRLS/joint-Newton convergence] cycle {:>3} | residual-stall early-exit: residual={:.3e} best_seen={:.3e} no_improve_cycles={} accepted_step_inf={:.3e} trust_radius={:.3e} block_stationarity_inf={:?} {}; returning unconverged with finite β so the outer optimizer rejects this ρ evaluation before inner_max_cycles.",
cycle,
residual,
best_residual_seen,
cycles_since_residual_improved,
accepted_step_inf,
joint_trust_radius,
block_stationarity_norms,
last_math_summary,
);
converged = false;
break;
}
// KKT convergence: small residual plus EITHER a small
// Newton step (tight quadratic-rate convergence, lets β
// polish to machine precision), confirmed stagnation
// (`accepted_step_inf <= step_tol` AND `objective_change
// <= objective_tol`, the rank-deficient null-mode case),
// OR a stricter stationarity certificate where both the
// residual and objective change are an additional factor of
// `inner_tol` below their scale-aware tolerances. The last
// branch is deliberately stricter than the public tolerance:
// it handles machine-precision null directions where β can
// still move by about `step_tol` but the KKT residual and
// objective are already over-polished. Using objective
// stagnation alone is not sufficient; the residual guard is
// what preserves first-order correctness.
let superconverged_residual_tol = inner_tol * residual_tol;
let superconverged_objective_tol = inner_tol * objective_tol;
let superconverged_stationarity = residual <= superconverged_residual_tol
&& objective_change <= superconverged_objective_tol;
if residual <= residual_tol
&& (step_inf <= step_tol
|| (accepted_step_inf <= step_tol && objective_change <= objective_tol)
|| superconverged_stationarity)
{
log::info!(
"[JN-EXIT] cycle={cycle} reason=plateau_objective_flat residual={residual:.3e} residual_tol={residual_tol:.3e} obj_change={objective_change:.3e} objective_tol={objective_tol:.3e} consecutive_flat={} accepted_step_inf={accepted_step_inf:.3e} step_tol={step_tol:.3e}",
obj_flat_streak.streak(),
);
converged = true;
break;
}
obj_flat_streak.note(objective_change <= objective_tol);
// Carry the KKT-stationarity / objective-stagnation signals
// into the next cycle so the line-search-failure path above
// can recognise a true KKT optimum on a rank-deficient null
// mode. See that path for the full rationale.
last_cycle_residual_below_tol = residual <= residual_tol;
last_cycle_obj_change_below_tol = objective_change <= objective_tol;
// NOTE: there is deliberately NO wall-clock-driven "adaptive
// early-exit" here. A convergence verdict that fires when a cycle's
// wall-clock happens to fall below a fraction of a running EMA is
// non-deterministic — under CPU contention (a parallel sweep) the
// same fit accepts at a different iterate than it does run alone,
// which cascades into a different outer seed and a different
// continuation-pre-warm fire/collapse decision (gam#979's
// "collapses sequentially, fires in parallel" instability). It also
// accepts iterates up to 10× outside the real KKT/objective
// tolerance, biasing the REML/LAML criterion the inner residual
// feeds. Convergence is certified ONLY by the mathematical tests
// above (KKT residual / Newton step / objective change at their
// scale-aware tolerances); whether convergence is *reachable within
// the cycle budget* is judged by the deterministic descent-rate
// guard alongside the residual-stall detector above.
}
// Explicit terminal verdict for the joint-Newton inner solve.
//
// The per-cycle `[PIRLS/JN] cyc=N/MAX … resid=… (tol=…)` line prints
// the KKT/step/objective gaps at every cycle but never states which
// criterion *terminated* the loop, so the final visible line on a
// budget-exhausted solve looks identical to an ordinary mid-run cycle
// (gam#744). A reader scanning a sweep log cannot tell a fit that
// reached a stationary point from one that simply ran out of cycles
// with the residual still orders of magnitude above tolerance and only
// the objective stalled. Emit one authoritative line, on every exit
// path, naming the terminating condition: `converged` is the honest
// status the result carries downstream, `budget_exhausted` distinguishes
// "ran the full cap" from an early certificate/divergence exit, and the
// residual/step/objective stall flags say *why*. A budget-exhausted,
// non-converged exit is logged at WARN so it is impossible to miss even
// when per-cycle INFO is filtered out; a clean convergence is INFO.
{
let budget_exhausted = cycles_done >= inner_max_cycles;
let terminator = if converged {
"KKT/certificate-converged"
} else if budget_exhausted {
"budget-exhausted (max cycles reached)"
} else {
"early-exit non-converged (divergence/stall guard)"
};
// `solve_wall` (whole inner-solve elapsed) + `cycles` make the
// per-solve cost explicit on ONE line: gam#979's "outer
// multiplication" candidate is read off by counting these terminal
// lines across a repro and summing their wall-times, and the
// overhead candidate by comparing `solve_wall / cycles` against the
// [joint-newton-tr] phase splits. Together with the per-cycle
// `per_block_resid` (which block stalls) and the existing TR line
// (ρ gain-ratio + decision: model infidelity vs TR throttling), a
// single RUST_LOG=info run separates all four #979 candidates.
let verdict = format!(
"[PIRLS/joint-Newton terminal] converged={} terminator={} cycles={}/{} \
solve_wall={:.3}s best_residual_inf={:.3e} (tol={:.3e}) last_residual_below_tol={} \
last_obj_change_below_tol={} objective={:.6e}; this is the status the inner \
solve reports to the outer REML/LAML evaluation — a non-converged exit \
(residual ≫ tol with only the objective stalled) is rejected, not accepted",
converged,
terminator,
cycles_done,
inner_max_cycles,
inner_started.elapsed().as_secs_f64(),
best_residual_seen,
last_residual_tol,
last_cycle_residual_below_tol,
last_cycle_obj_change_below_tol,
lastobjective,
);
if converged {
log::info!("{verdict}");
} else {
log::warn!("{verdict}");
}
}
// If joint Newton converged, skip the blockwise loop entirely.
if converged {
let penalty_value = total_quadratic_penalty(
&states,
&s_lambdas,
ridge,
options.ridge_policy,
joint_bundle,
Some(specs),
);
let (block_logdet_h, block_logdet_s) = blockwise_logdet_terms_with_workspace(
family,
specs,
&mut states,
block_log_lambdas,
options,
cached_joint_workspace.clone(),
)?;
// The IFT/outer KKT residual must be the AUGMENTED stationarity
// `∇L − Sβ + ∇Φ` the inner Newton actually drove to zero — NOT the bare
// `∇L − Sβ`. With the Firth term armed, `∇L − Sβ = −∇Φ` at the
// converged β, so the bare residual's null-space component equals ∇Φ
// (O(‖∇Φ‖), e.g. 2.49 for the coupled Dirichlet). The outer evaluator's
// range-projected IFT validity gate (`projected_into_reduced_range`)
// then sees that ‖∇Φ‖ of "unresolved mass outside the reduced range"
// and rejects EVERY seed at outer startup validation ("no candidate
// seeds passed", gam#729/#715). Folding ∇Φ into the gradient makes the
// residual the genuinely-near-zero augmented stationarity the inner
// certified, so the gate passes. No-op when the term is
// condition-gated/unavailable (∇Φ=0).
let augmented_joint_gradient: Option<Array1<f64>> = match (
cached_joint_gradient.as_ref(),
joint_jeffreys_subspace.as_ref(),
) {
(Some(gradient), Some(z_joint)) => {
match custom_family_joint_jeffreys_term(
family, &states, specs, &ranges, z_joint,
)? {
Some((_phi, grad_phi, _hphi)) if grad_phi.len() == gradient.len() => {
Some(gradient + &grad_phi)
}
_ => None,
}
}
_ => None,
};
let ift_gradient = augmented_joint_gradient
.as_ref()
.or(cached_joint_gradient.as_ref());
let kkt_residual = exact_newton_joint_kkt_residual_for_ift_from_cached_gradient(
family,
specs,
&states,
&s_lambdas,
ridge,
options.ridge_policy,
Some(cached_active_sets.as_slice()),
ift_gradient,
)?;
let kkt_residual =
require_projected_kkt_residual(kkt_residual, "joint-Newton converged exit")?;
// Thread the cert tolerance + free subspace rank through to
// the unified evaluator's certificate so the outer
// optimiser's InnerStatus carrier sees honest numbers
// instead of NaN / None.
let active_set_rows_total: usize = cached_active_sets
.iter()
.map(|maybe| maybe.as_ref().map(|v| v.len()).unwrap_or(0))
.sum();
let free_rank_at_cert = total_p.saturating_sub(active_set_rows_total);
let kkt_residual = kkt_residual.with_metadata(last_residual_tol, free_rank_at_cert);
// Build the joint active-constraint block for the unified
// evaluator's constraint-aware kernel
// `K_T = K_S − K_S Aᵀ (A K_S Aᵀ)⁻¹ A K_S`. Returns `None` when
// the family has no declared inequality constraints, or when
// no rows are currently active at the cert point; in either
// case the consumer-side `with_active_constraints` helper
// degrades back to the bare penalty-projected pseudo-inverse.
let active_constraints = {
let block_constraints = collect_block_linear_constraints(family, &states, specs)?;
assemble_active_constraint_block(
&block_constraints,
&cached_active_sets,
&ranges,
total_p,
)
.map(std::sync::Arc::new)
};
return Ok(BlockwiseInnerResult {
block_states: states,
active_sets: normalize_active_sets(cached_active_sets),
log_likelihood: current_log_likelihood,
penalty_value,
cycles: cycles_done,
converged,
block_logdet_h,
block_logdet_s,
s_lambdas,
joint_workspace: cached_joint_workspace.clone(),
kkt_residual: Some(kkt_residual),
active_constraints,
});
}
if cycles_done >= inner_max_cycles {
if !converged {
// Engine-level diagnostic. Emit measured quantities only:
// objective movement, coefficient scale, per-block dimensions,
// per-block β and gradient scales, the unprojected stationarity
// norm at exit, the Hessian source shape, and the last accepted
// Newton identity diagnostics. The outer error path has no
// access to these internals, so this line is the complete
// numerical record needed to decide the next fix.
let block_grad_norms: Vec<f64> = match cached_joint_gradient.as_ref() {
Some(joint_grad) => {
let mut acc = 0usize;
states
.iter()
.map(|s| {
let n = s.beta.len();
let end = (acc + n).min(joint_grad.len());
let nrm = if acc < end {
joint_grad
.slice(ndarray::s![acc..end])
.iter()
.map(|x: &f64| x.abs())
.fold(0.0_f64, f64::max)
} else {
f64::NAN
};
acc += n;
nrm
})
.collect()
}
None => vec![f64::NAN; states.len()],
};
let block_widths: Vec<usize> = states.iter().map(|s| s.beta.len()).collect();
let block_beta_inf: Vec<f64> = states
.iter()
.map(|s| s.beta.iter().map(|x: &f64| x.abs()).fold(0.0_f64, f64::max))
.collect();
let descent_total = initial_joint_objective - lastobjective;
let beta_inf_final = states
.iter()
.flat_map(|s| s.beta.iter().copied())
.map(f64::abs)
.fold(0.0_f64, f64::max);
let block_diag_default =
!family.exact_newton_joint_hessian_beta_dependent() && specs.len() >= 2;
let exit_unprojected_kkt_inf = cached_joint_gradient
.as_ref()
.and_then(|joint_grad| {
exact_newton_joint_stationarity_vector_from_gradient(
joint_grad,
&states,
specs,
&s_lambdas,
ridge,
options.ridge_policy,
)
.ok()
})
.map(|residual| {
residual
.iter()
.map(|x: &f64| x.abs())
.fold(0.0_f64, f64::max)
})
.unwrap_or(f64::NAN);
let last_math_summary = last_joint_math
.as_ref()
.map(|math| {
format!(
"last_newton_math={{old_kkt={:.3e}, linearized_next={:.3e}, actual={:+.3e}, pred={:+.3e}, rho={:+.3e}, scalar_relerr={:.3e}, step_inf={:.3e}, proposal_inf={:.3e}}}",
math.old_kkt_inf,
math.linearized_next_kkt_inf,
math.actual_reduction,
math.predicted_reduction,
math.trust_ratio,
math.scalar_model_relative_error(),
math.step_inf,
math.proposal_inf,
)
})
.unwrap_or_else(|| "last_newton_math=<none>".to_string());
log::warn!(
"[PIRLS/joint-Newton] cycle={} budget-exhausted without KKT: objective_start={:.6e} objective_end={:.6e} objective_drop={:+.3e} beta_inf={:.3e} exit_unprojected_kkt_inf={:.3e} total_p={} total_n={} block_widths={:?} block_beta_inf={:?} block_grad_inf={:?} block_diag_hessian_default={} {}; rejecting this outer REML/LAML evaluation",
cycles_done,
initial_joint_objective,
lastobjective,
descent_total,
beta_inf_final,
exit_unprojected_kkt_inf,
total_p,
total_joint_n,
block_widths,
block_beta_inf,
block_grad_norms,
block_diag_default,
last_math_summary,
);
if coupled_exact_joint_required {
// Budget-exhaustion error MUST carry `block_residual_inf=…`
// so the carrying block survives the bubble through the
// outer optimiser. If no in-cycle cert refusal produced
// a structured report we build one here from the cached
// joint gradient + states. `joint_hessian_source` is
// per-cycle so the H_pen spectrum fields degrade to
// NaN/empty; per-block residual data is fully present.
let block_diag = if let Some(report) = last_kkt_refusal_report.as_ref() {
report.format_bubbled_error()
} else {
let block_constraints =
collect_block_linear_constraints(family, &states, specs)?;
let report = compute_kkt_refusal_report(
cycles_done,
&states,
specs,
&s_lambdas,
&ranges,
cached_joint_gradient.as_ref(),
&cached_active_sets,
&block_constraints,
None,
total_p,
ridge,
options.ridge_policy,
f64::NAN,
f64::NAN,
f64::NAN,
last_residual_tol,
f64::NAN,
f64::NAN,
f64::NAN,
exit_unprojected_kkt_inf,
last_joint_math.as_ref(),
);
report.format_bubbled_error()
};
return Err(format!(
"coupled exact-joint inner solve exhausted the joint Newton budget without KKT convergence after {cycles_done} cycle(s) — {block_diag}"
));
}
}
let penalty_value = total_quadratic_penalty(
&states,
&s_lambdas,
ridge,
options.ridge_policy,
joint_bundle,
Some(specs),
);
let (block_logdet_h, block_logdet_s) = blockwise_logdet_terms_with_workspace(
family,
specs,
&mut states,
block_log_lambdas,
options,
cached_joint_workspace.clone(),
)?;
let active_constraints = {
let local_ranges = block_param_ranges(specs);
let local_total_p = local_ranges.last().map(|(_, end)| *end).unwrap_or(0);
let block_constraints = collect_block_linear_constraints(family, &states, specs)?;
assemble_active_constraint_block(
&block_constraints,
&cached_active_sets,
&local_ranges,
local_total_p,
)
.map(std::sync::Arc::new)
};
return Ok(BlockwiseInnerResult {
block_states: states,
active_sets: normalize_active_sets(cached_active_sets),
log_likelihood: current_log_likelihood,
penalty_value,
cycles: cycles_done,
converged,
block_logdet_h,
block_logdet_s,
s_lambdas,
joint_workspace: cached_joint_workspace.clone(),
kkt_residual: None,
active_constraints,
});
}
if coupled_exact_joint_required {
// Bubble the structured KKT refusal report (per-block residual
// breakdown + H_pen spectrum + diagnosis) so the cause of the
// refusal survives serialization through the outer optimizer,
// the seed-validation cascade, and gamfit. When the cert refused
// inside the cycle loop we already computed a `KktRefusalReport`
// at the refusing iterate; reuse it verbatim. If a different
// early-exit path reaches this branch without having produced a
// report, none is rebuilt here: the error carries a fixed
// "structured KKT refusal report unavailable" placeholder instead.
let block_diag = last_kkt_refusal_report
.as_ref()
.map(KktRefusalReport::format_bubbled_error)
.unwrap_or_else(|| {
"structured KKT refusal report unavailable: no joint Newton math snapshot"
.to_string()
});
return Err(format!(
"coupled exact-joint inner solve exited the joint Newton path before convergence — {block_diag}"
));
}
// Otherwise fall through to blockwise iteration below.
}
let mut cached_eval = match cached_eval {
Some(eval) => eval,
None => family.evaluate(&states)?,
};
lastobjective = -cached_eval.log_likelihood + current_penalty;
// Divergence-detection state for the blockwise loop.
//
// Some family parameterizations (e.g. BernoulliMarginalSlopeFamily with
// linkwiggle + scorewarp) carry a near-null direction in the joint
// Hessian when the link-deviation basis's empirical anchor — fixed at
// the rigid-pilot η₀ when the basis is constructed — drifts during
// PIRLS as the location/spatial blocks update η₀. The Newton step
// becomes dominated by that null direction and is clamped at
// MAX_NEWTON_STEP every cycle while β grows linearly along it; the
// log-likelihood stays frozen, only the penalty changes (slowly).
// Without an early-exit the loop runs to inner_max_cycles producing
// the same -loglik over and over, which at large scale (each cycle
// ~0.5s) burns ~50s per ρ-cost call and stacks up to a 2400s timeout.
//
// Detect the pattern and bail with `converged = false` so the cost
// call returns Err / +∞, BFGS κ-optim backs off the divergent ρ
// region, and the outer loop progresses instead of grinding.
// Per-block trust-region radius in the block's penalized-Hessian metric.
// Updated each cycle by `update_joint_trust_region_radius` (the same
// function the joint-Newton path uses) on a real model-vs-truth rho
// computed from each block's penalized quadratic. Using the curvature
// metric here avoids the same starvation mechanism fixed in the joint
// path: one near-null coordinate in a block must not raw-rescale every
// other coordinate in that block. The η-overflow safety half of the
// previous static `MAX_NEWTON_STEP = 20.0` is owned by the family's
// `max_feasible_step_size` barrier check, called by the line search below;
// this variable handles only the algorithmic trust-region half. The
// initial seed value is the family-declared safe step for a fresh fit; the
// function then adapts it freely (clamped to [1e-12, 1e6] by the function
// itself, same as the joint path).
const BLOCK_NEWTON_STEP_INITIAL: f64 = 20.0;
let mut block_max_step: Vec<f64> = vec![BLOCK_NEWTON_STEP_INITIAL; specs.len()];
let mut prev_log_likelihood_for_divergence_check = cached_eval.log_likelihood;
// Frozen-loglik streak rides the shared window discipline
// (loop_guard::FlatStreak, #968); the frozen-loglik predicate and the
// clamped-step side condition below stay local — they are policy about
// what counts as flat, which this loop rightly owns.
let mut frozen_loglik_streak =
crate::solver::loop_guard::FlatStreak::new(DIVERGENCE_FROZEN_LOGLIK_CYCLES);
// Coordinate descent visits each block in turn, so `max_proposed_step`
// (the per-cycle max across blocks) only fires the cap on cycles where
// the divergent block is the active one. On a near-null direction this
// produces an alternation pattern (e.g. cap, cap, small, cap, small,
// cap, …) and a strict "consecutive cycles where step is clamped"
// requirement resets the counter every time another block's smaller
// step dominates the per-cycle maximum. The frozen-loglik signal,
// however, is a property of the joint state — it stays true across
// every cycle of the alternation. Track frozen-loglik consecutively
// and require that `step_clamped` was observed AT LEAST ONCE inside
// the frozen run (rather than EVERY cycle).
let mut clamped_step_in_frozen_run: bool = false;
const DIVERGENCE_FROZEN_LOGLIK_CYCLES: usize = 8;
let is_dynamic = family.block_geometry_is_dynamic();
// EMA of per-cycle wall-clock for timing-driven adaptive early-exit (#289).
// α = 0.3 gives a short memory (~3 cycles) so the EMA tracks recent cost.
let mut ema_cycle_secs: Option<f64> = None;
// Initial objective for the grad-ratio predicate.
let initial_objective = lastobjective;
for cycle in 0..inner_max_cycles {
let cycle_start = std::time::Instant::now();
// Fires at the top of each blockwise coordinate cycle so we can count
// iterations from CI logs when a benchmark hangs inside the first
// outer-eval. Emitted at info-level: same rationale as the joint-Newton
// sibling above — silent-grind diagnosis without debug logs.
log::info!(
"[PIRLS/blockwise coord] cycle {:>3}/{} | -loglik {:.6e} | penalty {:.6e} | objective {:.6e}",
cycle,
inner_max_cycles,
-cached_eval.log_likelihood,
current_penalty,
lastobjective,
);
let mut max_proposed_beta_step = 0.0_f64;
let mut max_accepted_beta_step = 0.0_f64;
let mut trust_boundary_hit_in_cycle = false;
let mut objective_cycle_prev = lastobjective;
// Reuse cached evaluation from end of previous cycle (or initial eval).
// For dynamic families, the end-of-cycle evaluation is also reused here
// instead of re-evaluating redundantly — the state hasn't changed since
// the last cycle's final evaluate.
let mut cycle_eval = std::mem::replace(
&mut cached_eval,
FamilyEvaluation {
log_likelihood: 0.0,
blockworking_sets: Vec::new(),
},
);
if cycle_eval.blockworking_sets.len() != specs.len() {
return Err(format!(
"family returned {} block working sets, expected {}",
cycle_eval.blockworking_sets.len(),
specs.len()
));
}
// Track whether any block was modified this cycle (for dynamic families,
// we only need to re-evaluate before block b if a previous block changed).
let mut any_block_modified = false;
for b in 0..specs.len() {
if is_dynamic && any_block_modified {
// Only re-evaluate if a previous block in this cycle actually
// modified coefficients. Skips the redundant evaluate for the
// first block (b=0), since `cycle_eval` (moved out of
// `cached_eval` at the top of this cycle) is still valid.
refresh_all_block_etas(family, specs, &mut states)?;
cycle_eval = family.evaluate(&states)?;
if cycle_eval.blockworking_sets.len() != specs.len() {
return Err(format!(
"family returned {} block working sets, expected {}",
cycle_eval.blockworking_sets.len(),
specs.len()
));
}
}
let spec = &specs[b];
let work = &cycle_eval.blockworking_sets[b];
let linear_constraints = family.block_linear_constraints(&states, b, spec)?;
let s_lambda = &s_lambdas[b];
let updater = work.updater();
let update = updater.compute_update_step(&BlockUpdateContext {
family,
states: &states,
spec,
block_idx: b,
s_lambda,
options,
linear_constraints: linear_constraints.as_ref(),
cached_active_set: cached_active_sets[b].as_deref(),
})?;
if let Some(active_set) = update.active_set {
cached_active_sets[b] = Some(active_set);
}
let beta_new_raw = update.beta_new_raw;
let beta_new = family.post_update_block_beta(&states, b, spec, beta_new_raw.clone())?;
reject_constrained_post_update_repair(
b,
spec,
&beta_new_raw,
&beta_new,
linear_constraints.as_ref(),
)?;
let beta_old = states[b].beta.clone();
let raw_delta = &beta_new - &beta_old;
// Per-block trust-region radius in the block's local
// penalized-Hessian metric. The cap is the current value of
// `block_max_step[b]`, updated below via
// `update_joint_trust_region_radius` once we know rho.
let block_cap = block_max_step[b];
let (delta, step_metric_norm) = truncate_block_step_to_metric_radius(
spec,
work,
s_lambda,
raw_delta,
block_cap,
ridge,
options.ridge_policy,
)?;
let step_hit_trust_boundary =
joint_block_step_hit_trust_boundary(step_metric_norm, block_cap);
trust_boundary_hit_in_cycle |= step_hit_trust_boundary;
// Capture the objective at the start of this block update so
// we can compute the true `actual_reduction` once the line
// search has finished. `objective_cycle_prev` is the running
// total: it advances inside the line search whenever a trial
// is accepted, so we must snapshot it here.
let obj_before_block = objective_cycle_prev;
let old_block_penalty =
block_quadratic_penalty(&beta_old, s_lambda, ridge, options.ridge_policy);
let step_beta_inf = delta.iter().copied().map(f64::abs).fold(0.0, f64::max);
max_proposed_beta_step = max_proposed_beta_step.max(step_beta_inf);
if step_beta_inf <= inner_tol {
continue;
}
// Damped update: require non-increasing penalized objective under dynamic geometry.
// Precompute X * delta once so line-search eta updates are O(n) not O(np).
// Reuse pre-allocated eta backup to avoid O(n) allocation per block per cycle.
let eta_checkpoint = BlockEtaCheckpoint::capture_reuse(&states[b], &mut eta_backups[b]);
let x_delta = if !is_dynamic {
Some(spec.solver_design().matrixvectormultiply(&delta))
} else {
None
};
let mut accepted = false;
// Barrier-aware step ceiling: families with natural log-barrier
// terms (e.g. log(h') in transformation-normal) report the maximum
// feasible step fraction so the line search never evaluates the
// likelihood outside its domain.
let barrier_ceiling = family
.max_feasible_step_size(&states, b, &delta)?
.unwrap_or(1.0);
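// Scratch buffer for the un-repaired trial β, reused across
// backtracking trials (it is still cloned once per trial when it
// is handed to `post_update_block_beta` below).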
let mut trial_beta_buf = beta_old.clone();
let mut accepted_bt: usize = usize::MAX;
for bt in 0..8 {
let alpha = (0.5f64.powi(bt)).min(barrier_ceiling);
trial_beta_buf.assign(&beta_old);
trial_beta_buf.scaled_add(alpha, &delta);
let trial_beta =
family.post_update_block_beta(&states, b, spec, trial_beta_buf.clone())?;
reject_constrained_post_update_repair(
b,
spec,
&trial_beta_buf,
&trial_beta,
linear_constraints.as_ref(),
)?;
states[b].beta = trial_beta;
// Use precomputed X*delta when geometry is static and beta wasn't modified.
if let Some(ref xd) = x_delta {
if states[b].beta == trial_beta_buf {
eta_checkpoint.restore_eta_with_step(&mut states[b], alpha, xd);
} else {
refresh_single_block_eta(family, specs, &mut states, b)?;
}
} else {
refresh_single_block_eta(family, specs, &mut states, b)?;
}
let trial_block_penalty =
block_quadratic_penalty(&states[b].beta, s_lambda, ridge, options.ridge_policy);
let trial_penalty = current_penalty - old_block_penalty + trial_block_penalty;
let line_search_options = coefficient_line_search_options(
options,
objective_cycle_prev - trial_penalty + 1e-10,
);
let trial_ll =
match family.log_likelihood_only_with_options(&states, &line_search_options) {
Ok(value) => value,
Err(_) => {
states[b].beta.assign(&beta_old);
eta_checkpoint.restore_eta(&mut states[b]);
continue;
}
};
let trialobjective = -trial_ll + trial_penalty;
if trialobjective.is_finite() && trialobjective <= objective_cycle_prev + 1e-10 {
objective_cycle_prev = trialobjective;
current_penalty = trial_penalty;
accepted = true;
accepted_bt = bt as usize;
break;
}
}
// Trust-region update for this block, using the same
// `update_joint_trust_region_radius` strategy the
// joint-Newton path uses. Predicted reduction is computed
// from the per-block penalized quadratic model:
//
// Q(β + αδ) ≈ Q(β) − α·rhs·δ + 0.5·α²·δ·H_pen·δ
// predicted_reduction(α) = α·(rhs·δ) − 0.5·α²·(δ·H_pen·δ)
//
// where `rhs = score − S·β (− ridge·β)` is the penalized
// gradient (in maximize-direction) and `H_pen = H + S
// (+ ridge·I)` is the penalized observed information.
// Actual reduction is the true penalized objective change
// measured by the line search; rho = actual / predicted is
// the standard model-vs-truth ratio that drives the same
// 0.25 / 0.75 grow-shrink rules `update_joint_trust_region_radius`
// already implements for the joint path.
let alpha_accepted = if accepted {
0.5_f64.powi(accepted_bt as i32)
} else {
0.0
};
let (rhs_block, hpen_delta_full): (Array1<f64>, Array1<f64>) = match work {
BlockWorkingSet::ExactNewton { gradient, .. } => {
let mut rhs = gradient - &s_lambda.dot(&beta_old);
if options.ridge_policy.include_quadratic_penalty && ridge > 0.0 {
rhs.scaled_add(-ridge, &beta_old);
}
let hpen = block_penalized_hessian_vector(
spec,
work,
s_lambda,
&delta,
ridge,
options.ridge_policy,
);
(rhs, hpen)
}
BlockWorkingSet::Diagonal {
working_response,
working_weights,
} => {
// IRLS local-quadratic gradient and Hessian:
// rhs = X^T W (z − Xβ) − Sβ
// H_pen δ = X^T W X δ + Sδ
let solver_design = spec.solver_design();
let xb = solver_design.matrixvectormultiply(&beta_old);
let resid = working_response - &xb;
let w_resid = &resid * working_weights;
let mut rhs = solver_design.transpose_vector_multiply(&w_resid);
rhs -= &s_lambda.dot(&beta_old);
if options.ridge_policy.include_quadratic_penalty && ridge > 0.0 {
rhs.scaled_add(-ridge, &beta_old);
}
let hpen = block_penalized_hessian_vector(
spec,
work,
s_lambda,
&delta,
ridge,
options.ridge_policy,
);
(rhs, hpen)
}
};
let rhs_dot_delta = rhs_block.dot(&delta);
let delta_dot_hpen = delta.dot(&hpen_delta_full);
let predicted_reduction = alpha_accepted * rhs_dot_delta
- 0.5 * alpha_accepted * alpha_accepted * delta_dot_hpen;
let actual_reduction = obj_before_block - objective_cycle_prev;
let trust_update = update_joint_trust_region_radius(
block_max_step[b],
alpha_accepted * step_metric_norm,
actual_reduction,
predicted_reduction,
obj_before_block,
);
block_max_step[b] = trust_update.radius;
if !accepted {
states[b].beta.assign(&beta_old);
eta_checkpoint.restore_eta(&mut states[b]);
if let BlockWorkingSet::ExactNewton { gradient, .. } = work {
let mut raw_descent = gradient - &s_lambda.dot(&beta_old);
if options.ridge_policy.include_quadratic_penalty && ridge > 0.0 {
raw_descent -= &beta_old.mapv(|v| ridge * v);
}
let (descent_dir, descent_metric_norm) = truncate_block_step_to_metric_radius(
spec,
work,
s_lambda,
raw_descent,
block_cap,
ridge,
options.ridge_policy,
)?;
trust_boundary_hit_in_cycle |=
joint_block_step_hit_trust_boundary(descent_metric_norm, block_cap);
let dir_norm = descent_dir.iter().fold(0.0_f64, |m, &v| m.max(v.abs()));
if dir_norm > inner_tol {
// Precompute X * descent_dir once for incremental eta updates.
let x_descent = if !is_dynamic {
Some(spec.solver_design().matrixvectormultiply(&descent_dir))
} else {
None
};
let descent_barrier_ceiling = family
.max_feasible_step_size(&states, b, &descent_dir)?
.unwrap_or(1.0);
for bt in 0..12 {
let alpha = (0.5f64.powi(bt)).min(descent_barrier_ceiling);
trial_beta_buf.assign(&beta_old);
trial_beta_buf.scaled_add(alpha, &descent_dir);
let trial_beta = family.post_update_block_beta(
&states,
b,
spec,
trial_beta_buf.clone(),
)?;
reject_constrained_post_update_repair(
b,
spec,
&trial_beta_buf,
&trial_beta,
linear_constraints.as_ref(),
)?;
states[b].beta = trial_beta;
if let Some(ref xd) = x_descent {
if states[b].beta == trial_beta_buf {
eta_checkpoint.restore_eta_with_step(&mut states[b], alpha, xd);
} else {
refresh_single_block_eta(family, specs, &mut states, b)?;
}
} else {
refresh_single_block_eta(family, specs, &mut states, b)?;
}
let trial_block_penalty = block_quadratic_penalty(
&states[b].beta,
s_lambda,
ridge,
options.ridge_policy,
);
let trial_penalty =
current_penalty - old_block_penalty + trial_block_penalty;
let line_search_options = coefficient_line_search_options(
options,
objective_cycle_prev - trial_penalty + 1e-10,
);
let trial_ll = match family
.log_likelihood_only_with_options(&states, &line_search_options)
{
Ok(value) => value,
Err(_) => {
states[b].beta.assign(&beta_old);
eta_checkpoint.restore_eta(&mut states[b]);
continue;
}
};
let trialobjective = -trial_ll + trial_penalty;
if trialobjective.is_finite()
&& trialobjective <= objective_cycle_prev + 1e-10
{
objective_cycle_prev = trialobjective;
current_penalty = trial_penalty;
accepted = true;
break;
}
states[b].beta.assign(&beta_old);
eta_checkpoint.restore_eta(&mut states[b]);
}
}
}
}
if !accepted {
states[b].beta.assign(&beta_old);
eta_checkpoint.restore_eta(&mut states[b]);
} else {
let accepted_step = states[b]
.beta
.iter()
.zip(beta_old.iter())
.map(|(new, old)| (new - old).abs())
.fold(0.0_f64, f64::max);
max_accepted_beta_step = max_accepted_beta_step.max(accepted_step);
any_block_modified = true;
}
// Recycle the checkpoint's buffer back into the pre-allocated pool.
eta_backups[b] = eta_checkpoint.into_buffer();
}
// For non-dynamic families, incremental eta updates within the block loop
// maintain correct etas. Only refresh from scratch for dynamic-geometry families
// where block interactions may require recomputation.
if is_dynamic {
refresh_all_block_etas(family, specs, &mut states)?;
}
cached_eval = family.evaluate(&states)?;
current_penalty = total_quadratic_penalty(
&states,
&s_lambdas,
ridge,
options.ridge_policy,
joint_bundle,
Some(specs),
);
let objective = -cached_eval.log_likelihood + current_penalty;
let objective_change = (objective - lastobjective).abs();
lastobjective = objective;
cycles_done = cycle + 1;
// Divergence guard (mirrors the joint-Newton sibling, gam#554): a
// non-finite objective / log-likelihood means a near-unidentified
// penalized block has propagated NaN mass through the coordinate
// descent. Every convergence and divergence-frozen exit below is a
// finite `<=` comparison that NaN silently defeats, so without this
// the loop grinds the full `inner_max_cycles` on every outer ρ-eval
// and startup seed. Break unconverged so the outer optimizer rejects
// this point immediately instead of burning the budget.
if !objective.is_finite() || !cached_eval.log_likelihood.is_finite() {
log::warn!(
"[PIRLS/blockwise convergence] cycle {:>3} | divergence guard: non-finite inner state (objective={:.3e}, -loglik={:.3e}); returning unconverged so the outer optimizer rejects this ρ evaluation instead of running to inner_max_cycles.",
cycle,
objective,
-cached_eval.log_likelihood,
);
converged = false;
break;
}
// Scale-aware tolerances — see the matching joint-Newton path
// above for the rationale. At large scale absolute step/residual
// tolerances against `inner_tol = 1e-6` keep this loop spinning
// long after the objective has gone flat.
let beta_inf = states
.iter()
.flat_map(|s| s.beta.iter().copied())
.map(f64::abs)
.fold(0.0_f64, f64::max);
let step_tol = inner_tol * (1.0 + beta_inf);
let objective_tol = inner_tol * (1.0 + objective.abs());
let residual_tol = objective_tol;
// For single-block models the blockwise iteration IS the joint
// iteration, so block-conditional convergence implies joint
// convergence. The exact_newton_joint_stationarity check can
// stall at ~10x the tolerance due to numerical differences
// between the block-conditional and joint gradient formulations,
// causing 100s of wasted cycles on an already-converged solution.
let exact_joint_stationarity_ok = if has_joint_exacthessian && specs.len() >= 2 {
exact_newton_joint_stationarity_inf_norm(
family,
specs,
&cached_eval,
&states,
&s_lambdas,
ridge,
options.ridge_policy,
None,
)?
.map(|residual| residual <= residual_tol)
.unwrap_or(true)
} else {
true
};
log::info!(
"[PIRLS/blockwise convergence] cycle {:>3} | max_proposed_step={:.3e} (tol={:.3e}) | max_accepted_step={:.3e} | obj_change={:.3e} (tol={:.3e}) | beta_inf={:.3e} | joint_stationarity_ok={}",
cycle,
max_proposed_beta_step,
step_tol,
max_accepted_beta_step,
objective_change,
objective_tol,
beta_inf,
exact_joint_stationarity_ok,
);
// Divergence early-exit. See the rationale block at the top of
// this loop. We treat "log-likelihood unchanged + Newton step
// pinned at the trust-region cap" as a near-null direction
// signature and break out unconverged once it persists for
// DIVERGENCE_FROZEN_LOGLIK_CYCLES consecutive iterations. Tracking
// log-likelihood (not objective) is essential: when the null mode
// dominates, only the penalty drifts cycle-to-cycle, so
// `objective_change` stays above tol while -loglik is genuinely
// frozen.
let loglik_change_for_divergence_check =
(cached_eval.log_likelihood - prev_log_likelihood_for_divergence_check).abs();
let loglik_frozen_tol_for_divergence_check =
inner_tol * (1.0 + cached_eval.log_likelihood.abs());
let step_clamped_for_divergence_check = trust_boundary_hit_in_cycle;
let loglik_frozen =
loglik_change_for_divergence_check <= loglik_frozen_tol_for_divergence_check;
let frozen_verdict = frozen_loglik_streak.note(loglik_frozen);
if loglik_frozen {
if step_clamped_for_divergence_check {
clamped_step_in_frozen_run = true;
}
} else {
clamped_step_in_frozen_run = false;
}
prev_log_likelihood_for_divergence_check = cached_eval.log_likelihood;
if frozen_verdict == crate::solver::loop_guard::LoopVerdict::Plateaued
&& clamped_step_in_frozen_run
{
log::warn!(
"[PIRLS/blockwise convergence] divergence early-exit at cycle {} | -loglik={:.6e} frozen for {} consecutive cycles | max_proposed_step={:.3e} (trust-boundary hit observed in frozen run) | step_tol={:.3e}; near-null Hessian direction detected — returning unconverged so the outer optimizer backs off this region instead of running to inner_max_cycles.",
cycle,
-cached_eval.log_likelihood,
frozen_loglik_streak.streak(),
max_proposed_beta_step,
step_tol,
);
converged = false;
break;
}
// ── Timing-driven adaptive early-exit (#289) ────────────────────────
// Mirror the EMA predicate from the PIRLS LM loop: when iterations
// become trivially cheap AND the objective/step are near-stationary,
// accept convergence rather than spinning to inner_max_cycles.
// Only fires after ≥2 data points so the EMA is meaningful.
let cycle_secs = cycle_start.elapsed().as_secs_f64();
let ema = match ema_cycle_secs {
None => cycle_secs,
Some(prev) => 0.3 * cycle_secs + 0.7 * prev,
};
ema_cycle_secs = Some(ema);
if cycle >= 2 {
let cycle_cheap = ema > 0.0 && cycle_secs < 0.25 * ema;
let f_abs = lastobjective.abs().max(1.0);
let deviance_ok = (objective_change / f_abs) < inner_tol * 10.0;
let step_ok = if initial_objective.abs() > 0.0 && objective_change.is_finite() {
(objective_change / initial_objective.abs().max(1.0)) < inner_tol * 10.0
} else {
false
};
if cycle_cheap && deviance_ok && step_ok {
log::info!(
"[PIRLS/blockwise] cycle {} timing-driven adaptive early-exit: \
cycle={:.4}s ema={:.4}s obj_rel={:.3e}",
cycle,
cycle_secs,
ema,
objective_change / f_abs,
);
converged = true;
break;
}
}
// ── end timing-driven adaptive early-exit ────────────────────────────
if max_accepted_beta_step <= step_tol && objective_change <= objective_tol {
if exact_joint_stationarity_ok || max_proposed_beta_step <= step_tol {
converged = true;
}
break;
}
}
// ── Polishing joint Newton step ──
//
// For block-coupled multi-block families (e.g. GAMLSS wiggle), Gauss-Seidel
// blockwise iteration can reach step_inf < inner_tol while the joint KKT
// residual (||Sβ − grad_ℓ||_∞) remains at ~10× inner_tol. This is because
// each block is solved conditionally on other blocks' current values —
// block-conditional stationarity does not imply joint stationarity when
// the likelihood couples blocks off-diagonally.
//
// Once blockwise has placed β near the true joint optimum, a single (or
// a few) damped joint Newton steps can tighten the joint residual to the
// floor set by β magnitudes. This polishing phase is essential for the
// outer REML gradient formula (which assumes exact β̂ stationarity); a
// non-converged β̂ produces large envelope-theorem violations in the
// analytic outer gradient.
if use_joint_newton && !converged {
let ranges_joint: Vec<(usize, usize)> = {
let mut offset = 0;
specs
.iter()
.map(|s| {
let start = offset;
offset += s.design.ncols();
(start, offset)
})
.collect()
};
let total_p_joint: usize = ranges_joint.last().map_or(0, |r| r.1);
let joint_mode_diagonal_ridge =
if ridge > 0.0 && options.ridge_policy.include_quadratic_penalty {
ridge
} else {
0.0
};
let trace_diagonal_ridge = joint_mode_diagonal_ridge + JOINT_TRACE_STABILITY_RIDGE;
// Allow up to a few polishing steps. The blockwise endpoint is close
// to optimum, so step sizes should be small and line search should
// accept full steps quickly.
const POLISH_MAX_ITER: usize = 16;
for _polish_iter in 0..POLISH_MAX_ITER {
// Re-evaluate at current β to get the joint gradient and Hessian.
refresh_all_block_etas(family, specs, &mut states)?;
let eval_for_polish = family.evaluate(&states)?;
let grad_full =
match exact_newton_joint_gradient_from_eval(&eval_for_polish, specs, &states)? {
Some(g) => g,
None => break,
};
// Spec-aware joint Hessian: canonical coupled-curvature source
// (see the joint-Newton availability gate). Families overriding
// only `_with_specs` return `None` from the spec-less default.
let h_joint_opt = family.exact_newton_joint_hessian_with_specs(&states, specs)?;
let Some(h_joint) = h_joint_opt else { break };
let mut h_dense = match symmetrized_square_matrix(
h_joint,
total_p_joint,
"joint polish Hessian shape mismatch",
) {
Ok(matrix) => matrix,
Err(_) => break,
};
add_joint_penalty_to_matrix(
&mut h_dense,
&ranges_joint,
&s_lambdas,
trace_diagonal_ridge,
joint_bundle,
);
let mut beta_joint = Array1::<f64>::zeros(total_p_joint);
for b in 0..specs.len() {
let (start, end) = ranges_joint[b];
beta_joint
.slice_mut(ndarray::s![start..end])
.assign(&states[b].beta);
}
let penalty_beta = apply_joint_block_penalty(
&ranges_joint,
&s_lambdas,
&beta_joint,
joint_mode_diagonal_ridge,
joint_bundle,
);
let rhs = &grad_full - &penalty_beta;
// Respect constraints that block line search on the boundary.
// Gauss-Seidel blockwise leaves the joint KKT residual at a floor
// around |λ_k S_k β̂| for boundary-active components. The residual
// magnitude on FREE components is a better measure of whether we
// should keep polishing: if β_i is clipped at the boundary and
// KKT multiplier μ_i > 0, then rhs[i] is the multiplier, not a
// free-space gradient violation.
let block_constraints_now = collect_block_linear_constraints(family, &states, specs)?;
let joint_constraints_now = assemble_joint_linear_constraints(
&block_constraints_now,
&ranges_joint,
total_p_joint,
)?;
let mut active_mask: Vec<bool> = vec![false; total_p_joint];
if let Some(ref constraints) = joint_constraints_now
&& let Ok(Some(bounds)) = extract_simple_lower_bounds(constraints, total_p_joint)
{
for (idx, (bound, beta_val)) in bounds
.lower_bounds
.iter()
.zip(beta_joint.iter())
.enumerate()
{
if *bound > f64::NEG_INFINITY && (*beta_val - *bound).abs() < 1e-12 {
active_mask[idx] = true;
}
}
}
let res_inf_free = rhs
.iter()
.zip(active_mask.iter())
.filter(|(_, active)| !**active)
.map(|(v, _)| v.abs())
.fold(0.0_f64, f64::max);
// Scale-aware residual tolerance — the joint stationarity
// residual ‖∇ℓ − Sβ‖_∞ scales with |obj| (≈ O(n) at large-scale
// scale), so the historical absolute `inner_tol = 1e-6` is
// unachievable here even at the true minimum. Same rationale
// as the joint-Newton convergence test above.
let polish_obj = -cached_eval.log_likelihood + current_penalty;
let polish_residual_tol = inner_tol * (1.0 + polish_obj.abs());
if res_inf_free <= polish_residual_tol {
converged = true;
break;
}
// Solve constrained Newton system if simple bounds are present,
// else unconstrained.
let delta = if let Some(ref constraints) = joint_constraints_now {
let warm = flatten_joint_active_set(&cached_active_sets, &block_constraints_now);
let lower_bounds_opt = extract_simple_lower_bounds(constraints, total_p_joint)
.ok()
.flatten();
if let Some(bounds) = lower_bounds_opt.as_ref() {
match solve_quadratic_with_simple_lower_bounds(
&h_dense,
&rhs,
&beta_joint,
bounds,
warm.as_deref(),
) {
Ok((beta_new, _active)) => &beta_new - &beta_joint,
Err(_) => break,
}
} else {
match solve_quadratic_with_linear_constraints(
&h_dense,
&rhs,
&beta_joint,
constraints,
warm.as_deref(),
) {
Ok((beta_new, _active)) => &beta_new - &beta_joint,
Err(_) => break,
}
}
} else {
let solver = crate::linalg::utils::StableSolver::new("joint polish");
match solver.solvevectorwithridge_retries(
&h_dense,
&rhs,
JOINT_TRACE_STABILITY_RIDGE,
) {
Some(d) => d,
None => break,
}
};
if !delta.iter().all(|v| v.is_finite()) {
break;
}
// Keep polishing until the free-space joint residual is small; a
// tiny delta alone is not a certificate of stationarity.
// Damped line search with projection.
let old_states: Vec<ParameterBlockState> = states.clone();
let old_obj = -eval_for_polish.log_likelihood + current_penalty;
let mut accepted_polish = false;
for bt in 0..10 {
let alpha = 0.5f64.powi(bt);
for b in 0..specs.len() {
let (start, end) = ranges_joint[b];
let mut trial_beta = old_states[b].beta.clone();
trial_beta.scaled_add(alpha, &delta.slice(ndarray::s![start..end]));
let projected = family.post_update_block_beta(
&old_states,
b,
&specs[b],
trial_beta.clone(),
)?;
reject_constrained_post_update_repair(
b,
&specs[b],
&trial_beta,
&projected,
block_constraints_now[b].as_ref(),
)?;
states[b].beta.assign(&projected);
}
refresh_all_block_etas(family, specs, &mut states)?;
let trial_ll = match family.log_likelihood_only(&states) {
Ok(v) => v,
Err(_) => {
for (b, s) in old_states.iter().enumerate() {
states[b] = s.clone();
}
refresh_all_block_etas(family, specs, &mut states)?;
continue;
}
};
let trial_penalty = total_quadratic_penalty(
&states,
&s_lambdas,
ridge,
options.ridge_policy,
joint_bundle,
Some(specs),
);
let trial_obj = -trial_ll + trial_penalty;
if trial_obj.is_finite() && trial_obj <= old_obj + 1e-12 {
current_penalty = trial_penalty;
cached_eval = family.evaluate(&states)?;
accepted_polish = true;
break;
}
}
if !accepted_polish {
// Restore and stop polishing.
for (b, s) in old_states.iter().enumerate() {
states[b] = s.clone();
}
refresh_all_block_etas(family, specs, &mut states)?;
break;
}
}
}
// Reuse cached evaluation from the last cycle's end (or the initial eval if 0 cycles ran).
let penalty_value = total_quadratic_penalty(
&states,
&s_lambdas,
ridge,
options.ridge_policy,
joint_bundle,
Some(specs),
);
let (block_logdet_h, block_logdet_s) =
blockwise_logdet_terms(family, specs, &mut states, block_log_lambdas, options)?;
let kkt_residual = if converged {
match exact_newton_joint_gradient_from_eval(&cached_eval, specs, &states)? {
Some(gradient) => {
let block_constraints = collect_block_linear_constraints(family, &states, specs)?;
let local_total_p: usize = specs.iter().map(|spec| spec.design.ncols()).sum();
let active_set_rows_total: usize = cached_active_sets
.iter()
.map(|maybe| maybe.as_ref().map(|v| v.len()).unwrap_or(0))
.sum();
let free_rank_at_cert = local_total_p.saturating_sub(active_set_rows_total);
exact_newton_joint_projected_kkt_residual_for_ift_from_gradient(
&gradient,
specs,
&states,
&s_lambdas,
ridge,
options.ridge_policy,
&block_constraints,
Some(cached_active_sets.as_slice()),
)?
.map(|r| r.with_metadata(last_residual_tol, free_rank_at_cert))
}
None => None,
}
} else {
// Inner did not converge; no caller should trust an IFT correction
// at a non-KKT iterate.
None
};
let active_constraints = {
let local_ranges = block_param_ranges(specs);
let local_total_p = local_ranges.last().map(|(_, end)| *end).unwrap_or(0);
let block_constraints = collect_block_linear_constraints(family, &states, specs)?;
assemble_active_constraint_block(
&block_constraints,
&cached_active_sets,
&local_ranges,
local_total_p,
)
.map(std::sync::Arc::new)
};
Ok(BlockwiseInnerResult {
block_states: states,
active_sets: normalize_active_sets(cached_active_sets),
log_likelihood: cached_eval.log_likelihood,
penalty_value,
cycles: cycles_done,
converged,
block_logdet_h,
block_logdet_s,
s_lambdas,
joint_workspace: None,
kkt_residual,
active_constraints,
})
}
/// Borrowed derivative provider for joint models that wraps closures with
/// non-`'static` lifetimes.
///
/// The closures borrow data from the calling stack frame (family, synced states,
/// specs), so they cannot be required to be `'static`. This type therefore
/// holds plain `&'a` references to the closures and implements
/// `HessianDerivativeProvider` directly on top of them.
///
/// # Sign convention
///
/// The unified evaluator passes `v_k = H⁻¹(A_k β̂)` to `hessian_derivative_correction`.
/// By the implicit function theorem, `dβ̂/dρ_k = −v_k`. The stored `compute_dh`
/// expects the actual perturbation direction `δβ`, so we negate `v_k` before calling it.
struct BorrowedJointDerivProvider<'a> {
/// First-order drift callback. The provider always invokes it with the
/// perturbation direction, i.e. the negated `v_k` it receives from the trait.
compute_dh: &'a DriftDerivFn<'a>,
/// Optional batched form of `compute_dh`. When set, the batched first-order
/// hook calls it once with all (negated) directions; when `None`, the
/// provider loops over `compute_dh` instead.
compute_dh_many: Option<&'a DriftDerivManyFn<'a>>,
/// Mixed second-derivative callback. The provider calls it with the pair
/// `(−v_l, −v_k)` when assembling a second-order correction.
compute_d2h: &'a DriftSecondDerivFn<'a>,
/// Optional batched second-derivative callback. The unified evaluator's
/// outer-Hessian ρ-ρ pair loop precomputes all K(K+1)/2 (v_k, v_l, u_kl)
/// triples and calls this once per outer Hessian assembly when set, so
/// families that fuse the per-row D²H walk across pairs (e.g. survival
/// marginal-slope which scans n rows once per outer eval) replace
/// K(K+1)/2 separate row-walks with one. The default `None` falls back
/// to the per-pair `compute_d2h` dispatch and preserves the historical
/// dispatch cost.
compute_d2h_many: Option<&'a DriftSecondDerivManyFn<'a>>,
/// Optional family-supplied outer-Hessian operator; handed back (as a cloned
/// `Arc`) from the trait's `family_outer_hessian_operator` accessor.
family_outer_hessian_operator:
Option<Arc<dyn crate::solver::outer_strategy::OuterHessianOperator>>,
}
/// Assembles the two-term second-derivative correction shared by the borrowed
/// and owned joint derivative providers.
///
/// * `compute_dh` is evaluated at `u_kl` and yields the drift derivative
///   `D_β H[u_kl]` (term 1).
/// * `compute_d2h` is evaluated at `(−v_l, −v_k)` and yields the mixed second
///   derivative `D²_β H[−v_l, −v_k]` (term 2).
///
/// Both terms are wrapped into one `CompositeHyperOperator` whose `dim_hint`
/// is `u_kl.len()`. If either callback reports `None`, the function returns
/// `Ok(None)` immediately; in particular `compute_d2h` is never invoked when
/// term 1 is absent. Callback errors are propagated unchanged.
fn joint_second_derivative_correction_result(
    compute_dh: &dyn Fn(&Array1<f64>) -> Result<Option<DriftDerivResult>, String>,
    compute_d2h: &dyn Fn(&Array1<f64>, &Array1<f64>) -> Result<Option<DriftDerivResult>, String>,
    v_k: &Array1<f64>,
    v_l: &Array1<f64>,
    u_kl: &Array1<f64>,
) -> Result<Option<DriftDerivResult>, String> {
    // Term 1: drift derivative along the second-order mode response.
    let drift_term = match compute_dh(u_kl)? {
        Some(term) => term,
        None => return Ok(None),
    };

    // Term 2: mixed second derivative along the two (negated) first-order
    // directions. The negations are only materialized once term 1 exists.
    let flipped_k = -v_k;
    let flipped_l = -v_l;
    let mixed_term = match compute_d2h(&flipped_l, &flipped_k)? {
        Some(term) => term,
        None => return Ok(None),
    };

    let fused = crate::solver::estimate::reml::unified::CompositeHyperOperator {
        dense: None,
        operators: vec![drift_term.into_operator(), mixed_term.into_operator()],
        dim_hint: u_kl.len(),
    };
    Ok(Some(DriftDerivResult::Operator(Arc::new(fused))))
}
impl HessianDerivativeProvider for BorrowedJointDerivProvider<'_> {
    /// Dense first-order correction: materializes the operator produced by
    /// `hessian_derivative_correction_result` with `to_dense()`.
    fn hessian_derivative_correction(
        &self,
        v_k: &Array1<f64>,
    ) -> Result<Option<Array2<f64>>, String> {
        Ok(self
            .hessian_derivative_correction_result(v_k)?
            .map(|result| result.into_operator().to_dense()))
    }

    /// First-order correction for one direction. `compute_dh` expects the
    /// perturbation direction, so it is called with `-v_k`.
    fn hessian_derivative_correction_result(
        &self,
        v_k: &Array1<f64>,
    ) -> Result<Option<DriftDerivResult>, String> {
        let neg_v = -v_k;
        (self.compute_dh)(&neg_v)
    }

    /// First-order corrections for many directions at once. Every `v_k` is
    /// negated; the batched callback is used when available, otherwise
    /// `compute_dh` is applied direction by direction.
    fn hessian_derivative_corrections_result(
        &self,
        v_ks: &[Array1<f64>],
    ) -> Result<Vec<Option<DriftDerivResult>>, String> {
        let neg_vs: Vec<Array1<f64>> = v_ks.iter().map(|v_k| -v_k).collect();
        match self.compute_dh_many {
            Some(compute_dh_many) => compute_dh_many(&neg_vs),
            None => neg_vs
                .iter()
                .map(|neg_v| (self.compute_dh)(neg_v))
                .collect(),
        }
    }

    fn has_batched_hessian_derivative_corrections(&self) -> bool {
        self.compute_dh_many.is_some()
    }

    /// Dense second-order correction: materializes the operator produced by
    /// `hessian_second_derivative_correction_result` with `to_dense()`.
    fn hessian_second_derivative_correction(
        &self,
        v_k: &Array1<f64>,
        v_l: &Array1<f64>,
        u_kl: &Array1<f64>,
    ) -> Result<Option<Array2<f64>>, String> {
        Ok(self
            .hessian_second_derivative_correction_result(v_k, v_l, u_kl)?
            .map(|result| result.into_operator().to_dense()))
    }

    /// Second-order correction for one `(v_k, v_l, u_kl)` triple, delegated to
    /// the shared `(term1, term2)` assembly helper.
    fn hessian_second_derivative_correction_result(
        &self,
        v_k: &Array1<f64>,
        v_l: &Array1<f64>,
        u_kl: &Array1<f64>,
    ) -> Result<Option<DriftDerivResult>, String> {
        joint_second_derivative_correction_result(self.compute_dh, self.compute_d2h, v_k, v_l, u_kl)
    }

    /// Second-order corrections for a batch of `(v_k, v_l, u_kl)` triples.
    ///
    /// Without a batched D²H callback this is a plain per-triple loop over
    /// `hessian_second_derivative_correction_result`.
    ///
    /// # Errors
    ///
    /// Propagates callback errors. Additionally returns an error when a
    /// batched callback hands back a result vector whose length differs from
    /// `triples.len()`; previously a short vector caused an index panic and a
    /// long one was silently truncated.
    fn hessian_second_derivative_corrections_result(
        &self,
        triples: &[(Array1<f64>, Array1<f64>, Array1<f64>)],
    ) -> Result<Vec<Option<DriftDerivResult>>, String> {
        let Some(compute_d2h_many) = self.compute_d2h_many else {
            return triples
                .iter()
                .map(|(v_k, v_l, u_kl)| {
                    self.hessian_second_derivative_correction_result(v_k, v_l, u_kl)
                })
                .collect();
        };

        // Fast path: the family supplied a batched D²H callback that fuses the
        // per-row scan across all triples in one pass. Term 1 is the
        // (potentially batched) `compute_dh` walk over the `u_kl` directions.
        // `hessian_derivative_corrections_result` negates its inputs, so we
        // pass `-u_kl` to have `compute_dh` evaluated at `u_kl` itself, which
        // matches the single-triple hook.
        let neg_u_kls: Vec<Array1<f64>> = triples.iter().map(|(_, _, u_kl)| -u_kl).collect();
        let term1s = self.hessian_derivative_corrections_result(&neg_u_kls)?;

        let pairs: Vec<(Array1<f64>, Array1<f64>)> =
            triples.iter().map(|(v_k, v_l, _)| (-v_l, -v_k)).collect();
        let term2s = compute_d2h_many(&pairs)?;

        if term1s.len() != triples.len() || term2s.len() != triples.len() {
            return Err(format!(
                "batched joint second-derivative correction: expected {} results per term, \
                 got {} first-order and {} second-order",
                triples.len(),
                term1s.len(),
                term2s.len(),
            ));
        }

        // Consume the owned result vectors so no `DriftDerivResult` has to be
        // cloned before it is turned into an operator.
        Ok(triples
            .iter()
            .zip(term1s)
            .zip(term2s)
            .map(|(((_, _, u_kl), term1), term2)| match (term1, term2) {
                (Some(t1), Some(t2)) => {
                    let op = crate::solver::estimate::reml::unified::CompositeHyperOperator {
                        dense: None,
                        operators: vec![t1.into_operator(), t2.into_operator()],
                        dim_hint: u_kl.len(),
                    };
                    Some(DriftDerivResult::Operator(Arc::new(op)))
                }
                _ => None,
            })
            .collect())
    }

    fn has_batched_hessian_second_derivative_corrections(&self) -> bool {
        self.compute_d2h_many.is_some()
    }

    fn has_corrections(&self) -> bool {
        true
    }

    /// Returns a clone of the optional family-supplied outer-Hessian operator.
    fn family_outer_hessian_operator(
        &self,
    ) -> Option<Arc<dyn crate::solver::outer_strategy::OuterHessianOperator>> {
        self.family_outer_hessian_operator.clone()
    }
}
/// Owned counterpart of `BorrowedJointDerivProvider`: the same set of drift
/// callbacks, but each one is held behind an `Arc` with `Send + Sync` bounds
/// instead of a borrowed reference.
struct OwnedJointDerivProvider {
/// First-order drift callback; invoked by the provider with the negated
/// direction it receives from the trait.
compute_dh: Arc<dyn Fn(&Array1<f64>) -> Result<Option<DriftDerivResult>, String> + Send + Sync>,
/// Optional batched form of `compute_dh`, taking all (negated) directions
/// in one call.
compute_dh_many: Option<
Arc<dyn Fn(&[Array1<f64>]) -> Result<Vec<Option<DriftDerivResult>>, String> + Send + Sync>,
>,
/// Mixed second-derivative callback; invoked with the pair `(−v_l, −v_k)`.
compute_d2h: Arc<
dyn Fn(&Array1<f64>, &Array1<f64>) -> Result<Option<DriftDerivResult>, String>
+ Send
+ Sync,
>,
/// Optional batched second-derivative callback. See the matching field on
/// `BorrowedJointDerivProvider` for the dispatch contract.
compute_d2h_many: Option<
Arc<
dyn Fn(&[(Array1<f64>, Array1<f64>)]) -> Result<Vec<Option<DriftDerivResult>>, String>
+ Send
+ Sync,
>,
>,
/// Optional family-supplied outer-Hessian operator; handed back (as a cloned
/// `Arc`) from the trait's `family_outer_hessian_operator` accessor.
family_outer_hessian_operator:
Option<Arc<dyn crate::solver::outer_strategy::OuterHessianOperator>>,
}
impl HessianDerivativeProvider for OwnedJointDerivProvider {
    /// Dense first-order correction: materializes the operator produced by
    /// `hessian_derivative_correction_result` with `to_dense()`.
    fn hessian_derivative_correction(
        &self,
        v_k: &Array1<f64>,
    ) -> Result<Option<Array2<f64>>, String> {
        Ok(self
            .hessian_derivative_correction_result(v_k)?
            .map(|result| result.into_operator().to_dense()))
    }

    /// First-order correction for one direction. `compute_dh` expects the
    /// perturbation direction, so it is called with `-v_k`.
    fn hessian_derivative_correction_result(
        &self,
        v_k: &Array1<f64>,
    ) -> Result<Option<DriftDerivResult>, String> {
        let neg_v = -v_k;
        (self.compute_dh)(&neg_v)
    }

    /// First-order corrections for many directions at once. Every `v_k` is
    /// negated; the batched callback is used when available, otherwise
    /// `compute_dh` is applied direction by direction.
    fn hessian_derivative_corrections_result(
        &self,
        v_ks: &[Array1<f64>],
    ) -> Result<Vec<Option<DriftDerivResult>>, String> {
        let neg_vs: Vec<Array1<f64>> = v_ks.iter().map(|v_k| -v_k).collect();
        match self.compute_dh_many.as_ref() {
            Some(compute_dh_many) => compute_dh_many(&neg_vs),
            None => neg_vs
                .iter()
                .map(|neg_v| (self.compute_dh)(neg_v))
                .collect(),
        }
    }

    fn has_batched_hessian_derivative_corrections(&self) -> bool {
        self.compute_dh_many.is_some()
    }

    /// Dense second-order correction: materializes the operator produced by
    /// `hessian_second_derivative_correction_result` with `to_dense()`.
    fn hessian_second_derivative_correction(
        &self,
        v_k: &Array1<f64>,
        v_l: &Array1<f64>,
        u_kl: &Array1<f64>,
    ) -> Result<Option<Array2<f64>>, String> {
        Ok(self
            .hessian_second_derivative_correction_result(v_k, v_l, u_kl)?
            .map(|result| result.into_operator().to_dense()))
    }

    /// Second-order correction for one `(v_k, v_l, u_kl)` triple, delegated to
    /// the shared `(term1, term2)` assembly helper.
    fn hessian_second_derivative_correction_result(
        &self,
        v_k: &Array1<f64>,
        v_l: &Array1<f64>,
        u_kl: &Array1<f64>,
    ) -> Result<Option<DriftDerivResult>, String> {
        joint_second_derivative_correction_result(
            &*self.compute_dh,
            &*self.compute_d2h,
            v_k,
            v_l,
            u_kl,
        )
    }

    /// Second-order corrections for a batch of `(v_k, v_l, u_kl)` triples.
    ///
    /// Without a batched D²H callback this is a plain per-triple loop over
    /// `hessian_second_derivative_correction_result`.
    ///
    /// # Errors
    ///
    /// Propagates callback errors. Additionally returns an error when a
    /// batched callback hands back a result vector whose length differs from
    /// `triples.len()`; previously a short vector caused an index panic and a
    /// long one was silently truncated.
    fn hessian_second_derivative_corrections_result(
        &self,
        triples: &[(Array1<f64>, Array1<f64>, Array1<f64>)],
    ) -> Result<Vec<Option<DriftDerivResult>>, String> {
        let Some(compute_d2h_many) = self.compute_d2h_many.as_ref() else {
            return triples
                .iter()
                .map(|(v_k, v_l, u_kl)| {
                    self.hessian_second_derivative_correction_result(v_k, v_l, u_kl)
                })
                .collect();
        };

        // Term 1 is the (potentially batched) `compute_dh` walk over the
        // `u_kl` directions. `hessian_derivative_corrections_result` negates
        // its inputs, so we pass `-u_kl` to have `compute_dh` evaluated at
        // `u_kl` itself, which matches the single-triple hook.
        let neg_u_kls: Vec<Array1<f64>> = triples.iter().map(|(_, _, u_kl)| -u_kl).collect();
        let term1s = self.hessian_derivative_corrections_result(&neg_u_kls)?;

        // Term 2 is the batched mixed second derivative at `(−v_l, −v_k)`.
        let pairs: Vec<(Array1<f64>, Array1<f64>)> =
            triples.iter().map(|(v_k, v_l, _)| (-v_l, -v_k)).collect();
        let term2s = compute_d2h_many(&pairs)?;

        if term1s.len() != triples.len() || term2s.len() != triples.len() {
            return Err(format!(
                "batched joint second-derivative correction: expected {} results per term, \
                 got {} first-order and {} second-order",
                triples.len(),
                term1s.len(),
                term2s.len(),
            ));
        }

        // Consume the owned result vectors so no `DriftDerivResult` has to be
        // cloned before it is turned into an operator.
        Ok(triples
            .iter()
            .zip(term1s)
            .zip(term2s)
            .map(|(((_, _, u_kl), term1), term2)| match (term1, term2) {
                (Some(t1), Some(t2)) => {
                    let op = crate::solver::estimate::reml::unified::CompositeHyperOperator {
                        dense: None,
                        operators: vec![t1.into_operator(), t2.into_operator()],
                        dim_hint: u_kl.len(),
                    };
                    Some(DriftDerivResult::Operator(Arc::new(op)))
                }
                _ => None,
            })
            .collect())
    }

    fn has_batched_hessian_second_derivative_corrections(&self) -> bool {
        self.compute_d2h_many.is_some()
    }

    fn has_corrections(&self) -> bool {
        true
    }

    /// Exposes the owned first- and second-order callbacks as a
    /// `Callback` derivative kernel (the `Arc`s are shared, not copied).
    fn outer_hessian_derivative_kernel(
        &self,
    ) -> Option<crate::solver::estimate::reml::unified::OuterHessianDerivativeKernel> {
        Some(
            crate::solver::estimate::reml::unified::OuterHessianDerivativeKernel::Callback {
                first: Arc::clone(&self.compute_dh),
                second: Arc::clone(&self.compute_d2h),
            },
        )
    }

    /// Returns a clone of the optional family-supplied outer-Hessian operator.
    fn family_outer_hessian_operator(
        &self,
    ) -> Option<Arc<dyn crate::solver::outer_strategy::OuterHessianOperator>> {
        self.family_outer_hessian_operator.clone()
    }
}
/// Drift closure producing the Tier-B Jeffreys-curvature drift
/// `D_β H_Φ[δβ]` for a mode-response direction `δβ = dβ̂/dρ_k`.
///
/// The closure expects the actual perturbation direction `δβ` (NOT the raw
/// `v_k` the trait hands the provider); the wrapper negates `v_k → δβ = −v_k`
/// before calling, exactly mirroring `BorrowedJointDerivProvider`'s sign
/// convention and the inner `compute_dh` it composes with. Returns `None` when
/// the Jeffreys term is gated out or the family lacks the exact derivatives, so
/// the wrapper falls back to the inner provider's drift unchanged.
///
/// The closure is shared behind an `Arc` and must be `Send + Sync`; a
/// `Some` result is a dense `Array2<f64>` matrix.
type JeffreysHphiDriftFn =
Arc<dyn Fn(&Array1<f64>) -> Result<Option<Array2<f64>>, String> + Send + Sync>;
/// Jeffreys-`H_Φ`-aware joint derivative provider.
///
/// Wraps an inner Tier-B joint provider (which supplies the likelihood-Hessian
/// drift `D_β H_L[v_k]`) and ADDS the Jeffreys-curvature drift `D_β H_Φ[v_k]` to
/// the first-order trace corrections. This closes the bug where the Tier-B outer
/// LAML gradient omitted `H_Φ`'s ρ-dependence (through β̂): the objective folds
/// `H_Φ` into `½ log|H + S_λ + H_Φ|`, so its exact gradient
/// `½ tr[(H+S_λ+H_Φ)⁻¹ (∂_ρ S_λ + D_β H_L[v_k] + D_β H_Φ[v_k])]`
/// MUST include the `D_β H_Φ[v_k]` term. It is the exact analogue of the Tier-A
/// `FirthAwareGlmDerivatives` (`unified.rs`) `−D(Hφ)[B_k]` first-order term, and
/// of `BarrierDerivativeProvider`'s additive-correction composition pattern.
///
/// SIGN. The trait passes `v_k = H⁻¹(A_kβ̂)`; the mode response is `δβ = −v_k`.
/// We negate before invoking the drift closure, so `corr = + D_β H_Φ[δβ]` is
/// added on top of the inner provider's already-correct likelihood drift.
struct JeffreysHphiAwareJointDerivatives<'a> {
// Wrapped provider: its first-order results are augmented with the Jeffreys
// drift, while second-order and kernel queries are forwarded to it unchanged.
inner: Box<dyn HessianDerivativeProvider + 'a>,
// Closure evaluating the Jeffreys drift; always invoked with the negated `v_k`.
drift: JeffreysHphiDriftFn,
// Dimension hint passed to the `CompositeHyperOperator` that is built when the
// inner first-order result is operator-valued.
p: usize,
}
impl<'a> JeffreysHphiAwareJointDerivatives<'a> {
fn new(
inner: Box<dyn HessianDerivativeProvider + 'a>,
drift: JeffreysHphiDriftFn,
p: usize,
) -> Self {
Self { inner, drift, p }
}
/// `D_β H_Φ[δβ]` with the trait's `v_k → δβ = −v_k` mode-response convention.
fn hphi_drift(&self, v_k: &Array1<f64>) -> Result<Option<Array2<f64>>, String> {
let delta = v_k.mapv(|value| -value);
(self.drift)(&delta)
}
}
impl JeffreysHphiAwareJointDerivatives<'_> {
    /// Folds the Jeffreys drift into one first-order result from the inner
    /// provider. Shared by the singular and the batched `_result` paths so the
    /// two cannot drift apart.
    ///
    /// * dense inner + drift → element-wise sum, result stays dense;
    /// * operator inner + drift → a `CompositeHyperOperator` carrying the drift
    ///   as its dense part and the inner operator as its single operator part,
    ///   with `dim_hint = self.p`;
    /// * inner only → returned untouched;
    /// * drift only → returned as a dense result;
    /// * neither → `None`.
    fn fold_hphi_drift(
        &self,
        inner: Option<DriftDerivResult>,
        drift: Option<Array2<f64>>,
    ) -> Option<DriftDerivResult> {
        match (inner, drift) {
            (Some(DriftDerivResult::Dense(mut dense)), Some(d)) => {
                dense += &d;
                Some(DriftDerivResult::Dense(dense))
            }
            (Some(DriftDerivResult::Operator(operator)), Some(d)) => {
                Some(DriftDerivResult::Operator(Arc::new(
                    crate::solver::estimate::reml::unified::CompositeHyperOperator {
                        dense: Some(d),
                        operators: vec![operator],
                        dim_hint: self.p,
                    },
                )))
            }
            (inner_only, None) => inner_only,
            (None, Some(d)) => Some(DriftDerivResult::Dense(d)),
        }
    }
}
impl HessianDerivativeProvider for JeffreysHphiAwareJointDerivatives<'_> {
    /// Dense first-order correction: the inner provider's correction plus the
    /// Jeffreys drift for `v_k`. Either side may be absent; when both are
    /// absent the result is `None`. Errors from either side are propagated.
    fn hessian_derivative_correction(
        &self,
        v_k: &Array1<f64>,
    ) -> Result<Option<Array2<f64>>, String> {
        let inner = self.inner.hessian_derivative_correction(v_k)?;
        let drift = self.hphi_drift(v_k)?;
        Ok(match (inner, drift) {
            (Some(mut total), Some(d)) => {
                total += &d;
                Some(total)
            }
            (inner_only, None) => inner_only,
            (None, drift_only) => drift_only,
        })
    }
    /// Dense-or-operator first-order correction with the Jeffreys drift folded
    /// in (see `fold_hphi_drift` for the combination rules).
    fn hessian_derivative_correction_result(
        &self,
        v_k: &Array1<f64>,
    ) -> Result<Option<DriftDerivResult>, String> {
        let inner = self.inner.hessian_derivative_correction_result(v_k)?;
        let drift = self.hphi_drift(v_k)?;
        Ok(self.fold_hphi_drift(inner, drift))
    }
    /// Batched first-order corrections: one entry per direction in `v_ks`.
    ///
    /// # Errors
    /// Propagates errors from the inner provider and the drift closure, and
    /// additionally fails when the inner provider does not return exactly one
    /// result per direction (previously such a mismatch was silently truncated
    /// by `zip`, mis-sizing the output for callers that index it by direction).
    fn hessian_derivative_corrections_result(
        &self,
        v_ks: &[Array1<f64>],
    ) -> Result<Vec<Option<DriftDerivResult>>, String> {
        // Delegate the (possibly batched) inner walk, then fold the per-direction
        // H_Φ drift into each result so the batched path stays consistent with the
        // singular one.
        let inner = self.inner.hessian_derivative_corrections_result(v_ks)?;
        if inner.len() != v_ks.len() {
            return Err(format!(
                "Jeffreys H_Phi provider: inner batched first-order drift returned {} results for {} directions",
                inner.len(),
                v_ks.len()
            ));
        }
        inner
            .into_iter()
            .zip(v_ks)
            .map(|(inner_result, v_k)| {
                let drift = self.hphi_drift(v_k)?;
                Ok(self.fold_hphi_drift(inner_result, drift))
            })
            .collect()
    }
    /// Batching capability is whatever the inner provider reports.
    fn has_batched_hessian_derivative_corrections(&self) -> bool {
        self.inner.has_batched_hessian_derivative_corrections()
    }
    // SECOND-ORDER (outer Hessian) RESIDUAL GAP. The full second-order Jeffreys
    // drift `D²_β H_Φ[v_k, v_l]` (the analogue of Tier-A's
    // `−D(Hφ)[B_{kl}] − D²(Hφ)[B_k, B_l]`) is NOT yet folded in here: the
    // second-derivative methods delegate to the inner likelihood drift only. This
    // leaves the OUTER HESSIAN's Jeffreys contribution first-order-incomplete, but
    // the FIRST-ORDER outer GRADIENT — the term the line search and KKT
    // certification actually consume — is now exact. ARC/Newton on the outer
    // problem still gets a consistent gradient; the Hessian is a (PD) curvature
    // surrogate as before.
    /// Second-order dense correction: forwarded to the inner provider unchanged.
    fn hessian_second_derivative_correction(
        &self,
        v_k: &Array1<f64>,
        v_l: &Array1<f64>,
        u_kl: &Array1<f64>,
    ) -> Result<Option<Array2<f64>>, String> {
        self.inner
            .hessian_second_derivative_correction(v_k, v_l, u_kl)
    }
    /// Second-order dense-or-operator correction: forwarded unchanged.
    fn hessian_second_derivative_correction_result(
        &self,
        v_k: &Array1<f64>,
        v_l: &Array1<f64>,
        u_kl: &Array1<f64>,
    ) -> Result<Option<DriftDerivResult>, String> {
        self.inner
            .hessian_second_derivative_correction_result(v_k, v_l, u_kl)
    }
    /// Batched second-order corrections: forwarded unchanged.
    fn hessian_second_derivative_corrections_result(
        &self,
        triples: &[(Array1<f64>, Array1<f64>, Array1<f64>)],
    ) -> Result<Vec<Option<DriftDerivResult>>, String> {
        self.inner
            .hessian_second_derivative_corrections_result(triples)
    }
    /// Second-order batching capability is whatever the inner provider reports.
    fn has_batched_hessian_second_derivative_corrections(&self) -> bool {
        self.inner
            .has_batched_hessian_second_derivative_corrections()
    }
    /// Always `true` for this wrapper.
    fn has_corrections(&self) -> bool {
        true
    }
    fn outer_hessian_derivative_kernel(
        &self,
    ) -> Option<crate::solver::estimate::reml::unified::OuterHessianDerivativeKernel> {
        // Delegate to the inner provider so the matrix-free outer-HESSIAN route
        // (the `Callback { first, second }` kernel) is preserved. This kernel
        // feeds ONLY the outer Hessian, never the gradient (the gradient's
        // first-order trace flows through `hessian_derivative_correction_result`,
        // which IS wrapped above). The H_Φ SECOND-order drift is the documented
        // residual gap; routing the kernel unchanged keeps the Hessian a
        // consistent PD curvature surrogate without forcing dense assembly.
        self.inner.outer_hessian_derivative_kernel()
    }
    /// Family outer-Hessian operator: forwarded from the inner provider.
    fn family_outer_hessian_operator(
        &self,
    ) -> Option<Arc<dyn crate::solver::outer_strategy::OuterHessianOperator>> {
        self.inner.family_outer_hessian_operator()
    }
}
/// Optional bundle of extended (ψ) hyperparameter coordinate data to attach
/// to an `InnerSolution` before calling the unified evaluator.
struct ExtCoordBundle {
// Extended coordinates; forwarded as the assembly's `ext_coords`, and their
// count is reported as the extended dimension.
coords: Vec<HyperCoord>,
// Per-pair callback taking two indices; forwarded as `ext_coord_pair_fn`.
ext_ext_fn: Option<Box<dyn Fn(usize, usize) -> HyperCoordPair + Send + Sync>>,
// Per-pair callback taking two indices; forwarded as `rho_ext_pair_fn`.
rho_ext_fn: Option<Box<dyn Fn(usize, usize) -> HyperCoordPair + Send + Sync>>,
// Drift-derivative callback; forwarded as `fixed_drift_deriv`.
drift_fn: Option<FixedDriftDerivFn>,
/// Direction-contracted ψψ second-order hook (#740). When `Some`, the
/// outer-Hessian operator builder skips the `K²` per-pair ψψ assembly
/// (`ext_ext_fn`) and applies this once per matvec. `ext_ext_fn` is still
/// kept as the documented fallback for the dense `compute_outer_hessian`
/// path and for outer evaluations that do not build the matrix-free
/// operator.
contracted_psi_fn: Option<ContractedPsiSecondOrderFn>,
}
/// A `HyperOperator` decorator that multiplies the wrapped operator's
/// `mul_vec`, `bilinear`, and `to_dense` outputs by a fixed scalar.
struct ScaledHyperOperator {
// Operator being scaled; shared, never mutated.
inner: Arc<dyn HyperOperator>,
// Scalar factor applied to every output of `inner`.
scale: f64,
}
impl HyperOperator for ScaledHyperOperator {
    /// Dimension is that of the wrapped operator.
    fn dim(&self) -> usize {
        self.inner.dim()
    }

    /// `scale · (inner · v)`.
    fn mul_vec(&self, v: &Array1<f64>) -> Array1<f64> {
        let factor = self.scale;
        let unscaled = self.inner.mul_vec(v);
        unscaled.mapv(|entry| factor * entry)
    }

    /// `scale · inner.bilinear(v, u)`.
    fn bilinear(&self, v: &Array1<f64>, u: &Array1<f64>) -> f64 {
        let unscaled = self.inner.bilinear(v, u);
        self.scale * unscaled
    }

    /// Dense form of the wrapped operator with every entry multiplied by `scale`.
    fn to_dense(&self) -> Array2<f64> {
        let factor = self.scale;
        let unscaled = self.inner.to_dense();
        unscaled.mapv(|entry| factor * entry)
    }

    /// Always reports `false`, independent of the wrapped operator.
    // NOTE(review): the wrapped operator's own `is_implicit` is not consulted;
    // confirm this is intended rather than an omitted delegation.
    fn is_implicit(&self) -> bool {
        false
    }
}
/// Multiplies every populated component of a `HyperCoordDrift` by `scale`.
///
/// The dense matrix and the block-local matrix are scaled in place; an
/// operator component is wrapped in a `ScaledHyperOperator`. A `scale` of
/// exactly `1.0` returns the drift untouched.
fn scale_hypercoord_drift(mut drift: HyperCoordDrift, scale: f64) -> HyperCoordDrift {
    if scale != 1.0 {
        if let Some(dense) = drift.dense.as_mut() {
            *dense *= scale;
        }
        if let Some(block_local) = drift.block_local.as_mut() {
            block_local.local *= scale;
        }
        if let Some(inner) = drift.operator.take() {
            drift.operator = Some(Arc::new(ScaledHyperOperator { inner, scale }));
        }
    }
    drift
}
/// Multiplies a `HyperCoord` by `scale`: its `g`, the optional `firth_g`,
/// `tk_eta_fixed` and `tk_x_fixed` entries when present, and its drift (via
/// `scale_hypercoord_drift`). A `scale` of exactly `1.0` is a no-op.
fn scale_hypercoord(mut coord: HyperCoord, scale: f64) -> HyperCoord {
    if scale != 1.0 {
        coord.g *= scale;
        if let Some(firth_g) = &mut coord.firth_g {
            *firth_g *= scale;
        }
        if let Some(tk_eta_fixed) = &mut coord.tk_eta_fixed {
            *tk_eta_fixed *= scale;
        }
        if let Some(tk_x_fixed) = &mut coord.tk_x_fixed {
            *tk_x_fixed *= scale;
        }
        coord.drift = scale_hypercoord_drift(coord.drift, scale);
    }
    coord
}
/// Multiplies a `HyperCoordPair` by `scale`: `g` and `b_mat` in place, and the
/// optional `b_operator` by re-wrapping it in a `ScaledHyperOperator`.
/// A `scale` of exactly `1.0` returns the pair untouched.
fn scale_hypercoord_pair(mut pair: HyperCoordPair, scale: f64) -> HyperCoordPair {
    if scale != 1.0 {
        pair.g *= scale;
        pair.b_mat *= scale;
        if let Some(boxed) = pair.b_operator.take() {
            // The wrapper stores an `Arc`, so the boxed operator is converted first.
            let inner = Arc::from(boxed);
            pair.b_operator = Some(Box::new(ScaledHyperOperator { inner, scale }));
        }
    }
    pair
}
/// Multiplies a `DriftDerivResult` by `scale`.
///
/// Dense results are scaled in place; operator results are wrapped in a
/// `ScaledHyperOperator`. A `scale` of exactly `1.0` returns the input as-is.
fn scale_drift_deriv_result(result: DriftDerivResult, scale: f64) -> DriftDerivResult {
    match result {
        unchanged if scale == 1.0 => unchanged,
        DriftDerivResult::Dense(mut dense) => {
            dense *= scale;
            DriftDerivResult::Dense(dense)
        }
        DriftDerivResult::Operator(inner) => {
            DriftDerivResult::Operator(Arc::new(ScaledHyperOperator { inner, scale }))
        }
    }
}
impl ExtCoordBundle {
fn scaled(self, scale: f64) -> Self {
if scale == 1.0 {
return self;
}
let coords = self
.coords
.into_iter()
.map(|coord| scale_hypercoord(coord, scale))
.collect();
let ext_ext_fn = self.ext_ext_fn.map(|callback| {
Box::new(move |i: usize, j: usize| scale_hypercoord_pair(callback(i, j), scale))
as Box<dyn Fn(usize, usize) -> HyperCoordPair + Send + Sync>
});
let rho_ext_fn = self.rho_ext_fn.map(|callback| {
Box::new(move |i: usize, j: usize| scale_hypercoord_pair(callback(i, j), scale))
as Box<dyn Fn(usize, usize) -> HyperCoordPair + Send + Sync>
});
let drift_fn = self.drift_fn.map(|callback| {
Box::new(move |ext_idx: usize, direction: &Array1<f64>| {
callback(ext_idx, direction).map(|result| scale_drift_deriv_result(result, scale))
}) as FixedDriftDerivFn
});
// The contracted ψψ hook is a (scaled) linear functional of the same
// family curvature `ext_ext_fn` reproduces, so the `rho_curvature_scale`
// applies term-for-term: objective/score/ld_s by `scale`, and each
// `hessian[i]` drift via `scale_drift_deriv_result` (matching how
// `scale_hypercoord_pair` scales the per-pair `b_mat`/`b_operator`).
let contracted_psi_fn = self.contracted_psi_fn.map(|callback| {
Arc::new(move |alpha_psi: &[f64]| {
callback(alpha_psi).map(|opt| {
opt.map(|contracted| ContractedPsiSecondOrder {
objective: contracted.objective.mapv(|v| scale * v),
score: contracted.score.mapv(|v| scale * v),
hessian: contracted
.hessian
.into_iter()
.map(|drift| scale_drift_deriv_result(drift, scale))
.collect(),
ld_s: contracted.ld_s.mapv(|v| scale * v),
})
})
}) as ContractedPsiSecondOrderFn
});
Self {
coords,
ext_ext_fn,
rho_ext_fn,
drift_fn,
contracted_psi_fn,
}
}
}
/// Build the canonical unified REML/LAML assembly for a custom-family outer
/// evaluation.
fn build_custom_family_inner_assembly<'dp>(
inner: &BlockwiseInnerResult,
specs: &[ParameterBlockSpec],
per_block: &[Array1<f64>],
beta_flat: &Array1<f64>,
hessian_op: Arc<dyn crate::solver::estimate::reml::unified::HessianOperator>,
ranges: &[(usize, usize)],
total: usize,
ridge: f64,
rho_curvature_scale: f64,
hessian_logdet_correction: f64,
penalty_subspace_trace: Option<Arc<PenaltySubspaceTrace>>,
include_logdet_h: bool,
include_logdet_s: bool,
options: &BlockwiseFitOptions,
rho_prior: crate::types::RhoPrior,
deriv_provider: Box<dyn HessianDerivativeProvider + 'dp>,
ext_bundle: Option<ExtCoordBundle>,
firth_value: Option<f64>,
) -> Result<(crate::estimate::reml::assembly::InnerAssembly<'dp>, usize), String> {
use crate::estimate::reml::assembly::{
InnerAssembly, PenaltyBlockDesc, penalty_coords_from_blocks,
};
// Collect dense penalty matrices so references stay valid for the assembler.
let per_block_penalties_dense: Vec<Vec<Array2<f64>>> = {
use rayon::iter::{IntoParallelIterator, ParallelIterator};
(0..specs.len())
.into_par_iter()
.map(|b| specs[b].penalties.iter().map(|p| p.to_dense()).collect())
.collect()
};
let block_descs: Vec<PenaltyBlockDesc> = (0..specs.len())
.flat_map(|b| {
let (start, end) = ranges[b];
per_block_penalties_dense[b]
.iter()
.map(move |dense| PenaltyBlockDesc {
matrix: dense,
range_start: start,
range_end: end,
})
})
.collect();
let penalty_coords = penalty_coords_from_blocks(&block_descs, total)?;
// Compute penalty logdet derivatives.
let per_block_penalties: Vec<&[Array2<f64>]> = per_block_penalties_dense
.iter()
.map(|v| v.as_slice())
.collect();
let penalty_logdet_ridge = if options.ridge_policy.include_penalty_logdet {
ridge
} else {
0.0
};
let penalty_logdet =
compute_block_penalty_logdet_derivs(per_block, &per_block_penalties, penalty_logdet_ridge)?;
let n_observations = inner.block_states.first().map(|s| s.eta.len()).unwrap_or(0);
// Unpack optional ext-coord bundle.
let (ext_coords, ext_coord_pair_fn, rho_ext_pair_fn, fixed_drift_deriv, contracted_psi_fn) =
if let Some(bundle) = ext_bundle {
(
bundle.coords,
bundle.ext_ext_fn,
bundle.rho_ext_fn,
bundle.drift_fn,
bundle.contracted_psi_fn,
)
} else {
(Vec::new(), None, None, None, None)
};
let ext_dim = ext_coords.len();
let evaluator = InnerAssembly {
log_likelihood: inner.log_likelihood,
// inner.penalty_value includes the 0.5 factor (= 0.5 β̂ᵀSβ̂), but the
// unified evaluator convention expects the FULL quadratic β̂ᵀSβ̂ and
// applies 0.5 itself. Double to match the convention.
penalty_quadratic: 2.0 * inner.penalty_value,
beta: beta_flat.clone(),
n_observations,
hessian_op,
penalty_coords,
penalty_logdet,
dispersion: DispersionHandling::Fixed {
phi: 1.0,
include_logdet_h,
include_logdet_s,
},
rho_curvature_scale,
rho_prior,
hessian_logdet_correction,
penalty_subspace_trace,
deriv_provider: Some(deriv_provider),
tk_correction: 0.0,
tk_gradient: None,
// Tier-B Firth fold (gam#979): the inner mode minimizes
// `−ℓ + ½βᵀSβ − Φ`, so the LAML cost must subtract the same gated
// `Φ(β̂)` or the envelope-based analytic outer gradient and the value
// describe different criteria at every Firth-active mode.
firth: firth_value.map(crate::estimate::reml::unified::ExactJeffreysTerm::value_only),
nullspace_dim: None,
barrier_config: None,
ext_coords,
ext_coord_pair_fn,
rho_ext_pair_fn,
fixed_drift_deriv,
contracted_psi_second_order: contracted_psi_fn,
kkt_residual: inner.kkt_residual.clone(),
active_constraints: inner.active_constraints.clone(),
};
Ok((evaluator, ext_dim))
}
/// `HessianOperator` decorator that answers a fixed number of first-order
/// logdet-trace queries with `0.0` before reverting to plain delegation.
struct FirstOrderTraceSkipOperator {
// Operator that receives every non-skipped call.
inner: Arc<dyn HessianOperator>,
// Number of first-order logdet-trace queries still to be answered with 0.0;
// decremented atomically, never below zero.
remaining_first_order_traces: AtomicUsize,
}
impl FirstOrderTraceSkipOperator {
    /// Wraps `inner` with a budget of `skip_count` first-order traces to skip.
    fn new(inner: Arc<dyn HessianOperator>, skip_count: usize) -> Self {
        let remaining_first_order_traces = AtomicUsize::new(skip_count);
        Self {
            inner,
            remaining_first_order_traces,
        }
    }

    /// `true` while at least one skip remains in the budget.
    fn first_order_skip_active(&self) -> bool {
        self.remaining_first_order_traces.load(Ordering::Acquire) != 0
    }

    /// Atomically takes one skip from the budget.
    ///
    /// Returns `true` when a skip was available (and has now been consumed),
    /// `false` when the budget was already exhausted. The counter never wraps
    /// below zero.
    fn consume_first_order_trace(&self) -> bool {
        self.remaining_first_order_traces
            .fetch_update(Ordering::AcqRel, Ordering::Acquire, |remaining| {
                remaining.checked_sub(1)
            })
            .is_ok()
    }
}
impl HessianOperator for FirstOrderTraceSkipOperator {
    // ── First-order logdet traces: each call consumes one skip from the
    //    budget and yields 0.0 while the budget lasts; afterwards the call
    //    goes to the wrapped operator. ─────────────────────────────────────
    fn trace_logdet_gradient(&self, a: &Array2<f64>) -> f64 {
        if self.consume_first_order_trace() {
            return 0.0;
        }
        self.inner.trace_logdet_gradient(a)
    }
    fn trace_logdet_operator(&self, op: &dyn HyperOperator) -> f64 {
        if self.consume_first_order_trace() {
            return 0.0;
        }
        self.inner.trace_logdet_operator(op)
    }
    fn trace_logdet_h_k(
        &self,
        a_k: &Array2<f64>,
        third_deriv_correction: Option<&Array2<f64>>,
    ) -> f64 {
        if self.consume_first_order_trace() {
            return 0.0;
        }
        self.inner.trace_logdet_h_k(a_k, third_deriv_correction)
    }
    fn trace_logdet_h_k_operator(
        &self,
        b_k: &dyn HyperOperator,
        third_deriv_correction: Option<&Array2<f64>>,
    ) -> f64 {
        if self.consume_first_order_trace() {
            return 0.0;
        }
        self.inner
            .trace_logdet_h_k_operator(b_k, third_deriv_correction)
    }
    fn trace_logdet_block_local(
        &self,
        block: &Array2<f64>,
        scale: f64,
        start: usize,
        end: usize,
    ) -> f64 {
        if self.consume_first_order_trace() {
            return 0.0;
        }
        self.inner
            .trace_logdet_block_local(block, scale, start, end)
    }

    // ── Capability queries that are masked while any skip remains (the
    //    budget is only inspected here, never consumed). ──────────────────
    fn as_exact_dense_spectral(&self) -> Option<&DenseSpectralOperator> {
        if self.first_order_skip_active() {
            return None;
        }
        self.inner.as_exact_dense_spectral()
    }
    fn as_dense_spectral(&self) -> Option<&DenseSpectralOperator> {
        if self.first_order_skip_active() {
            return None;
        }
        self.inner.as_dense_spectral()
    }
    fn assemble_h_dense_for_tangent_projection(&self) -> Result<Array2<f64>, String> {
        if self.first_order_skip_active() {
            return Err("backend does not support tangent projection".to_string());
        }
        self.inner.assemble_h_dense_for_tangent_projection()
    }
    fn prefers_stochastic_trace_estimation(&self) -> bool {
        !self.first_order_skip_active() && self.inner.prefers_stochastic_trace_estimation()
    }

    // ── Everything below is forwarded to the wrapped operator verbatim. ───
    fn logdet(&self) -> f64 {
        self.inner.logdet()
    }
    fn active_rank(&self) -> usize {
        self.inner.active_rank()
    }
    fn dim(&self) -> usize {
        self.inner.dim()
    }
    fn is_dense(&self) -> bool {
        self.inner.is_dense()
    }
    fn logdet_traces_match_hinv_kernel(&self) -> bool {
        self.inner.logdet_traces_match_hinv_kernel()
    }
    fn has_matrix_free_trace_cg_operator(&self) -> bool {
        self.inner.has_matrix_free_trace_cg_operator()
    }
    fn solve(&self, rhs: &Array1<f64>) -> Array1<f64> {
        self.inner.solve(rhs)
    }
    fn solve_multi(&self, rhs: &Array2<f64>) -> Array2<f64> {
        self.inner.solve_multi(rhs)
    }
    fn stochastic_trace_solve(&self, rhs: &Array1<f64>, rel_tol: f64) -> Array1<f64> {
        self.inner.stochastic_trace_solve(rhs, rel_tol)
    }
    fn stochastic_trace_solve_for_probe(
        &self,
        rhs: &Array1<f64>,
        rel_tol: f64,
        probe_id: u64,
        trace_state: Option<&Arc<Mutex<StochasticTraceState>>>,
    ) -> Array1<f64> {
        self.inner
            .stochastic_trace_solve_for_probe(rhs, rel_tol, probe_id, trace_state)
    }
    fn stochastic_trace_solve_multi(&self, rhs: &Array2<f64>, rel_tol: f64) -> Array2<f64> {
        self.inner.stochastic_trace_solve_multi(rhs, rel_tol)
    }
    fn xt_logdet_kernel_x_diagonal(&self, x: &DesignMatrix) -> Array1<f64> {
        self.inner.xt_logdet_kernel_x_diagonal(x)
    }
    fn trace_hinv_product(&self, a: &Array2<f64>) -> f64 {
        self.inner.trace_hinv_product(a)
    }
    fn trace_hinv_operator(&self, op: &dyn HyperOperator) -> f64 {
        self.inner.trace_hinv_operator(op)
    }
    fn trace_hinv_h_k(
        &self,
        a_k: &Array2<f64>,
        third_deriv_correction: Option<&Array2<f64>>,
    ) -> f64 {
        self.inner.trace_hinv_h_k(a_k, third_deriv_correction)
    }
    fn trace_hinv_product_cross(&self, a: &Array2<f64>, b: &Array2<f64>) -> f64 {
        self.inner.trace_hinv_product_cross(a, b)
    }
    fn trace_hinv_matrix_operator_cross(
        &self,
        matrix: &Array2<f64>,
        op: &dyn HyperOperator,
    ) -> f64 {
        self.inner.trace_hinv_matrix_operator_cross(matrix, op)
    }
    fn trace_hinv_operator_cross(
        &self,
        left: &dyn HyperOperator,
        right: &dyn HyperOperator,
    ) -> f64 {
        self.inner.trace_hinv_operator_cross(left, right)
    }
    fn trace_hinv_block_local(
        &self,
        block: &Array2<f64>,
        scale: f64,
        start: usize,
        end: usize,
    ) -> f64 {
        self.inner.trace_hinv_block_local(block, scale, start, end)
    }
    fn trace_hinv_block_local_cross(
        &self,
        block: &Array2<f64>,
        scale: f64,
        start: usize,
        end: usize,
    ) -> f64 {
        self.inner
            .trace_hinv_block_local_cross(block, scale, start, end)
    }
    fn trace_logdet_hessian_cross(&self, h_i: &Array2<f64>, h_j: &Array2<f64>) -> f64 {
        self.inner.trace_logdet_hessian_cross(h_i, h_j)
    }
    fn trace_logdet_hessian_cross_matrix_operator(
        &self,
        h_i: &Array2<f64>,
        h_j: &dyn HyperOperator,
    ) -> f64 {
        self.inner
            .trace_logdet_hessian_cross_matrix_operator(h_i, h_j)
    }
    fn trace_logdet_hessian_cross_operator(
        &self,
        h_i: &dyn HyperOperator,
        h_j: &dyn HyperOperator,
    ) -> f64 {
        self.inner.trace_logdet_hessian_cross_operator(h_i, h_j)
    }
    fn trace_logdet_hessian_crosses(&self, matrices: &[&Array2<f64>]) -> Array2<f64> {
        self.inner.trace_logdet_hessian_crosses(matrices)
    }
}
/// Evaluate the unified REML/LAML criterion for a custom family's joint fit.
///
/// Bridges the custom family's joint-Hessian infrastructure to the unified
/// evaluator through the canonical assembly module: builds the assembly via
/// `build_custom_family_inner_assembly`, evaluates it at `rho`, and returns
/// `(cost, gradient, hessian)`.
///
/// When `first_order_trace_skip` carries a non-empty vector, the Hessian
/// operator is wrapped so that as many first-order logdet traces as the vector
/// has entries are answered with zero, and half of each supplied trace value is
/// handed to the evaluator as a gradient correction instead. If the evaluator
/// returns no gradient, a zero vector of length `rho.len() + ext_dim` is used.
///
/// Fails when assembly or evaluation fails, or when `rho` is not contiguous.
fn unified_joint_cost_gradient(
    inner: &BlockwiseInnerResult,
    specs: &[ParameterBlockSpec],
    per_block: &[Array1<f64>],
    rho: &Array1<f64>,
    beta_flat: &Array1<f64>,
    hessian_op: Arc<dyn crate::solver::estimate::reml::unified::HessianOperator>,
    ranges: &[(usize, usize)],
    total: usize,
    ridge: f64,
    rho_curvature_scale: f64,
    hessian_logdet_correction: f64,
    penalty_subspace_trace: Option<Arc<PenaltySubspaceTrace>>,
    include_logdet_h: bool,
    include_logdet_s: bool,
    options: &BlockwiseFitOptions,
    rho_prior: crate::types::RhoPrior,
    deriv_provider: Box<dyn HessianDerivativeProvider + '_>,
    eval_mode: EvalMode,
    ext_bundle: Option<ExtCoordBundle>,
    first_order_trace_skip: Option<Array1<f64>>,
    // Gated Tier-B Jeffreys value `Φ(β̂)`, folded into the LAML cost
    // (`cost −= Φ`) so the outer criterion matches the Φ-augmented inner
    // objective (gam#979). `None` when the term is unavailable/gated to zero.
    firth_value: Option<f64>,
) -> Result<
    (
        f64,
        Array1<f64>,
        crate::solver::outer_strategy::HessianResult,
    ),
    String,
> {
    // Number of first-order traces to suppress: one per supplied trace value.
    let skip_count = first_order_trace_skip
        .as_ref()
        .map_or(0, |trace_values| trace_values.len());
    let hessian_op: Arc<dyn HessianOperator> = if skip_count > 0 {
        Arc::new(FirstOrderTraceSkipOperator::new(hessian_op, skip_count))
    } else {
        hessian_op
    };

    let (evaluator, ext_dim) = build_custom_family_inner_assembly(
        inner,
        specs,
        per_block,
        beta_flat,
        hessian_op,
        ranges,
        total,
        ridge,
        rho_curvature_scale,
        hessian_logdet_correction,
        penalty_subspace_trace,
        include_logdet_h,
        include_logdet_s,
        options,
        rho_prior,
        deriv_provider,
        ext_bundle,
        firth_value,
    )?;

    let rho_slice = rho
        .as_slice()
        .ok_or_else(|| "outer rho vector must be contiguous".to_string())?;

    // The skipped traces re-enter as an explicit gradient correction of
    // ½·trace per coordinate (no cost shift, no Hessian correction).
    let first_order_trace_correction = first_order_trace_skip.map(|trace_values| {
        let gradient_correction = trace_values.mapv(|trace| 0.5 * trace);
        (0.0, gradient_correction, None)
    });

    let result = evaluator.evaluate(rho_slice, eval_mode, first_order_trace_correction)?;
    let gradient = match result.gradient {
        Some(gradient) => gradient,
        None => Array1::zeros(rho.len() + ext_dim),
    };
    Ok((result.cost, gradient, result.hessian))
}
/// Evaluate the EFS (fixed-point) outer update for a custom family's joint fit.
///
/// Builds the unified assembly without the Tier-B Firth fold, evaluates value
/// and gradient at `rho`, and derives the EFS steps from that gradient. When
/// any extended coordinate is not penalty-like, the hybrid update is used and
/// its ψ gradient / ψ indices are reported (as `None` when empty); otherwise
/// the plain EFS update is used and both are `None`.
///
/// Fails when assembly or evaluation fails, when `rho` or the gradient is not
/// contiguous, or when the evaluation returns no gradient.
fn unified_joint_efs_eval(
    inner: &BlockwiseInnerResult,
    specs: &[ParameterBlockSpec],
    per_block: &[Array1<f64>],
    rho: &Array1<f64>,
    beta_flat: &Array1<f64>,
    hessian_op: Arc<dyn crate::solver::estimate::reml::unified::HessianOperator>,
    ranges: &[(usize, usize)],
    total: usize,
    ridge: f64,
    rho_curvature_scale: f64,
    hessian_logdet_correction: f64,
    penalty_subspace_trace: Option<Arc<PenaltySubspaceTrace>>,
    include_logdet_h: bool,
    include_logdet_s: bool,
    options: &BlockwiseFitOptions,
    rho_prior: crate::types::RhoPrior,
    deriv_provider: Box<dyn HessianDerivativeProvider + '_>,
    ext_bundle: Option<ExtCoordBundle>,
) -> Result<crate::solver::outer_strategy::EfsEval, String> {
    use crate::estimate::reml::unified::{
        compute_efs_update, compute_hybrid_efs_update, hessian_operator_geometric_scale,
    };

    let assembly = build_custom_family_inner_assembly(
        inner,
        specs,
        per_block,
        beta_flat,
        hessian_op,
        ranges,
        total,
        ridge,
        rho_curvature_scale,
        hessian_logdet_correction,
        penalty_subspace_trace,
        include_logdet_h,
        include_logdet_s,
        options,
        rho_prior,
        deriv_provider,
        ext_bundle,
        // The EFS screening path evaluates the Φ-less criterion with an
        // unaugmented operator throughout; it stays self-consistent without
        // the Tier-B Firth fold.
        None,
    )?
    .0;
    let rho_slice = rho
        .as_slice()
        .ok_or_else(|| "outer rho vector must be contiguous".to_string())?;
    let inner_solution = assembly.build();
    let has_psi = inner_solution
        .ext_coords
        .iter()
        .any(|coord| !coord.is_penalty_like);

    // Always evaluate gradient: the universal-form EFS step
    // `Δρ = log(1 − 2·g_full / q_eff)` reads it directly from the cost
    // gradient slot, so out-of-band cost terms (TK, prior, Firth,
    // barrier, SAS log-δ ridge) shift the multiplicative target through
    // their gradient contribution without needing per-augmentation
    // post-corrections.
    let result = crate::estimate::reml::assembly::evaluate_solution(
        &inner_solution,
        rho_slice,
        EvalMode::ValueAndGradient,
        None,
    )?;
    let gradient_slice = result
        .gradient
        .as_ref()
        .ok_or_else(|| "EFS evaluation did not return the required gradient".to_string())?
        .as_slice()
        .ok_or_else(|| "outer gradient must be contiguous for EFS".to_string())?;

    // Common to both update flavours.
    let inner_hessian_scale = hessian_operator_geometric_scale(inner_solution.hessian_op.as_ref());

    let (steps, psi_gradient, psi_indices) = if has_psi {
        let hybrid = compute_hybrid_efs_update(&inner_solution, rho_slice, gradient_slice);
        // Empty ψ outputs are reported as absent rather than as empty containers.
        let psi_gradient = Some(hybrid.psi_gradient)
            .filter(|values| !values.is_empty())
            .map(Array1::from_vec);
        let psi_indices = Some(hybrid.psi_indices).filter(|indices| !indices.is_empty());
        (hybrid.steps, psi_gradient, psi_indices)
    } else {
        (
            compute_efs_update(&inner_solution, rho_slice, gradient_slice),
            None,
            None,
        )
    };

    Ok(crate::solver::outer_strategy::EfsEval {
        cost: result.cost,
        steps,
        beta: Some(inner_solution.beta.clone()),
        psi_gradient,
        psi_indices,
        inner_hessian_scale,
        logdet_enclosure_gap: None,
    })
}
/// Compute the pseudo-logdet of `M = H + Sλ (+ H_Φ)` and the matching spectral
/// trace kernel `M⁺`.
///
/// Returns `(0.0, None)` when `total == 0`, when the joint penalty has no
/// eigenvalue above the positive threshold, or when `M` has none. Otherwise
/// returns the pseudo-logdet over the kept eigenvalues of `M` together with a
/// `PenaltySubspaceTrace` whose basis holds the kept eigenvectors and whose
/// reduced kernel is `diag(1/σ)`.
///
/// Fails when either eigendecomposition fails, when the dense budget check
/// rejects `total`, or when the joint Hessian cannot be materialized.
fn joint_penalty_subspace_trace_parts(
    h_joint_unpen: &JointHessianSource,
    ranges: &[(usize, usize)],
    s_lambdas: &[Array2<f64>],
    total: usize,
    hessian_diagonal_ridge: f64,
    // Pre-scaled outer-REML Jeffreys curvature (already multiplied by
    // `rho_curvature_scale` to live in the same scaled space as `s_lambdas`).
    // Folded into `M = H + Sλ (+ H_Φ)` so the projected logdet AND its trace
    // kernel `(H+Sλ+H_Φ)⁺` match the Jeffreys-augmented operator the LAML score
    // runs on. `None` ⇒ byte-identical released projected logdet.
    scaled_jeffreys_hphi: Option<&Array2<f64>>,
) -> Result<(f64, Option<PenaltySubspaceTrace>), String> {
    if total == 0 {
        return Ok((0.0, None));
    }

    // Structural-null gate: with no positive penalty eigenvalue there is no
    // `log|Sλ|₊` term in the LAML ratio, hence no Hessian-side correction to
    // pair with it — the caller keeps the operator's own logdet untouched.
    // (The kernel itself no longer uses the Sλ eigenvectors: since #901 it is
    // the full spectral `M⁺`, built from M's own eigendecomposition below.)
    let mut s_lambda = Array2::<f64>::zeros((total, total));
    add_joint_penalty_to_matrix(&mut s_lambda, ranges, s_lambdas, 0.0, None);
    let (s_evals, _) = s_lambda
        .eigh(Side::Lower)
        .map_err(|e| format!("joint penalty subspace eigendecomposition failed: {e}"))?;
    let s_threshold = positive_eigenvalue_threshold(s_evals.as_slice().unwrap());
    let penalty_has_positive_direction = s_evals.iter().any(|&value| value > s_threshold);
    if !penalty_has_positive_direction {
        return Ok((0.0, None));
    }

    // ── REML log|H + Sλ|₊ and its trace kernel over the FULL identifiable
    //    subspace range(H + Sλ) ──────────────────────────────────────────────
    //
    // The REML penalty-determinant term is `½ log|H + Sλ|₊`, and its ρ-gradient
    // is the trace `½ tr((H + Sλ)⁻¹ ∂Sλ/∂ρ)`. BOTH must be taken over
    // range(H + Sλ) — the full identifiable subspace — not over range(Sλ).
    //
    // The previous code projected onto range(Sλ): it computed
    // `log|U_Sᵀ(H+Sλ)U_S| = log|M_rr|` and the kernel `M_rr⁻¹`. That DROPS the
    // determinant of the penalty-null block `M_kk = U_kᵀ H U_k` (on ker(Sλ), Sλ
    // vanishes, so this is pure likelihood curvature) and the Schur coupling
    // between the two. `M_kk` is the unpenalized polynomial trend; on a
    // near-collinear design (admixture-cline PCs at small n) its curvature is
    // large and GROWS as the smooth part is shrunk. Omitting it from
    // `log|H+Sλ|` while `½ log|Sλ|₊` is correctly taken over range(Sλ) makes
    // the ρ-derivative of the REML criterion inconsistent in the marginal
    // block: the outer optimizer drives that block's λ → ∞ chasing a
    // flat-increasing profile (gh#752), the coupled inner joint-Newton can no
    // longer certify stationarity on the now-ill-conditioned trend, and the
    // envelope-theorem outer gradient — valid only at a stationary β̂ — diverges
    // on the coupled (logslope) block while the objective stalls, so ARC never
    // reaches a KKT point.
    //
    // The correct generalized determinant (mgcv's treatment) takes both terms
    // over range(H + Sλ): identical to the ordinary log-det / inverse when
    // H + Sλ is non-singular (the well-posed case), and dropping only the truly
    // unidentified directions ker(H) ∩ ker(Sλ) when it is singular — exactly the
    // directions `½ log|Sλ|₊` also omits, keeping value and gradient consistent.
    //
    // To preserve value/gradient consistency the trace kernel must be the
    // FULL pseudo-inverse `M⁺ = (H+Sλ)⁺` itself, carried in spectral form
    // `(U_M, diag(1/σ_a))` over the kept eigenpairs (#901; supersedes the
    // intermediate #752 realization that reduced `M⁺` to its range(Sλ)
    // block). For penalty-supported drifts `∂Sλ/∂ρ` the two coincide:
    //   tr(M⁺ ∂Sλ) = tr(U_Sᵀ M⁺ U_S · U_Sᵀ ∂Sλ U_S) = ∂_ρ log|H+Sλ|₊.
    // But the joint adaptive/ψ hyper-coordinates trace drifts with
    // null(Sλ) support (basis κ-derivatives, the GLM cubic correction
    // `D_β H[v]` through the intercept column), for which the range(Sλ)
    // reduction silently discards the leaked component while the FD of
    // `log|M|₊` keeps it. `tr(M⁺ Ḣ)` is the exact pseudo-logdet derivative
    // for EVERY drift on a constant-rank stratum (first-order eigenvector
    // motion cancels), so one spectral object serves the whole θ-vector.
    // Value and kernel come from the same eigendecomposition of the same
    // materialized `M` so they cannot drift apart.
    //
    // The #752 fix requires the full identifiable-subspace determinant. There
    // is no lower-dimensional fallback that preserves that objective: the old
    // range(Sλ) reduction is exactly the bug, because it drops the penalty-null
    // likelihood determinant. If the dense path is over budget, fail loudly so
    // the caller can choose a different Hessian representation instead of
    // optimizing a different REML surface.
    ensure_exact_joint_hessian_dense_budget(total, "joint penalty subspace logdet")?;
    let mut m =
        materialize_joint_hessian_source(h_joint_unpen, total, "joint penalty subspace logdet")?;
    add_joint_penalty_to_matrix(&mut m, ranges, s_lambdas, hessian_diagonal_ridge, None);
    if let Some(hphi) = scaled_jeffreys_hphi {
        m += hphi;
    }
    symmetrize_dense_in_place(&mut m);

    let (m_evals, m_evecs) = m.eigh(Side::Lower).map_err(|e| {
        format!("joint penalty subspace full Hessian eigendecomposition failed: {e}")
    })?;
    let m_threshold = positive_eigenvalue_threshold(m_evals.as_slice().unwrap());
    let logdet = exact_pseudo_logdet(m_evals.as_slice().unwrap(), m_threshold);

    // Full Moore–Penrose pseudo-inverse `M⁺` (drop ker(H+Sλ)) in spectral
    // form: kept eigenvectors as the kernel basis, diag(1/σ) as the reduced
    // kernel. In this basis `h_proj_inverse = (U_Mᵀ M U_M)⁻¹ = diag(1/σ)`
    // exactly, so every `PenaltySubspaceTrace` consumer evaluates the one
    // true `tr(M⁺ ·)` / `M⁺`-bilinear — exact for penalty-supported AND
    // null(Sλ)-leaking drifts alike (#901).
    let kept: Vec<usize> = m_evals
        .iter()
        .enumerate()
        .filter(|&(_, &value)| value > m_threshold)
        .map(|(eig_idx, _)| eig_idx)
        .collect();
    if kept.is_empty() {
        return Ok((0.0, None));
    }

    let r_kept = kept.len();
    let mut u_m = Array2::<f64>::zeros((total, r_kept));
    let mut h_proj_inverse = Array2::<f64>::zeros((r_kept, r_kept));
    for (out_col, &src_col) in kept.iter().enumerate() {
        // Copy the kept eigenvector into the basis and record 1/σ on the diagonal.
        u_m.column_mut(out_col).assign(&m_evecs.column(src_col));
        h_proj_inverse[[out_col, out_col]] = 1.0 / m_evals[src_col];
    }

    let trace = PenaltySubspaceTrace {
        u_s: u_m,
        h_proj_inverse,
    };
    Ok((logdet, Some(trace)))
}
/// Shared implementation for the joint exact-Newton and surrogate outer paths.
///
/// Both paths differ only in:
/// - how the joint Hessian source is obtained (exact vs surrogate family methods)
/// - the closure for computing D_β H_L[v] (`compute_dh`)
/// - the closure for computing D²_β H_L[u, v] (`compute_d2h`)
/// - whether a tangent-basis projection is applied to the mode inverse
///
/// This function encapsulates all shared logic: penalty assembly, mode inverse
/// computation, precomputation of joint corrections + second-order traces, and
/// routing through `unified_joint_cost_gradient`.
fn joint_outer_evaluate(
inner: &BlockwiseInnerResult,
specs: &[ParameterBlockSpec],
per_block: &[Array1<f64>],
rho: &Array1<f64>,
beta_flat: &Array1<f64>,
h_joint_unpen: JointHessianSource,
ranges: &[(usize, usize)],
total: usize,
ridge: f64,
moderidge: f64,
extra_logdet_ridge: f64,
rho_curvature_scale: f64,
hessian_logdet_correction: f64,
include_logdet_h: bool,
include_logdet_s: bool,
strict_spd: bool,
project_hessian_logdet: bool,
eval_mode: EvalMode,
options: &BlockwiseFitOptions,
rho_prior: crate::types::RhoPrior,
pseudo_logdet_mode: PseudoLogdetMode,
compute_dh: &DriftDerivFn<'_>,
compute_dh_many: Option<&DriftDerivManyFn<'_>>,
compute_d2h: &DriftSecondDerivFn<'_>,
compute_d2h_many: Option<&DriftSecondDerivManyFn<'_>>,
owned_compute_dh: Option<
Arc<dyn Fn(&Array1<f64>) -> Result<Option<DriftDerivResult>, String> + Send + Sync>,
>,
owned_compute_dh_many: Option<
Arc<dyn Fn(&[Array1<f64>]) -> Result<Vec<Option<DriftDerivResult>>, String> + Send + Sync>,
>,
owned_compute_d2h: Option<
Arc<
dyn Fn(&Array1<f64>, &Array1<f64>) -> Result<Option<DriftDerivResult>, String>
+ Send
+ Sync,
>,
>,
owned_compute_d2h_many: Option<
Arc<
dyn Fn(&[(Array1<f64>, Array1<f64>)]) -> Result<Vec<Option<DriftDerivResult>>, String>
+ Send
+ Sync,
>,
>,
ext_bundle: Option<ExtCoordBundle>,
first_order_trace_skip: Option<Array1<f64>>,
batched_outer_hessian_operator: Option<
Arc<dyn crate::solver::outer_strategy::OuterHessianOperator>,
>,
// Universal under-identification robustness (always armed when the family can
// expose an exact joint Hessian). The
// outer REML logdet AND its trace derivatives must run on the same
// Jeffreys-augmented Hessian `H + S_λ + H_Φ` the inner Newton converged on,
// or the LAML value and its analytic gradient describe different objectives.
// Folding `H_Φ` into the operator's matvec augments the inverse/logdet, but is
// NOT by itself sufficient: `H_Φ` depends on ρ THROUGH β̂, so the trace
// contraction also needs its mode-response drift `D_β H_Φ[v_k]` — supplied
// separately via `jeffreys_hphi_drift` and folded into the first-order trace
// by `JeffreysHphiAwareJointDerivatives`. `None` means this evaluation has
// no active Jeffreys curvature (empty system, unavailable exact derivatives,
// or the conditioning gate proved the term zero), not a user-selected
// robustness-off mode.
// Gated Jeffreys VALUE `Φ(β̂)` paired with the curvature `H_Φ` from the same
// term evaluation. The value is folded into the LAML cost (`cost −= Φ`) so
// the outer criterion is the Laplace approximation of the SAME
// Firth-augmented objective the inner Newton converged on (gam#979).
robust_jeffreys_phi_hphi: Option<(f64, Array2<f64>)>,
// Companion mode-response drift `D_β H_Φ[δβ]` for the outer gradient's trace
// identity. `Some` exactly when `robust_jeffreys_phi_hphi` is `Some` (same
// under-identified span); installing it wraps the derivative provider so the
// first-order trace gains the `½ tr[(H+S_λ+H_Φ)⁻¹ D_β H_Φ[v_k]]` term that
// makes the analytic gradient match the augmented objective. `None` ⇒ the
// provider is used unwrapped.
jeffreys_hphi_drift: Option<JeffreysHphiDriftFn>,
) -> Result<OuterObjectiveEvalResult, String> {
let joint_trace_diagonal_ridge = moderidge + if !strict_spd { extra_logdet_ridge } else { 0.0 };
let scaled_joint_trace_diagonal_ridge = rho_curvature_scale * joint_trace_diagonal_ridge;
let (robust_jeffreys_phi, robust_jeffreys_hphi): (Option<f64>, Option<Array2<f64>>) =
match robust_jeffreys_phi_hphi {
Some((phi, hphi)) => (Some(phi), Some(hphi)),
None => (None, None),
};
// Pre-scale the outer-REML Jeffreys curvature into the same rescaled space as
// the penalties so the projected-logdet path and the operator agree. `None`
// (flag OFF / no under-identified span) keeps the released outer REML exact.
let scaled_robust_jeffreys_hphi: Option<Array2<f64>> = robust_jeffreys_hphi
.as_ref()
.map(|hphi| hphi.mapv(|value| rho_curvature_scale * value));
// Build derivative provider from the caller-supplied closures.
let base_provider_box: Box<dyn HessianDerivativeProvider + '_> =
if let (Some(owned_dh), Some(owned_d2h)) = (owned_compute_dh, owned_compute_d2h) {
Box::new(OwnedJointDerivProvider {
compute_dh: owned_dh,
compute_dh_many: owned_compute_dh_many,
compute_d2h: owned_d2h,
compute_d2h_many: owned_compute_d2h_many,
family_outer_hessian_operator: batched_outer_hessian_operator.clone(),
})
} else {
Box::new(BorrowedJointDerivProvider {
compute_dh,
compute_dh_many,
compute_d2h,
compute_d2h_many,
family_outer_hessian_operator: batched_outer_hessian_operator.clone(),
})
};
// Install the Jeffreys-`H_Φ` mode-response drift on top of the likelihood
// drift whenever the Jeffreys term is active. This is the term that makes the
// analytic outer gradient match the augmented objective `½ log|H+S_λ+H_Φ|`;
// without it the gradient omits `D_β H_Φ[v_k]` and the line search / KKT
// certification drifts in exactly the near-separating regime this machinery
// exists for. `None` ⇒ provider used unwrapped (byte-identical released path).
let provider_box: Box<dyn HessianDerivativeProvider + '_> = match jeffreys_hphi_drift {
Some(drift) => Box::new(JeffreysHphiAwareJointDerivatives::new(
base_provider_box,
drift,
total,
)),
None => base_provider_box,
};
let scaled_s_lambdas: Vec<Array2<f64>> = inner
.s_lambdas
.iter()
.map(|matrix| {
if rho_curvature_scale == 1.0 {
matrix.clone()
} else {
matrix.mapv(|value| rho_curvature_scale * value)
}
})
.collect();
let hessian_op: Arc<dyn crate::solver::estimate::reml::unified::HessianOperator> =
if use_joint_matrix_free_path(total, joint_observation_count(&inner.block_states)) {
let ranges_vec = ranges.to_vec();
let s_lambdas = Arc::new(scaled_s_lambdas.clone());
let trace_diagonal_ridge = scaled_joint_trace_diagonal_ridge
+ rho_curvature_scale * JOINT_TRACE_STABILITY_RIDGE;
match &h_joint_unpen {
JointHessianSource::Dense(h_joint) => {
let h_joint = Arc::new(h_joint.clone());
let apply_h = Arc::clone(&h_joint);
let apply_ranges = ranges_vec.clone();
let apply_s = Arc::clone(&s_lambdas);
let apply_hphi = robust_jeffreys_hphi.clone();
let hphi_scale = rho_curvature_scale;
Arc::new(MatrixFreeSpdOperator::new_with_mode(
total,
move |v| {
let mut out = apply_h.dot(v);
let penalty = apply_joint_block_penalty(
&apply_ranges,
apply_s.as_ref(),
v,
trace_diagonal_ridge,
None,
);
out += &penalty;
if let Some(hphi) = apply_hphi.as_ref() {
let jeffreys = hphi.dot(v);
out.scaled_add(hphi_scale, &jeffreys);
}
out
},
pseudo_logdet_mode,
))
}
JointHessianSource::Operator { apply, .. } => {
let apply_h = Arc::clone(apply);
let apply_ranges = ranges_vec.clone();
let apply_s = Arc::clone(&s_lambdas);
let apply_hphi = robust_jeffreys_hphi.clone();
let hphi_scale = rho_curvature_scale;
Arc::new(MatrixFreeSpdOperator::new_with_mode(
total,
move |v| {
let mut out = match apply_h(v) {
Ok(out) => out,
Err(error) => {
log::warn!(
"joint exact-newton operator matvec failed during outer trace construction: {error}"
);
Array1::<f64>::from_elem(total, f64::NAN)
}
};
let penalty = apply_joint_block_penalty(
&apply_ranges,
apply_s.as_ref(),
v,
trace_diagonal_ridge,
None,
);
out += &penalty;
if let Some(hphi) = apply_hphi.as_ref() {
let jeffreys = hphi.dot(v);
out.scaled_add(hphi_scale, &jeffreys);
}
out
},
pseudo_logdet_mode,
))
}
}
} else {
let mut j_for_traces = materialize_joint_hessian_source(
&h_joint_unpen,
total,
"joint exact-newton Hessian materialization",
)?;
add_joint_penalty_to_matrix(
&mut j_for_traces,
ranges,
&scaled_s_lambdas,
scaled_joint_trace_diagonal_ridge,
None,
);
if let Some(hphi) = robust_jeffreys_hphi.as_ref() {
j_for_traces.scaled_add(rho_curvature_scale, hphi);
}
Arc::new(
BlockCoupledOperator::from_joint_hessian_with_mode(
&j_for_traces,
pseudo_logdet_mode,
)
.map_err(|e| format!("BlockCoupledOperator from joint Hessian: {e}"))?,
)
};
let (projected_logdet_correction, penalty_subspace_trace) = if project_hessian_logdet
&& include_logdet_h
&& include_logdet_s
&& pseudo_logdet_mode == PseudoLogdetMode::Smooth
{
let (projected_logdet, kernel) = joint_penalty_subspace_trace_parts(
&h_joint_unpen,
ranges,
&scaled_s_lambdas,
total,
scaled_joint_trace_diagonal_ridge,
scaled_robust_jeffreys_hphi.as_ref(),
)?;
let correction = projected_logdet - hessian_op.logdet();
if kernel.is_some() {
log::debug!(
"[OUTER hessian-route] joint penalty subspace trace installed correction={:.6e}",
correction
);
}
(correction, kernel.map(Arc::new))
} else {
(0.0, None)
};
let hessian_logdet_correction = hessian_logdet_correction + projected_logdet_correction;
let expected_theta_dim = rho.len()
+ ext_bundle
.as_ref()
.map(|bundle| bundle.coords.len())
.unwrap_or(0);
let has_penalty_subspace_trace = penalty_subspace_trace.is_some();
// Option C: when the caller already has the batched first-order
// logdet traces, let the unified VGH path keep all mode-response,
// second-order, and Hessian work, but short-circuit only the
// soon-discarded first-order trace calls. The projected-subspace
// trace path is left untouched because the Hessian shares that
// kernel and it is not routed through HessianOperator trace methods.
// Bind the gating flag before `penalty_subspace_trace` is consumed by
// the call below so the trace-skip choice does not depend on a moved
// value (was: `if penalty_subspace_trace.is_none()` evaluated AFTER
// the trace had already been forwarded to `unified_joint_cost_gradient`).
let first_order_trace_skip = if penalty_subspace_trace.is_none() {
first_order_trace_skip
} else {
None
};
let (objective, grad, outer_hessian) = unified_joint_cost_gradient(
inner,
specs,
per_block,
rho,
beta_flat,
hessian_op,
ranges,
total,
ridge,
rho_curvature_scale,
hessian_logdet_correction,
penalty_subspace_trace,
include_logdet_h,
include_logdet_s,
options,
rho_prior,
provider_box,
eval_mode,
ext_bundle.map(|bundle| bundle.scaled(rho_curvature_scale)),
// Option C: when the caller already has the batched first-order
// logdet traces, let the unified VGH path keep all mode-response,
// second-order, and Hessian work, but short-circuit only the
// soon-discarded first-order trace calls. The projected-subspace
// trace path is left untouched because the Hessian shares that
// kernel and it is not routed through HessianOperator trace methods.
if has_penalty_subspace_trace {
None
} else {
first_order_trace_skip
},
robust_jeffreys_phi,
)?;
if !objective.is_finite() {
log::warn!(
"joint outer evaluation produced non-finite objective: log_likelihood={} penalty_value={} block_logdet_h={} block_logdet_s={} include_logdet_h={} include_logdet_s={} rho_curvature_scale={}",
inner.log_likelihood,
inner.penalty_value,
inner.block_logdet_h,
inner.block_logdet_s,
include_logdet_h,
include_logdet_s,
rho_curvature_scale,
);
return Err(CustomFamilyError::NumericalFailure {
reason: "joint outer evaluation produced a non-finite objective".to_string(),
}
.into());
}
if grad.iter().any(|value| !value.is_finite()) {
return Err(CustomFamilyError::NumericalFailure {
reason: "joint outer evaluation produced a non-finite gradient".to_string(),
}
.into());
}
if grad.len() != expected_theta_dim {
return Err(CustomFamilyError::DimensionMismatch {
reason: format!(
"joint outer evaluation returned gradient length {}, expected {}",
grad.len(),
expected_theta_dim
),
}
.into());
}
match &outer_hessian {
crate::solver::outer_strategy::HessianResult::Analytic(hessian) => {
if hessian.iter().any(|value| !value.is_finite()) {
return Err(CustomFamilyError::NumericalFailure {
reason: "joint outer evaluation produced a non-finite Hessian".to_string(),
}
.into());
}
if hessian.nrows() != expected_theta_dim || hessian.ncols() != expected_theta_dim {
return Err(CustomFamilyError::DimensionMismatch {
reason: format!(
"joint outer evaluation returned Hessian shape {}x{}, expected {}x{}",
hessian.nrows(),
hessian.ncols(),
expected_theta_dim,
expected_theta_dim
),
}
.into());
}
}
crate::solver::outer_strategy::HessianResult::Operator(op) => {
if op.dim() != expected_theta_dim {
return Err(format!(
"joint outer evaluation returned operator Hessian dim {}, expected {}",
op.dim(),
expected_theta_dim
));
}
}
crate::solver::outer_strategy::HessianResult::Unavailable => {}
}
let warm = ConstrainedWarmStart {
rho: rho.clone(),
block_beta: inner
.block_states
.iter()
.map(|st| st.beta.clone())
.collect(),
active_sets: inner.active_sets.clone(),
cached_inner: Some(cached_inner_mode_from_result(inner)),
};
Ok(OuterObjectiveEvalResult {
objective,
gradient: grad,
outer_hessian,
warm_start: warm,
inner_converged: inner.converged,
})
}
/// Fixed-point (EFS) variant of the joint outer evaluation.
///
/// Assembles the penalized joint Hessian operator from `h_joint_unpen` and the
/// `rho_curvature_scale`-scaled block penalties, optionally derives the
/// penalty-subspace logdet correction, and forwards everything to
/// `unified_joint_efs_eval`.
///
/// The derivative provider is backed by the owned closures when both
/// `owned_compute_dh` and `owned_compute_d2h` are present; otherwise the
/// borrowed `compute_dh` / `compute_d2h` closures are used.
fn joint_outer_evaluate_efs(
    inner: &BlockwiseInnerResult,
    specs: &[ParameterBlockSpec],
    per_block: &[Array1<f64>],
    rho: &Array1<f64>,
    beta_flat: &Array1<f64>,
    h_joint_unpen: JointHessianSource,
    ranges: &[(usize, usize)],
    total: usize,
    ridge: f64,
    moderidge: f64,
    extra_logdet_ridge: f64,
    rho_curvature_scale: f64,
    hessian_logdet_correction: f64,
    include_logdet_h: bool,
    include_logdet_s: bool,
    strict_spd: bool,
    project_hessian_logdet: bool,
    options: &BlockwiseFitOptions,
    rho_prior: crate::types::RhoPrior,
    pseudo_logdet_mode: PseudoLogdetMode,
    compute_dh: &DriftDerivFn<'_>,
    compute_dh_many: Option<&DriftDerivManyFn<'_>>,
    compute_d2h: &DriftSecondDerivFn<'_>,
    compute_d2h_many: Option<&DriftSecondDerivManyFn<'_>>,
    owned_compute_dh: Option<
        Arc<dyn Fn(&Array1<f64>) -> Result<Option<DriftDerivResult>, String> + Send + Sync>,
    >,
    owned_compute_dh_many: Option<
        Arc<dyn Fn(&[Array1<f64>]) -> Result<Vec<Option<DriftDerivResult>>, String> + Send + Sync>,
    >,
    owned_compute_d2h: Option<
        Arc<
            dyn Fn(&Array1<f64>, &Array1<f64>) -> Result<Option<DriftDerivResult>, String>
                + Send
                + Sync,
        >,
    >,
    owned_compute_d2h_many: Option<
        Arc<
            dyn Fn(&[(Array1<f64>, Array1<f64>)]) -> Result<Vec<Option<DriftDerivResult>>, String>
                + Send
                + Sync,
        >,
    >,
    ext_bundle: Option<ExtCoordBundle>,
) -> Result<crate::solver::outer_strategy::EfsEval, String> {
    // Strict-SPD mode drops the logdet-only ridge from the trace diagonal.
    let logdet_only_ridge = if strict_spd { 0.0 } else { extra_logdet_ridge };
    let scaled_diag_ridge = rho_curvature_scale * (moderidge + logdet_only_ridge);

    // Owned closures win when both first- and second-order ones are present.
    let derivative_provider: Box<dyn HessianDerivativeProvider + '_> =
        match (owned_compute_dh, owned_compute_d2h) {
            (Some(dh_owned), Some(d2h_owned)) => Box::new(OwnedJointDerivProvider {
                compute_dh: dh_owned,
                compute_dh_many: owned_compute_dh_many,
                compute_d2h: d2h_owned,
                compute_d2h_many: owned_compute_d2h_many,
                family_outer_hessian_operator: None,
            }),
            _ => Box::new(BorrowedJointDerivProvider {
                compute_dh,
                compute_dh_many,
                compute_d2h,
                compute_d2h_many,
                family_outer_hessian_operator: None,
            }),
        };

    // Block penalties expressed in the rescaled curvature space.
    let unit_scale = rho_curvature_scale == 1.0;
    let scaled_penalties: Vec<Array2<f64>> = inner
        .s_lambdas
        .iter()
        .map(|penalty| {
            if unit_scale {
                penalty.clone()
            } else {
                penalty.mapv(|entry| rho_curvature_scale * entry)
            }
        })
        .collect();

    let matrix_free =
        use_joint_matrix_free_path(total, joint_observation_count(&inner.block_states));
    let hessian_op: Arc<dyn crate::solver::estimate::reml::unified::HessianOperator> =
        if matrix_free {
            let block_ranges = ranges.to_vec();
            let shared_penalties = Arc::new(scaled_penalties.clone());
            // Matrix-free matvecs add the stability ridge to the trace ridge.
            let matvec_ridge =
                scaled_diag_ridge + rho_curvature_scale * JOINT_TRACE_STABILITY_RIDGE;
            match &h_joint_unpen {
                JointHessianSource::Dense(h_dense) => {
                    let dense_h = Arc::new(h_dense.clone());
                    Arc::new(MatrixFreeSpdOperator::new_with_mode(
                        total,
                        move |v| {
                            let mut hv = dense_h.dot(v);
                            let penalty_v = apply_joint_block_penalty(
                                &block_ranges,
                                shared_penalties.as_ref(),
                                v,
                                matvec_ridge,
                                None,
                            );
                            hv += &penalty_v;
                            hv
                        },
                        pseudo_logdet_mode,
                    ))
                }
                JointHessianSource::Operator { apply, .. } => {
                    let family_matvec = Arc::clone(apply);
                    Arc::new(MatrixFreeSpdOperator::new_with_mode(
                        total,
                        move |v| {
                            // A failing family matvec is logged and replaced
                            // by an all-NaN vector.
                            let mut hv = match family_matvec(v) {
                                Ok(product) => product,
                                Err(error) => {
                                    log::warn!(
                                        "joint exact-newton operator matvec failed during fixed-point trace construction: {error}"
                                    );
                                    Array1::<f64>::from_elem(total, f64::NAN)
                                }
                            };
                            let penalty_v = apply_joint_block_penalty(
                                &block_ranges,
                                shared_penalties.as_ref(),
                                v,
                                matvec_ridge,
                                None,
                            );
                            hv += &penalty_v;
                            hv
                        },
                        pseudo_logdet_mode,
                    ))
                }
            }
        } else {
            // Dense route: materialize H, add penalties + ridge, then factor.
            let mut penalized = materialize_joint_hessian_source(
                &h_joint_unpen,
                total,
                "joint exact-newton Hessian materialization for fixed-point evaluation",
            )?;
            add_joint_penalty_to_matrix(
                &mut penalized,
                ranges,
                &scaled_penalties,
                scaled_diag_ridge,
                None,
            );
            let dense_op = BlockCoupledOperator::from_joint_hessian_with_mode(
                &penalized,
                pseudo_logdet_mode,
            )
            .map_err(|e| format!("BlockCoupledOperator from joint Hessian: {e}"))?;
            Arc::new(dense_op)
        };

    // The projected logdet is only meaningful when both logdet terms are
    // present and the pseudo-logdet runs in Smooth mode.
    let wants_projection = project_hessian_logdet
        && include_logdet_h
        && include_logdet_s
        && pseudo_logdet_mode == PseudoLogdetMode::Smooth;
    let (projection_delta, subspace_trace) = if wants_projection {
        let (projected_logdet, kernel) = joint_penalty_subspace_trace_parts(
            &h_joint_unpen,
            ranges,
            &scaled_penalties,
            total,
            scaled_diag_ridge,
            None,
        )?;
        let delta = projected_logdet - hessian_op.logdet();
        if kernel.is_some() {
            log::debug!(
                "[OUTER hessian-route] joint EFS penalty subspace trace installed correction={:.6e}",
                delta
            );
        }
        (delta, kernel.map(Arc::new))
    } else {
        (0.0, None)
    };
    let total_logdet_correction = hessian_logdet_correction + projection_delta;
    let scaled_ext = ext_bundle.map(|bundle| bundle.scaled(rho_curvature_scale));

    unified_joint_efs_eval(
        inner,
        specs,
        per_block,
        rho,
        beta_flat,
        hessian_op,
        ranges,
        total,
        ridge,
        rho_curvature_scale,
        total_logdet_correction,
        subspace_trace,
        include_logdet_h,
        include_logdet_s,
        options,
        rho_prior,
        derivative_provider,
        scaled_ext,
    )
}
/// Rho-only entry point into the unified joint hyper evaluation.
///
/// Supplies one empty ψ-derivative list per block, so no external ψ
/// coordinates are attached, and converts the resulting error into a `String`.
fn outerobjectivegradienthessian_internal<F: CustomFamily + Clone + Send + Sync + 'static>(
    family: &F,
    specs: &[ParameterBlockSpec],
    options: &BlockwiseFitOptions,
    penalty_counts: &[usize],
    rho: &Array1<f64>,
    warm_start: Option<&ConstrainedWarmStart>,
    rho_prior: crate::types::RhoPrior,
    eval_mode: EvalMode,
) -> Result<OuterObjectiveEvalResult, String> {
    // One empty derivative list for every parameter block.
    let no_psi_derivatives: Vec<Vec<CustomFamilyBlockPsiDerivative>> =
        specs.iter().map(|_| Vec::new()).collect();
    let outcome = evaluate_custom_family_hyper_internal(
        family,
        specs,
        options,
        penalty_counts,
        rho,
        &no_psi_derivatives,
        warm_start,
        rho_prior,
        eval_mode,
    );
    outcome.map_err(String::from)
}
/// Evaluate the fixed-point (EFS) outer update for a custom family at `rho`.
///
/// Runs the inner blockwise fit first. If it did not converge, the result of
/// `nonconverged_outer_efs_result` is returned without any derivative
/// assembly. Otherwise the block etas are refreshed and the evaluation is
/// delegated to `joint_outer_evaluate_efs`, using either
/// - the joint bundle from `build_joint_hessian_closures`, when one is
///   returned, or
/// - a single-block fallback that builds the unpenalized Hessian and its
///   first/second directional derivatives from the block working set.
///
/// Returns the EFS evaluation, a warm start built from the inner state, and
/// the inner convergence flag.
///
/// # Errors
///
/// Besides propagating failures of the called helpers, the fallback rejects
/// families that require the joint outer hyper path and any multi-block
/// specification.
fn outerobjectiveefs<F: CustomFamily + Clone + Send + Sync + 'static>(
family: &F,
specs: &[ParameterBlockSpec],
options: &BlockwiseFitOptions,
penalty_counts: &[usize],
rho: &Array1<f64>,
warm_start: Option<&ConstrainedWarmStart>,
rho_prior: crate::types::RhoPrior,
) -> Result<
(
crate::solver::outer_strategy::EfsEval,
ConstrainedWarmStart,
bool,
),
String,
> {
let include_logdet_h = include_exact_newton_logdet_h(family, options);
let include_logdet_s = include_exact_newton_logdet_s(family, options);
let strict_spd = use_exact_newton_strict_spd(family);
let per_block = split_log_lambdas(rho, penalty_counts)?;
let mut inner = inner_blockwise_fit(family, specs, &per_block, options, warm_start)?;
// A non-converged inner solve skips derivative assembly entirely.
if !inner.converged {
log::warn!(
"[OUTER] custom-family EFS inner solve did not converge after {} cycle(s); \
skipping EFS derivative assembly for theta_dim={}",
inner.cycles,
rho.len(),
);
return nonconverged_outer_efs_result(
&inner,
rho,
rho.len(),
include_logdet_h,
include_logdet_s,
"custom-family EFS non-converged inner solve",
);
}
let ridge = effective_solverridge(options.ridge_floor);
// The ridge enters the mode Hessian only when the ridge policy includes it
// in the quadratic penalty.
let moderidge = if options.ridge_policy.include_quadratic_penalty {
ridge
} else {
0.0
};
// When the policy includes the ridge in the penalty logdet but not in the
// quadratic penalty, it is carried as a separate logdet-only ridge.
let extra_logdet_ridge = if options.ridge_policy.include_penalty_logdet
&& !options.ridge_policy.include_quadratic_penalty
{
ridge
} else {
0.0
};
refresh_all_block_etas(family, specs, &mut inner.block_states)?;
let ranges = block_param_ranges(specs);
// Total coefficient count is the end of the last block range (0 if none).
let total = ranges.last().map(|(_, end)| *end).unwrap_or(0);
let efs_eval = {
// Preferred route: the family-provided joint Hessian bundle.
if let Some(joint_bundle) = build_joint_hessian_closures(
family,
&inner.block_states,
specs,
total,
options,
inner.joint_workspace.clone(),
)? {
let JointHessianBundle {
source: h_joint_unpen,
beta_flat,
compute_dh,
compute_dh_many,
compute_d2h,
compute_d2h_many,
owned_compute_dh,
owned_compute_dh_many,
owned_compute_d2h,
owned_compute_d2h_many,
rho_curvature_scale,
hessian_logdet_correction,
} = joint_bundle;
joint_outer_evaluate_efs(
&inner,
specs,
&per_block,
rho,
&beta_flat,
h_joint_unpen,
&ranges,
total,
ridge,
moderidge,
extra_logdet_ridge,
rho_curvature_scale,
hessian_logdet_correction,
include_logdet_h,
include_logdet_s,
strict_spd,
family.use_projected_penalty_logdet(),
options,
rho_prior.clone(),
family.pseudo_logdet_mode(),
compute_dh.as_ref(),
compute_dh_many.as_deref(),
compute_d2h.as_ref(),
compute_d2h_many.as_deref(),
owned_compute_dh,
owned_compute_dh_many,
owned_compute_d2h,
owned_compute_d2h_many,
None,
)
} else {
// Fallback route: no joint bundle. Only valid for single-block families
// that do not demand the joint outer hyper path.
if family.requires_joint_outer_hyper_path() {
return Err(
"outer hyper fixed-point evaluation requires a joint exact path for this family"
.to_string(),
);
}
if specs.len() != 1 {
return Err(
"generic fixed-point outer fallback is only valid for single-block families; multi-block families must provide a joint outer path"
.to_string(),
);
}
let eval = family.evaluate(&inner.block_states)?;
let block_idx = 0;
let spec = &specs[block_idx];
let work = &eval.blockworking_sets[block_idx];
let p = spec.design.ncols();
// Filled in by the Diagonal arm below and read later by the derivative
// closures; stays `None` for the ExactNewton arm.
let mut diagonal_design = None::<DesignMatrix>;
// Unpenalized block Hessian: weighted normal equations for a diagonal
// working set, or the family's exact-Newton Hessian after a shape check.
let h_joint_unpen = match work {
BlockWorkingSet::Diagonal {
working_response: _,
working_weights,
} => with_block_geometry(
family,
&inner.block_states,
spec,
block_idx,
|x_dyn, _| {
let w = floor_positiveworking_weights(working_weights, options.minweight);
let (xtwx, _) = weighted_normal_equations(x_dyn, &w, None)?;
diagonal_design = Some(x_dyn.clone());
Ok(xtwx)
},
)?,
BlockWorkingSet::ExactNewton {
gradient: _,
hessian,
} => {
if hessian.nrows() != p || hessian.ncols() != p {
return Err(CustomFamilyError::DimensionMismatch { reason: format!(
"block {block_idx} exact-newton Hessian shape mismatch in fixed-point outer evaluation: got {}x{}, expected {}x{}",
hessian.nrows(),
hessian.ncols(),
p,
p
) }.into());
}
hessian.to_dense()
}
};
let beta_flat = inner.block_states[block_idx].beta.clone();
// First directional derivative of the block Hessian along `direction`.
// Yields `Ok(None)` when the logdet-H term is excluded.
let compute_dh = |direction: &Array1<f64>| -> Result<Option<DriftDerivResult>, String> {
if !include_logdet_h {
return Ok(None);
}
match work {
BlockWorkingSet::ExactNewton { .. } => {
// A missing family callback is an error here, since the H_beta term
// is required on this path.
match family.exact_newton_hessian_directional_derivative(
&inner.block_states,
block_idx,
direction,
)? {
Some(h_exact) => {
Ok(Some(DriftDerivResult::Dense(symmetrized_square_matrix(
h_exact,
p,
&format!(
"block {block_idx} exact-newton dH shape mismatch in fixed-point outer evaluation"
),
)?)))
}
None => Err(CustomFamilyError::UnsupportedConfiguration { reason: format!(
"missing exact-newton dH callback for block {block_idx} while fixed-point evaluation requires H_beta term"
) }.into()),
}
}
BlockWorkingSet::Diagonal {
working_response: _,
working_weights,
} => {
let x_dyn = diagonal_design.as_ref().ok_or_else(|| {
format!(
"missing dynamic design for block {block_idx} diagonal fixed-point correction"
)
})?;
let wwork =
floor_positiveworking_weights(working_weights, options.minweight);
let x_dense = x_dyn.to_dense();
let n = x_dense.nrows();
// d_eta starts as X * direction; geometry terms are added below.
let mut d_eta = x_dyn.matrixvectormultiply(direction);
let geom = family.block_geometry_directional_derivative(
&inner.block_states,
block_idx,
spec,
direction,
)?;
let mut correction_mat = Array2::<f64>::zeros((p, p));
if let Some(geom_dir) = geom {
d_eta += &geom_dir.d_offset;
if let Some(dx) = geom_dir.d_design {
// Moving design: add dX * beta to d_eta, and accumulate the two
// cross terms built from row-weighted copies of X and dX.
// NOTE(review): presumably fast_atb(a, b) computes a^T b, giving
// dX^T W X + X^T W dX — confirm against its definition.
d_eta += &fast_av(&dx, &beta_flat);
let mut wx = x_dense.clone();
let mut wdx = dx.clone();
ndarray::Zip::from(wx.rows_mut())
.and(wdx.rows_mut())
.and(wwork.view())
.par_for_each(|mut wxr, mut wdxr, &wi| {
// Rows with unit weight are left untouched.
if wi != 1.0 {
wxr.mapv_inplace(|v| v * wi);
wdxr.mapv_inplace(|v| v * wi);
}
});
correction_mat += &fast_atb(&dx, &wx);
correction_mat += &fast_atb(&x_dense, &wdx);
}
}
// Weight derivative along d_eta; the callback is mandatory here.
let dw = family
.diagonalworking_weights_directional_derivative(
&inner.block_states,
block_idx,
&d_eta,
)?
.ok_or_else(|| {
format!(
"missing diagonal dW callback for block {block_idx} while fixed-point evaluation requires H_beta term"
)
})?;
if dw.len() != n {
return Err(CustomFamilyError::DimensionMismatch { reason: format!(
"block {block_idx} diagonal dW length mismatch in fixed-point outer evaluation: got {}, expected {}",
dw.len(),
n
) }.into());
}
// Scale each row of X by the matching dW entry, then add the product
// with X to the correction.
let mut scaled_x = x_dense.clone();
ndarray::Zip::from(scaled_x.rows_mut())
.and(&dw)
.par_for_each(|mut sr, &dwi| sr.mapv_inplace(|v| v * dwi));
correction_mat += &fast_atb(&x_dense, &scaled_x);
Ok(Some(DriftDerivResult::Dense(correction_mat)))
}
}
};
// Second directional derivative of the block Hessian along (`u`, `v`).
// Yields `Ok(None)` when the logdet-H term is excluded.
let compute_d2h = |u: &Array1<f64>,
v: &Array1<f64>|
-> Result<Option<DriftDerivResult>, String> {
if !include_logdet_h {
return Ok(None);
}
match work {
BlockWorkingSet::ExactNewton { .. } => {
match family.exact_newton_hessian_second_directional_derivative(
&inner.block_states,
block_idx,
u,
v,
)? {
Some(h_exact) => {
Ok(Some(DriftDerivResult::Dense(symmetrized_square_matrix(
h_exact,
p,
&format!(
"block {block_idx} exact-newton d2H shape mismatch in fixed-point outer evaluation"
),
)?)))
}
None => Err(CustomFamilyError::UnsupportedConfiguration { reason: format!(
"missing exact-newton d2H callback for block {block_idx} while fixed-point evaluation requires H_beta_beta term"
) }.into()),
}
}
BlockWorkingSet::Diagonal { .. } => {
let x_dyn = diagonal_design.as_ref().ok_or_else(|| {
format!(
"missing dynamic design for block {block_idx} diagonal fixed-point second correction"
)
})?;
let x_dense = x_dyn.to_dense();
let n = x_dense.nrows();
// The diagonal second-order path has no second-order geometry terms,
// so any direction with a design derivative or a non-zero offset
// derivative is rejected rather than silently ignored.
let reject_second_order_geometry =
|label: &str,
geom: Option<BlockGeometryDirectionalDerivative>|
-> Result<(), String> {
if let Some(geom_dir) = geom {
let has_offset =
geom_dir.d_offset.iter().any(|value| *value != 0.0);
if geom_dir.d_design.is_some() || has_offset {
return Err(CustomFamilyError::UnsupportedConfiguration { reason: format!(
"block {block_idx} diagonal d2H requires second-order block-geometry derivatives for {label}; use an exact-newton or joint outer path"
) }.into());
}
}
Ok(())
};
reject_second_order_geometry(
"first direction",
family.block_geometry_directional_derivative(
&inner.block_states,
block_idx,
spec,
u,
)?,
)?;
reject_second_order_geometry(
"second direction",
family.block_geometry_directional_derivative(
&inner.block_states,
block_idx,
spec,
v,
)?,
)?;
let d_eta_u = x_dyn.matrixvectormultiply(u);
let d_eta_v = x_dyn.matrixvectormultiply(v);
// Second weight derivative along the two eta directions; mandatory here.
let d2w = family
.diagonalworking_weights_second_directional_derivative(
&inner.block_states,
block_idx,
&d_eta_u,
&d_eta_v,
)?
.ok_or_else(|| {
format!(
"missing diagonal d2W callback for block {block_idx} while fixed-point evaluation requires H_beta_beta term"
)
})?;
if d2w.len() != n {
return Err(CustomFamilyError::DimensionMismatch { reason: format!(
"block {block_idx} diagonal d2W length mismatch in fixed-point outer evaluation: got {}, expected {}",
d2w.len(),
n
) }.into());
}
// Scale each row of X by the matching d2W entry before the product.
let mut scaled_x = x_dense.clone();
ndarray::Zip::from(scaled_x.rows_mut())
.and(&d2w)
.par_for_each(|mut sr, &d2wi| sr.mapv_inplace(|value| value * d2wi));
Ok(Some(DriftDerivResult::Dense(fast_atb(&x_dense, &scaled_x))))
}
}
};
// The fallback passes unit curvature scale, zero logdet correction, only
// the borrowed closures, and no external coordinate bundle.
joint_outer_evaluate_efs(
&inner,
specs,
&per_block,
rho,
&beta_flat,
JointHessianSource::Dense(h_joint_unpen),
&ranges,
total,
ridge,
moderidge,
extra_logdet_ridge,
1.0,
0.0,
include_logdet_h,
include_logdet_s,
strict_spd,
family.use_projected_penalty_logdet(),
options,
rho_prior.clone(),
family.pseudo_logdet_mode(),
&compute_dh,
None,
&compute_d2h,
None,
None,
None,
None,
None,
None,
)
}
}?;
// Warm start carries the current rho, per-block betas, active sets, and the
// cached inner mode.
let warm = ConstrainedWarmStart {
rho: rho.clone(),
block_beta: inner
.block_states
.iter()
.map(|state| state.beta.clone())
.collect(),
active_sets: inner.active_sets.clone(),
cached_inner: Some(cached_inner_mode_from_result(&inner)),
};
Ok((efs_eval, warm, inner.converged))
}
/// Strip the custom-family "invalid input" wrappers from an error message.
///
/// A `String` that round-trips through `CustomFamilyError::From<String>` is
/// re-wrapped as `InvalidInput` with the "custom-family string boundary"
/// context, so its `Display` form gains a boundary prefix. That prefix is
/// removed first (at most once), followed by the historical bare
/// `custom-family invalid input: ` prefix (also at most once). Messages that
/// carry neither prefix are returned unchanged.
fn normalize_outer_eval_error_detail(error: &str) -> &str {
    const BOUNDARY_WRAPPER: &str =
        "custom-family invalid input in custom-family string boundary: ";
    const BARE_WRAPPER: &str = "custom-family invalid input: ";

    // Peel the boundary wrapper before looking for the bare form.
    let without_boundary = match error.strip_prefix(BOUNDARY_WRAPPER) {
        Some(rest) => rest,
        None => error,
    };
    match without_boundary.strip_prefix(BARE_WRAPPER) {
        Some(rest) => rest,
        None => without_boundary,
    }
}
// ═══════════════════════════════════════════════════════════════════════════
// Section: joint outer hyper surface — unified calculus for [rho, psi]
// ═══════════════════════════════════════════════════════════════════════════
//
// The callers have already applied the current spatial coordinates `psi` when
// constructing `family`, `specs`, and `derivative_blocks`, so the explicit
// input into the section below is still only the smoothing vector
// `rho_current`. Mathematically, however, the surface being differentiated
// is the full joint profiled/Laplace objective in
//
// theta = [rho, psi].
//
// The exact outer calculus is unified across all hypercoordinates:
//
// J(theta)
// = V(beta^(theta), theta)
// + 0.5 log|H(beta^(theta), theta)|
// - 0.5 log|S(theta)|_+,
//
// with stationarity and joint curvature
//
// F(beta, theta) := V_beta(beta, theta) = 0,
// H(beta, theta) := V_beta_beta(beta, theta).
//
// For each theta_i we need the fixed-beta objects
//
// V_i, g_i := F_i, H_i,
//
// and for each pair (i, j)
//
// V_ij, g_ij, H_ij,
//
// together with the beta-curvature contractions
//
// D_beta H[u], D_beta^2 H[u, v], T_i[u] := D_beta H_i[u].
//
// These determine the exact joint mode responses
//
// beta_i = -H^{-1} g_i,
// beta_ij = -H^{-1}(g_ij + H_i beta_j + H_j beta_i + D_beta H[beta_i] beta_j),
//
// and the total Hessian drifts
//
// dot H_i
// = H_i + D_beta H[beta_i],
//
// ddot H_ij
// = H_ij
// + T_i[beta_j]
// + T_j[beta_i]
// + D_beta H[beta_ij]
// + D_beta^2 H[beta_i, beta_j].
//
// Therefore the exact joint outer derivatives are
//
// J_i
// = V_i
// + 0.5 tr(H^{-1} dot H_i)
// - 0.5 partial_i log|S(theta)|_+,
//
// J_ij
// = (V_ij - g_i^T H^{-1} g_j)
// + 0.5 [ tr(H^{-1} ddot H_ij)
// - tr(H^{-1} dot H_j H^{-1} dot H_i) ]
// - 0.5 partial^2_{ij} log|S(theta)|_+.
//
// In this unified view rho and psi differ only in the likelihood-side
// fixed-beta derivative objects contributed by the family. The generic exact
// assembler always adds realized penalty motion through `S(theta)` for every
// hypercoordinate:
//
// - `rho` coordinates usually have zero likelihood-side objects and pick up
// their fixed-beta derivatives entirely from `S_rho` / `S_{rho rho}`
// - `psi` coordinates contribute likelihood-side objects from the family's
// joint exact psi hooks and may also pick up extra penalty terms through
// `S_psi`, `S_{rho psi}`, and `S_{psi psi}` when realized penalties move
// with `psi`
//
// The implementation below follows this unified calculus directly. Once a
// family supplies the joint fixed-beta psi objects and the mixed
// `D_beta H_psi[u]` contraction, exact joint hyper evaluation treats `rho`
// and `psi` identically and returns the full profiled/Laplace Hessian over
// `theta = [rho, psi]`.
//
// ═══════════════════════════════════════════════════════════════════════════
// Unified HyperCoord builders for ψ coordinates
// ═══════════════════════════════════════════════════════════════════════════
/// Build the block-local penalty derivative S_ψ = Σ_k exp(ρ_k) ∂S_k/∂ψ as a
/// `p_block × p_block` matrix.
///
/// Sources are consulted in this order, and the first one present wins:
/// 1. `s_psi_penalty_components` — each part is accumulated through its
///    `add_scaled_to` method with weight `exp(ρ_k)`;
/// 2. `s_psi_components` — each part is accumulated with weight `exp(ρ_k)`;
/// 3. `penalty_index` — the stored `s_psi` is scaled by that penalty's
///    `exp(ρ)`;
/// 4. otherwise the result is the zero matrix.
fn assemble_block_local_s_psi(
    deriv: &CustomFamilyBlockPsiDerivative,
    per_block_rho: &Array1<f64>,
    p_block: usize,
) -> Array2<f64> {
    if let Some(factored_parts) = deriv.s_psi_penalty_components.as_ref() {
        let mut total = Array2::<f64>::zeros((p_block, p_block));
        for (rho_slot, part) in factored_parts {
            part.add_scaled_to(per_block_rho[*rho_slot].exp(), &mut total);
        }
        return total;
    }
    if let Some(dense_parts) = deriv.s_psi_components.as_ref() {
        let mut total = Array2::<f64>::zeros((p_block, p_block));
        for (rho_slot, part) in dense_parts {
            total.scaled_add(per_block_rho[*rho_slot].exp(), part);
        }
        return total;
    }
    match deriv.penalty_index {
        Some(rho_slot) => deriv
            .s_psi
            .mapv(|entry| per_block_rho[rho_slot].exp() * entry),
        None => Array2::<f64>::zeros((p_block, p_block)),
    }
}
/// Assemble the second penalty derivative matrix S_{ψ_i ψ_j} in block-local
/// coefficient space.
///
/// This mirrors the psi/psi branch of `joint_theta_penaltysecond_matrix` but
/// returns the block-local matrix directly instead of embedding it into the
/// full flattened coefficient space.
fn assemble_block_local_s_psi_psi(
deriv_i: &CustomFamilyBlockPsiDerivative,
local_j: usize,
per_block_rho: &Array1<f64>,
p_block: usize,
) -> Array2<f64> {
if let Some(ref parts) = deriv_i.s_psi_psi_penalty_components {
let mut s = Array2::<f64>::zeros((p_block, p_block));
if let Some(pair_parts) = parts.get(local_j) {
for (penalty_idx, s_part) in pair_parts {
s_part.add_scaled_to(per_block_rho[*penalty_idx].exp(), &mut s);
}
}
return s;
}
if let Some(ref parts) = deriv_i.s_psi_psi_components {
let mut s = Array2::<f64>::zeros((p_block, p_block));
if let Some(pair_parts) = parts.get(local_j) {
for (penalty_idx, s_part) in pair_parts {
s.scaled_add(per_block_rho[*penalty_idx].exp(), s_part);
}
}
s
} else if let Some(ref parts) = deriv_i.s_psi_psi {
if let Some(s_part) = parts.get(local_j) {
if let Some(penalty_index) = deriv_i.penalty_index {
s_part.mapv(|v| per_block_rho[penalty_index].exp() * v)
} else {
Array2::<f64>::zeros((p_block, p_block))
}
} else {
Array2::<f64>::zeros((p_block, p_block))
}
} else {
Array2::<f64>::zeros((p_block, p_block))
}
}
/// Build `HyperCoord` objects for ψ (custom family) hyperparameters.
///
/// Converts family-provided (a^ℓ, q, L) objects and penalty derivatives
/// into the unified (a, g, B, ld_s) format. Each ψ coordinate produces
/// one `HyperCoord` in the flattened joint coefficient space, in block
/// order and then in per-block derivative order.
///
/// The mapping from family objects to HyperCoord is:
///
///   a    = a^ℓ_ψ + 0.5 β̂^T S_ψ β̂
///   g    = q_ψ + S_ψ β̂
///   B    = L_ψ + S_ψ
///   ld_s = tr(S₊⁻¹ S_ψ)
///
/// where S_ψ is the assembled penalty derivative. It is built block-local
/// and only embedded where a full-length object is required.
///
/// The likelihood-side terms for each axis come from, in order of
/// preference: the workspace's batched `first_order_terms_all()`, the
/// workspace's per-axis `first_order_terms(idx)`, and finally the family's
/// `exact_newton_joint_psi_terms` hook (zeros when the family returns
/// `None`).
///
/// `ld_s` is `0.0` when `s_logdet_blocks` is `None`.
///
/// # Errors
///
/// Propagates errors from `split_log_lambdas`, the workspace, the family
/// hooks and the Jeffreys helpers. Also returns an error when the batched
/// workspace terms contain fewer entries than there are ψ axes.
pub fn build_psi_hyper_coords<F: CustomFamily + Clone + Send + Sync + 'static>(
    family: &F,
    synced_states: &[ParameterBlockState],
    specs: &[ParameterBlockSpec],
    derivative_blocks: &[Vec<CustomFamilyBlockPsiDerivative>],
    beta_flat: &Array1<f64>,
    rho: &[f64],
    penalty_counts: &[usize],
    s_logdet_blocks: Option<&[PenaltyPseudologdet]>,
    hessian_beta_independent: bool,
    psi_workspace: Option<Arc<dyn ExactNewtonJointPsiWorkspace>>,
) -> Result<Vec<HyperCoord>, String> {
    let ranges = block_param_ranges(specs);
    let total = beta_flat.len();
    let per_block = split_log_lambdas(&Array1::from_vec(rho.to_vec()), penalty_counts)?;
    let build_psi_hyper_coords_start = std::time::Instant::now();
    let total_axes: usize = derivative_blocks.iter().map(|b| b.len()).sum();
    let mut coords = Vec::with_capacity(total_axes);
    let mut psi_global = 0usize;
    let batched_terms: Option<Vec<ExactNewtonJointPsiTerms>> = match psi_workspace.as_ref() {
        Some(workspace) => workspace.first_order_terms_all()?,
        None => None,
    };
    // The ψ axes are visited strictly in increasing `psi_global` order, so the
    // batched terms can be consumed by value instead of cloning each entry.
    let mut batched_iter = batched_terms.map(Vec::into_iter);
    // Family fallback shared by the "workspace declined" and "no workspace"
    // paths.
    let family_terms = |psi_idx: usize| -> Result<ExactNewtonJointPsiTerms, String> {
        Ok(family
            .exact_newton_joint_psi_terms(synced_states, specs, derivative_blocks, psi_idx)?
            .unwrap_or_else(|| ExactNewtonJointPsiTerms::zeros(total)))
    };
    // EXPLICIT ∂_ρ H_Φ context (gam#854). The joint-Jeffreys curvature `H_Φ` is
    // built from the JOINT Hessian `H_joint(β, ρ)`, so for a family whose
    // `H_joint` depends on a ψ hyperparameter (the adaptive penalty's `λ_m`/`ε_m`,
    // or any penalty folded into `H_joint`) it depends on ρ EXPLICITLY, not only
    // through β̂. The augmented-LAML score `½ tr[(H+S_λ+H_Φ)⁻¹ ∂_ρ(H+S_λ+H_Φ)]` then
    // needs the explicit term `∂_ρ_i H_Φ|_β` added to each ψ coord's drift (the
    // mode-response part `D_β H_Φ[v_k]` is already folded in elsewhere). We form it
    // from the SAME pieces the value path uses — the full identifiable Jeffreys span
    // `Z_J` and the snapshot joint Hessian `H_joint(β̂)` — once per evaluation, and
    // contract it per coord with `∂_ρ_i H_joint|_β` (the coord drift `dense_b`) and
    // `∂_ρ_i Hdot[e_a]|_β` (the family's ψ-Hessian directional derivative). `None`
    // unless the family uses the Jeffreys term and exposes a dense joint Hessian, so
    // every non-Jeffreys / operator-only family is byte-unchanged.
    let jeffreys_hphi_ctx: Option<(Array2<f64>, Array2<f64>)> = if family
        .joint_jeffreys_term_required()
        && derivative_blocks.iter().any(|block| !block.is_empty())
    {
        match (
            build_joint_jeffreys_subspace(specs, &ranges)?,
            family.exact_newton_joint_hessian_with_specs(synced_states, specs)?,
        ) {
            (Some(z), Some(h))
                if z.nrows() == total && h.nrows() == total && h.ncols() == total =>
            {
                Some((z, h))
            }
            _ => None,
        }
    } else {
        None
    };
    for (block_idx, block_derivs) in derivative_blocks.iter().enumerate() {
        let (start, end) = ranges[block_idx];
        let p_block = end - start;
        for deriv in block_derivs.iter() {
            // 1. Get family-provided likelihood objects (joint flattened space).
            let psi_terms = match batched_iter.as_mut() {
                Some(batched) => batched.next().ok_or_else(|| {
                    format!(
                        "build_psi_hyper_coords: batched first-order ψ terms ended before axis {psi_global} (expected {total_axes} axes)"
                    )
                })?,
                None => {
                    let from_workspace = match psi_workspace.as_ref() {
                        Some(workspace) => workspace.first_order_terms(psi_global)?,
                        None => None,
                    };
                    match from_workspace {
                        Some(terms) => terms,
                        None => family_terms(psi_global)?,
                    }
                }
            };
            // 2. Assemble S_ψ from penalty derivatives (block-local, not embedded).
            let s_psi_local = assemble_block_local_s_psi(deriv, &per_block[block_idx], p_block);
            // 3. Build HyperCoord using block-local S_ψ (avoids full p×p materialization).
            let beta_block = beta_flat.slice(ndarray::s![start..end]);
            let s_psi_beta_local = s_psi_local.dot(&beta_block);
            let a = psi_terms.objective_psi + 0.5 * beta_block.dot(&s_psi_beta_local);
            // Embed s_psi_beta into full p-vector for the score.
            let mut s_psi_beta = Array1::zeros(total);
            s_psi_beta
                .slice_mut(ndarray::s![start..end])
                .assign(&s_psi_beta_local);
            let g = &psi_terms.score_psi + &s_psi_beta;
            let ld_s = s_logdet_blocks.map_or(0.0, |blocks| {
                blocks[block_idx].tau_gradient_component(&s_psi_local)
            });
            // Build drift: use block-local representation when possible to avoid
            // materializing full p×p dense matrices.
            let drift = if psi_terms.hessian_psi_operator.is_some() {
                // Operator-backed likelihood part: the penalty stays block-local
                // and the operator handles the likelihood. O(p_block²) fast path.
                HyperCoordDrift::from_block_local_and_operator(
                    s_psi_local,
                    start,
                    end,
                    total,
                    psi_terms.hessian_psi_operator,
                )
            } else {
                // Dense Hessian term exists (e.g., from non-implicit family).
                // Must add block-local penalty into the dense matrix.
                let mut dense_b = psi_terms.hessian_psi;
                dense_b
                    .slice_mut(ndarray::s![start..end, start..end])
                    .scaled_add(1.0, &s_psi_local);
                // `dense_b` is now `∂_ρ_i H_joint|_β`. Add the explicit Jeffreys term
                // `∂_ρ_i H_Φ|_β` (gam#854) using it as the H_joint perturbation, the
                // family's base directional Hessian derivative `Hdot[e_a]`, and the
                // ψ-Hessian directional derivative `∂_ρ_i Hdot[e_a]|_β`. The helper
                // returns zeros when the conditioning gate skips the term or the
                // family lacks the exact directional derivatives, so a clean /
                // well-conditioned fit is byte-unchanged.
                if let Some((z_j, h_joint)) = jeffreys_hphi_ctx.as_ref() {
                    let explicit_hphi =
                        crate::estimate::reml::jeffreys_subspace::joint_jeffreys_hphi_explicit_param_derivative(
                            h_joint.view(),
                            z_j.view(),
                            &dense_b,
                            |dir: &Array1<f64>| {
                                family.exact_newton_joint_hessian_directional_derivative_with_specs(
                                    synced_states,
                                    specs,
                                    dir,
                                )
                            },
                            |dir: &Array1<f64>| {
                                family.exact_newton_joint_psihessian_directional_derivative(
                                    synced_states,
                                    specs,
                                    derivative_blocks,
                                    psi_global,
                                    dir,
                                )
                            },
                        )?;
                    dense_b += &explicit_hphi;
                }
                HyperCoordDrift::from_parts(Some(dense_b), psi_terms.hessian_psi_operator)
            };
            coords.push(HyperCoord {
                a,
                g,
                drift,
                ld_s,
                b_depends_on_beta: !hessian_beta_independent,
                is_penalty_like: false,
                firth_g: None,
                tk_eta_fixed: None,
                tk_x_fixed: None,
            });
            psi_global += 1;
        }
    }
    log::info!(
        "[STAGE] build_psi_hyper_coords axis_count={} workspace_present={} elapsed={:.3}s",
        total_axes,
        psi_workspace.is_some(),
        build_psi_hyper_coords_start.elapsed().as_secs_f64(),
    );
    Ok(coords)
}
/// Build the direction-contracted ψψ second-order hook for the profiled θ-HVP
/// (#740).
///
/// Returns `Ok(None)` when no psi workspace is supplied or when there are no
/// ψ axes; the outer-Hessian operator then stays on the exact per-pair
/// `ext_ext_fn` assembly. The returned hook itself yields `Ok(None)` whenever
/// the workspace's `second_order_terms_contracted` declines for the given
/// weights.
///
/// The hook produces, for the ψ-direction weights `α_ψ`, the
/// [`ContractedPsiSecondOrder`] ψψ-block contraction: it sums the family
/// likelihood contraction (from the workspace) with the generic ψψ penalty
/// motion, mirroring the `α`-contraction of the per-pair `ext_ext`
/// callback's penalty terms (`½βᵀS_{ψiψj}β` into `objective`, `S_{ψiψj}β` into
/// `score`, `S_{ψiψj}` as a `BlockLocalDrift` into `hessian`, and the
/// `tau_hessian_component` into `ld_s`). Same-block-only, matching `ext_ext`.
///
/// `pub(crate)` so the #740 in-crate gate
/// `bernoulli_contracted_psi_hook_matches_per_pair_with_penalty` can assert the
/// generic penalty fold here equals `Σ_j α_j · build_psi_pair_callbacks().ext_ext(i, j)`.
///
/// # Errors
///
/// Building fails when `split_log_lambdas` fails. The hook returns an error
/// when `alpha_psi` does not have one weight per ψ axis, when the family
/// kernel's outputs do not have one row per ψ axis, or when the workspace
/// reports an error.
pub(crate) fn build_contracted_psi_hook(
    specs: &[ParameterBlockSpec],
    derivative_blocks: SharedDerivativeBlocks,
    beta_flat: &Array1<f64>,
    rho: &[f64],
    penalty_counts: &[usize],
    s_logdet_blocks: Option<&[PenaltyPseudologdet]>,
    psi_workspace: Option<Arc<dyn ExactNewtonJointPsiWorkspace>>,
) -> Result<Option<ContractedPsiSecondOrderFn>, String> {
    // The contraction is a representation/cost choice for the family likelihood
    // ψψ second-order; without a contracted family kernel there is nothing to
    // accelerate, so decline (the per-pair `ext_ext_fn` path stays).
    let Some(workspace) = psi_workspace else {
        return Ok(None);
    };
    let total = beta_flat.len();
    let ranges = block_param_ranges(specs);
    let per_block = Arc::new(split_log_lambdas(
        &Array1::from_vec(rho.to_vec()),
        penalty_counts,
    )?);
    let beta_arc = Arc::new(beta_flat.clone());
    let s_logdet_block_cache = Arc::new(s_logdet_blocks.map(|blocks| blocks.to_vec()));
    // ψ → (block, local) location and block-local S_ψ for every ψ axis, built
    // once. `s_psi_local` is reused for the τ-Hessian and as the first leg of
    // the bilinear `tr(S⁺ S_ψi S⁺ S_ψj)` penalty-logdet term.
    struct PsiAxis {
        block: usize,
        local: usize,
        start: usize,
        end: usize,
        s_psi_local: Array2<f64>,
    }
    let mut axes: Vec<PsiAxis> = Vec::new();
    for (block_idx, block_derivs) in derivative_blocks.iter().enumerate() {
        let (start, end) = ranges[block_idx];
        let p_block = end - start;
        axes.extend(
            block_derivs
                .iter()
                .enumerate()
                .map(|(local_idx, deriv)| PsiAxis {
                    block: block_idx,
                    local: local_idx,
                    start,
                    end,
                    s_psi_local: assemble_block_local_s_psi(
                        deriv,
                        &per_block[block_idx],
                        p_block,
                    ),
                }),
        );
    }
    let axes = Arc::new(axes);
    let psi_dim = axes.len();
    if psi_dim == 0 {
        return Ok(None);
    }
    let hook = move |alpha_psi: &[f64]| -> Result<Option<ContractedPsiSecondOrder>, String> {
        if alpha_psi.len() != psi_dim {
            return Err(format!(
                "contracted ψψ hook: alpha_psi length {} != psi_dim {psi_dim}",
                alpha_psi.len()
            ));
        }
        // Family likelihood ψψ contraction (one combined-direction row pass).
        // Declining here (e.g. a σ-aux axis carried weight) declines the whole
        // hook so the operator builder keeps the per-pair assembly.
        let Some(likelihood) = workspace.second_order_terms_contracted(alpha_psi)? else {
            return Ok(None);
        };
        let mut objective = likelihood.objective;
        let mut score = likelihood.score;
        // Per-output-row penalty drift `Σ_j α_j S_{ψi ψj}` (block-local) is
        // composed onto the likelihood `hessian[i]` entry below.
        let mut hessian: Vec<DriftDerivResult> = likelihood.hessian;
        let mut ld_s = Array1::<f64>::zeros(psi_dim);
        if objective.len() != psi_dim || score.nrows() != psi_dim || hessian.len() != psi_dim {
            return Err(format!(
                "contracted ψψ hook: family kernel shape mismatch (objective={}, score_rows={}, hessian={}, psi_dim={psi_dim})",
                objective.len(),
                score.nrows(),
                hessian.len(),
            ));
        }
        for (i, axis_i) in axes.iter().enumerate() {
            let p_block = axis_i.end - axis_i.start;
            // Borrow the block's coefficients; no per-row copy is needed.
            let beta_block = beta_arc.slice(s![axis_i.start..axis_i.end]);
            // Row-invariant lookups, hoisted out of the inner j-loop.
            let deriv_i = &derivative_blocks[axis_i.block][axis_i.local];
            let block_rho = &per_block[axis_i.block];
            // Combined same-block penalty second derivative
            //   S_{ψi ψ(α)}_local = Σ_{j: block_j == block_i} α_j S_{ψi ψj}_local,
            // and the combined first-leg penalty derivative
            //   S_ψ(α)_local = Σ_{j: block_j == block_i} α_j S_ψj_local
            // (the second leg of the bilinear penalty-logdet cross term).
            let mut s_psi_psi_alpha = Array2::<f64>::zeros((p_block, p_block));
            let mut s_psi_alpha = Array2::<f64>::zeros((p_block, p_block));
            for (axis_j, &aj) in axes.iter().zip(alpha_psi) {
                if aj == 0.0 || axis_j.block != axis_i.block {
                    continue;
                }
                let s_ij =
                    assemble_block_local_s_psi_psi(deriv_i, axis_j.local, block_rho, p_block);
                s_psi_psi_alpha.scaled_add(aj, &s_ij);
                s_psi_alpha.scaled_add(aj, &axis_j.s_psi_local);
            }
            // objective += 0.5 βᵀ S_{ψi ψ(α)} β   (matches ext_ext `a`).
            let s_beta = s_psi_psi_alpha.dot(&beta_block);
            objective[i] += 0.5 * beta_block.dot(&s_beta);
            // score[i] (block-local slice) += S_{ψi ψ(α)} β   (matches ext_ext `g`).
            {
                let mut score_row = score.row_mut(i);
                let mut slot = score_row.slice_mut(s![axis_i.start..axis_i.end]);
                slot += &s_beta;
            }
            // ld_s[i] = Σ_j α_j tau_hessian_component(S_ψi, S_ψj, S_{ψiψj})
            //         = tau_hessian_component(S_ψi, S_ψ(α), S_{ψi ψ(α)})
            // by linearity in the second leg and bilinearity of the cross term
            // of the τ-Hessian; matches the ext_ext `ld_s` contraction. Done
            // before the drift is built so the matrix can be moved into it
            // rather than cloned.
            if let Some(logdet_blocks) = &*s_logdet_block_cache {
                ld_s[i] = logdet_blocks[axis_i.block].tau_hessian_component(
                    &axis_i.s_psi_local,
                    &s_psi_alpha,
                    Some(&s_psi_psi_alpha),
                );
            }
            // hessian[i] += S_{ψi ψ(α)} as a block-local drift (matches the
            // ext_ext `b_operator` BlockLocalDrift composite).
            let block_drift: Arc<dyn HyperOperator> =
                Arc::new(crate::solver::estimate::reml::unified::BlockLocalDrift {
                    local: s_psi_psi_alpha,
                    start: axis_i.start,
                    end: axis_i.end,
                    total_dim: total,
                });
            // Take the likelihood entry out by value; the placeholder written
            // here is overwritten immediately below.
            let placeholder = DriftDerivResult::Operator(Arc::clone(&block_drift));
            let (dense, operators) = match std::mem::replace(&mut hessian[i], placeholder) {
                DriftDerivResult::Operator(existing) => (None, vec![existing, block_drift]),
                DriftDerivResult::Dense(dense) => (Some(dense), vec![block_drift]),
            };
            hessian[i] = DriftDerivResult::Operator(Arc::new(
                crate::solver::estimate::reml::unified::CompositeHyperOperator {
                    dense,
                    operators,
                    dim_hint: total,
                },
            ));
        }
        Ok(Some(ContractedPsiSecondOrder {
            objective,
            score,
            hessian,
            ld_s,
        }))
    };
    Ok(Some(Arc::new(hook) as ContractedPsiSecondOrderFn))
}
/// Build pair callbacks for ψ-ψ and ρ-ψ Hessian entries.
///
/// Returns two closures:
///
/// 1. **ext-ext** `(psi_i, psi_j) -> HyperCoordPair`: second-order
///    fixed-β objects for a pair of ψ coordinates. The likelihood part comes
///    from the psi workspace when one is supplied, otherwise from the
///    family's `exact_newton_joint_psisecond_order_terms` hook. The penalty
///    part `S_{ψ_i ψ_j}` is added only when both coordinates live in the same
///    block.
///
/// 2. **rho-ext** `(rho_k, psi_j) -> HyperCoordPair`: mixed second-order
///    fixed-β objects for a ρ-ψ pair, built purely from the penalty
///    derivative `λ_k ∂S_k/∂ψ_j` (nonzero only for same-block pairs).
///
/// Both closures return `HyperCoordPair::zero()` for out-of-range indices.
/// Because the callbacks cannot return a `Result`, an error reported by the
/// workspace or family second-order hook is treated like "no likelihood
/// contribution" (zero likelihood terms).
///
/// The closures capture (via `Arc`) shared copies of the penalty derivatives,
/// family state, and the block-local pseudologdet objects needed for the
/// logdet terms.
///
/// # Arguments
///
/// * `family` - The custom family instance (must be `Send + Sync + 'static`).
/// * `synced_states` - Synchronized block states at the current inner mode.
/// * `specs` - Parameter block specifications.
/// * `derivative_blocks` - Per-block ψ derivative payloads.
/// * `beta_flat` - Flattened joint coefficient vector at the inner mode.
/// * `rho` - Current log-smoothing parameters (flat).
/// * `penalty_counts` - Number of penalties per block.
/// * `s_logdet_blocks` - Optional exact block-local pseudologdet eigenspaces;
///   when `None`, every `ld_s` is `0.0`.
/// * `psi_workspace` - Optional workspace providing the ψψ likelihood terms.
///
/// # Errors
///
/// Returns an error when `split_log_lambdas` rejects `rho` / `penalty_counts`.
pub fn build_psi_pair_callbacks<F: CustomFamily + Clone + Send + Sync + 'static>(
    family: &F,
    synced_states: &[ParameterBlockState],
    specs: &[ParameterBlockSpec],
    derivative_blocks: SharedDerivativeBlocks,
    beta_flat: &Array1<f64>,
    rho: &[f64],
    penalty_counts: &[usize],
    s_logdet_blocks: Option<&[PenaltyPseudologdet]>,
    psi_workspace: Option<Arc<dyn ExactNewtonJointPsiWorkspace>>,
) -> Result<
    (
        Box<dyn Fn(usize, usize) -> HyperCoordPair + Send + Sync>,
        Box<dyn Fn(usize, usize) -> HyperCoordPair + Send + Sync>,
    ),
    String,
> {
    // Precompute shared data into Arc-wrapped clones for the closures.
    let ranges = block_param_ranges(specs);
    let total = beta_flat.len();
    let per_block = Arc::new(split_log_lambdas(
        &Array1::from_vec(rho.to_vec()),
        penalty_counts,
    )?);
    let specs_arc = Arc::new(specs.to_vec());
    let beta_arc = Arc::new(beta_flat.clone());
    let synced_arc = Arc::new(synced_states.to_vec());
    let family_arc = Arc::new(family.clone());
    let s_logdet_block_cache = Arc::new(s_logdet_blocks.map(|blocks| blocks.to_vec()));
    struct PsiPenaltyCacheEntry {
        block_idx: usize,
        local_idx: usize,
        start: usize,
        end: usize,
        /// Block-local S_ψ matrix, stored for use with `PenaltyPseudologdet` methods.
        /// `None` when no penalty logdet blocks were supplied.
        s_local: Option<Array2<f64>>,
    }
    struct RhoPenaltyCacheEntry {
        block_idx: usize,
        penalty_idx: usize,
        start: usize,
        end: usize,
        /// Unscaled penalty matrix S_k for use with `PenaltyPseudologdet::rho_tau_hessian_component`.
        s_k_unscaled: Array2<f64>,
    }
    // Build the psi coordinate cache once. The block-local S_ψ matrices are
    // only consumed by the penalty-logdet terms, so they are assembled only
    // when logdet blocks are present; the ψψ and ρψ callbacks then reuse them
    // instead of re-assembling inside the O(q²) ext-ext loop.
    let logdet_active = s_logdet_block_cache.is_some();
    let mut psi_penalty_cache: Vec<PsiPenaltyCacheEntry> = Vec::new();
    for (block_idx, block_derivs) in derivative_blocks.iter().enumerate() {
        let (start, end) = ranges[block_idx];
        let p_block = end - start;
        for (local_idx, deriv) in block_derivs.iter().enumerate() {
            // PenaltyPseudologdet methods handle pseudoinverse and leakage internally.
            let s_local = logdet_active
                .then(|| assemble_block_local_s_psi(deriv, &per_block[block_idx], p_block));
            psi_penalty_cache.push(PsiPenaltyCacheEntry {
                block_idx,
                local_idx,
                start,
                end,
                s_local,
            });
        }
    }
    let psi_penalty_cache = Arc::new(psi_penalty_cache);
    let mut rho_penalty_cache: Vec<RhoPenaltyCacheEntry> = Vec::new();
    for (block_idx, &count) in penalty_counts.iter().enumerate() {
        let (start, end) = ranges[block_idx];
        for penalty_idx in 0..count {
            let s_k_unscaled = specs[block_idx].penalties[penalty_idx].to_dense();
            rho_penalty_cache.push(RhoPenaltyCacheEntry {
                block_idx,
                penalty_idx,
                start,
                end,
                s_k_unscaled,
            });
        }
    }
    let rho_penalty_cache = Arc::new(rho_penalty_cache);
    // ψ-ψ pair callback
    let ext_ext = {
        let per_block = Arc::clone(&per_block);
        let derivative_blocks = Arc::clone(&derivative_blocks);
        let specs_arc = Arc::clone(&specs_arc);
        let beta_arc = Arc::clone(&beta_arc);
        let synced_arc = Arc::clone(&synced_arc);
        let s_logdet_block_cache = Arc::clone(&s_logdet_block_cache);
        let psi_penalty_cache = Arc::clone(&psi_penalty_cache);
        let family_arc = Arc::clone(&family_arc);
        let psi_workspace = psi_workspace.clone();
        Box::new(move |psi_i: usize, psi_j: usize| -> HyperCoordPair {
            // Defensive bounds check: callers in the unified outer solver only ever
            // pass indices in `0..psi_penalty_cache.len()`, but treating an OOB
            // request as a documented zero-pair sentinel keeps integration code
            // (which may probe spurious coordinate pairs while building joint
            // Hessian sparsity patterns) panic-free.
            let (Some(cache_i), Some(cache_j)) =
                (psi_penalty_cache.get(psi_i), psi_penalty_cache.get(psi_j))
            else {
                return HyperCoordPair::zero();
            };
            // Get family-provided second-order likelihood terms. The callback
            // signature has no error channel, so a failing hook is treated as
            // "no likelihood contribution".
            let psi2 = match psi_workspace.as_ref() {
                Some(workspace) => workspace.second_order_terms(psi_i, psi_j).ok().flatten(),
                None => family_arc
                    .exact_newton_joint_psisecond_order_terms(
                        &synced_arc,
                        &specs_arc,
                        &derivative_blocks,
                        psi_i,
                        psi_j,
                    )
                    .ok()
                    .flatten(),
            };
            let (mut a, mut g, mut b_mat, mut b_operator) = match psi2 {
                Some(t) => (
                    t.objective_psi_psi,
                    t.score_psi_psi,
                    t.hessian_psi_psi,
                    t.hessian_psi_psi_operator,
                ),
                None => (
                    0.0,
                    Array1::zeros(total),
                    Array2::zeros((total, total)),
                    None,
                ),
            };
            // Assemble S_{ψ_i ψ_j} only on the touched block.
            let ld_s = if cache_i.block_idx == cache_j.block_idx {
                let p_block = cache_i.end - cache_i.start;
                let deriv_i = &derivative_blocks[cache_i.block_idx][cache_i.local_idx];
                let s_local = assemble_block_local_s_psi_psi(
                    deriv_i,
                    cache_j.local_idx,
                    &per_block[cache_i.block_idx],
                    p_block,
                );
                // A view is enough here; the block's coefficients are only read.
                let beta_block = beta_arc.slice(s![cache_i.start..cache_i.end]);
                let s_ij_beta_local = s_local.dot(&beta_block);
                a += 0.5 * beta_block.dot(&s_ij_beta_local);
                {
                    let mut g_local = g.slice_mut(s![cache_i.start..cache_i.end]);
                    g_local += &s_ij_beta_local;
                }
                // The S_{ψ_i ψ_j} block contribution attaches to the dense
                // Hessian when the family returned a dense `b_mat`, and to
                // the operator-backed Hessian (via a `BlockLocalDrift`
                // composite) when the family returned `hessian_psi_psi`
                // empty alongside an operator. Slicing into a `(0, 0)`
                // dense matrix would otherwise panic in the matrix-free
                // path that survival-marginal-slope and other operator-
                // backed families use.
                if b_mat.nrows() > 0 {
                    let mut b_local =
                        b_mat.slice_mut(s![cache_i.start..cache_i.end, cache_i.start..cache_i.end]);
                    b_local += &s_local;
                } else {
                    // Build the block-local drift exactly once and move it into
                    // whichever wrapper the existing operator state requires.
                    let block_drift = crate::solver::estimate::reml::unified::BlockLocalDrift {
                        local: s_local.clone(),
                        start: cache_i.start,
                        end: cache_i.end,
                        total_dim: total,
                    };
                    b_operator = Some(match b_operator.take() {
                        Some(existing) => {
                            let existing_arc: Arc<dyn HyperOperator> = Arc::from(existing);
                            let drift_arc: Arc<dyn HyperOperator> = Arc::new(block_drift);
                            Box::new(
                                crate::solver::estimate::reml::unified::CompositeHyperOperator {
                                    dense: None,
                                    operators: vec![existing_arc, drift_arc],
                                    dim_hint: total,
                                },
                            ) as Box<dyn HyperOperator>
                        }
                        None => Box::new(block_drift) as Box<dyn HyperOperator>,
                    });
                }
                if let Some(logdet_blocks) = &*s_logdet_block_cache {
                    let pld = &logdet_blocks[cache_i.block_idx];
                    let s_psi_i = cache_i
                        .s_local
                        .as_ref()
                        .expect("psi cache should include S_psi when penalty logdet is active");
                    let s_psi_j = cache_j
                        .s_local
                        .as_ref()
                        .expect("psi cache should include S_psi when penalty logdet is active");
                    // τ-Hessian: tr(S⁺ S_{ψi ψj}) − tr(S⁺ S_ψi S⁺ S_ψj) + 2 tr(Σ₊⁻² L_i L_j^T)
                    pld.tau_hessian_component(s_psi_i, s_psi_j, Some(&s_local))
                } else {
                    0.0
                }
            } else {
                0.0
            };
            HyperCoordPair {
                a,
                g,
                b_mat,
                b_operator,
                ld_s,
            }
        }) as Box<dyn Fn(usize, usize) -> HyperCoordPair + Send + Sync>
    };
    // ρ-ψ pair callback
    let rho_ext = {
        let per_block = Arc::clone(&per_block);
        let derivative_blocks = Arc::clone(&derivative_blocks);
        let beta_arc = Arc::clone(&beta_arc);
        let psi_penalty_cache = Arc::clone(&psi_penalty_cache);
        let rho_penalty_cache = Arc::clone(&rho_penalty_cache);
        let s_logdet_block_cache = Arc::clone(&s_logdet_block_cache);
        Box::new(move |rho_k: usize, psi_j: usize| -> HyperCoordPair {
            // Out-of-range requests yield the zero pair, as in the ψ-ψ callback.
            let (Some(rho_cache), Some(psi_cache)) =
                (rho_penalty_cache.get(rho_k), psi_penalty_cache.get(psi_j))
            else {
                return HyperCoordPair::zero();
            };
            let mut a = 0.0;
            let mut g = Array1::<f64>::zeros(total);
            let mut b_mat = Array2::<f64>::zeros((total, total));
            // S_{ρ_k, ψ_j} = λ_k ∂S_k/∂ψ_j.
            // Only nonzero when both coordinates share the same block and the
            // ψ derivative touches the k-th penalty.
            let ld_s = if rho_cache.block_idx == psi_cache.block_idx {
                let p_block = rho_cache.end - rho_cache.start;
                let deriv = &derivative_blocks[psi_cache.block_idx][psi_cache.local_idx];
                let lambda_k = per_block[rho_cache.block_idx][rho_cache.penalty_idx].exp();
                let local = if let Some(components) = &deriv.s_psi_penalty_components {
                    let mut m = Array2::<f64>::zeros((p_block, p_block));
                    for (penalty_idx, s_part) in components {
                        if *penalty_idx == rho_cache.penalty_idx {
                            s_part.add_scaled_to(lambda_k, &mut m);
                        }
                    }
                    m
                } else if let Some(components) = &deriv.s_psi_components {
                    let mut m = Array2::<f64>::zeros((p_block, p_block));
                    for (penalty_idx, s_part) in components {
                        if *penalty_idx == rho_cache.penalty_idx {
                            m.scaled_add(lambda_k, s_part);
                        }
                    }
                    m
                } else if deriv.penalty_index == Some(rho_cache.penalty_idx) {
                    deriv.s_psi.mapv(|v| lambda_k * v)
                } else {
                    Array2::<f64>::zeros((p_block, p_block))
                };
                // A view is enough here; the block's coefficients are only read.
                let beta_block = beta_arc.slice(s![rho_cache.start..rho_cache.end]);
                let s_kj_beta_local = local.dot(&beta_block);
                a = 0.5 * beta_block.dot(&s_kj_beta_local);
                {
                    let mut g_local = g.slice_mut(s![rho_cache.start..rho_cache.end]);
                    g_local += &s_kj_beta_local;
                }
                {
                    let mut b_local = b_mat.slice_mut(s![
                        rho_cache.start..rho_cache.end,
                        rho_cache.start..rho_cache.end
                    ]);
                    b_local += &local;
                }
                if let Some(logdet_blocks) = &*s_logdet_block_cache {
                    let pld = &logdet_blocks[rho_cache.block_idx];
                    let s_psi_j = psi_cache
                        .s_local
                        .as_ref()
                        .expect("psi cache should include S_psi when penalty logdet is active");
                    // ∂S_k/∂ψ_j (unscaled): extract from local by dividing out λ_k.
                    // Skipped when λ_k is too small to divide by safely.
                    let ds_k_dpsi = if lambda_k.abs() > 1e-300 {
                        Some(local.mapv(|v| v / lambda_k))
                    } else {
                        None
                    };
                    // Mixed ρ×τ Hessian: λ_k [tr(S⁺ ∂S_k/∂ψ_j) − tr(S⁺ S_k S⁺ S_ψj)]
                    pld.rho_tau_hessian_component(
                        &rho_cache.s_k_unscaled,
                        lambda_k,
                        s_psi_j,
                        ds_k_dpsi.as_ref(),
                    )
                } else {
                    0.0
                }
            } else {
                0.0
            };
            HyperCoordPair {
                a,
                g,
                b_mat,
                b_operator: None,
                ld_s,
            }
        }) as Box<dyn Fn(usize, usize) -> HyperCoordPair + Send + Sync>
    };
    Ok((ext_ext, rho_ext))
}
/// Build the M_i[u] = D_β B_i[u] callback for ψ coordinates.
///
/// The returned closure has the unified `FixedDriftDerivFn` shape: given an
/// external (ψ) coordinate index `ext_idx` (0-based within the ψ
/// coordinates) and a flattened coefficient direction `u`, it yields the
/// β-directional derivative of that coordinate's fixed-β Hessian drift.
///
/// Source of the derivative:
///
/// * with a psi workspace, the workspace's `hessian_directional_derivative`
///   result is returned as-is;
/// * otherwise the family's
///   `exact_newton_joint_psihessian_directional_derivative` hook is called
///   and its dense matrix is wrapped in `DriftDerivResult::Dense`.
///
/// In both cases the closure returns `None` when the provider reports an
/// error or has no derivative for that coordinate. No penalty-side term is
/// added here: the callback delegates entirely to the workspace / family
/// hook (penalty matrices S_ψ do not depend on β, so their β-directional
/// derivative is zero).
///
/// # Returns
///
/// `None` when `hessian_beta_independent` is set (the likelihood Hessian is
/// β-independent, so M_i ≡ 0 for every coordinate); `Some(callback)`
/// otherwise.
pub fn build_psi_drift_deriv_callback<F: CustomFamily + Clone + Send + Sync + 'static>(
    family: &F,
    synced_states: &[ParameterBlockState],
    specs: &[ParameterBlockSpec],
    derivative_blocks_arc: SharedDerivativeBlocks,
    hessian_beta_independent: bool,
    psi_workspace: Option<Arc<dyn ExactNewtonJointPsiWorkspace>>,
) -> Option<FixedDriftDerivFn> {
    // β-independent likelihood Hessian: there is no drift derivative to expose.
    if hessian_beta_independent {
        return None;
    }
    // Owned snapshots so the callback can outlive the borrowed inputs.
    let states_owned = Arc::new(synced_states.to_vec());
    let specs_owned = Arc::new(specs.to_vec());
    let family_owned = Arc::new(family.clone());
    let callback = move |ext_idx: usize, direction: &Array1<f64>| -> Option<DriftDerivResult> {
        match psi_workspace.as_ref() {
            // Workspace path: already returns a `DriftDerivResult`.
            Some(workspace) => workspace
                .hessian_directional_derivative(ext_idx, direction)
                .ok()
                .flatten(),
            // Family path: dense matrix from the trait hook.
            None => family_owned
                .exact_newton_joint_psihessian_directional_derivative(
                    &states_owned,
                    &specs_owned,
                    &derivative_blocks_arc,
                    ext_idx,
                    direction,
                )
                .ok()
                .flatten()
                .map(DriftDerivResult::Dense),
        }
    };
    Some(Box::new(callback))
}
/// Slice-based entry point for the joint hyper evaluator.
///
/// Copies `derivative_blocks` into a freshly allocated `Arc` and forwards
/// every argument unchanged to
/// `evaluate_custom_family_hyper_internal_shared`, returning its result.
fn evaluate_custom_family_hyper_internal<F: CustomFamily + Clone + Send + Sync + 'static>(
    family: &F,
    specs: &[ParameterBlockSpec],
    options: &BlockwiseFitOptions,
    penalty_counts: &[usize],
    rho_current: &Array1<f64>,
    derivative_blocks: &[Vec<CustomFamilyBlockPsiDerivative>],
    warm_start: Option<&ConstrainedWarmStart>,
    rho_prior: crate::types::RhoPrior,
    eval_mode: EvalMode,
) -> Result<OuterObjectiveEvalResult, CustomFamilyError> {
    // The shared evaluator takes ownership of a reference-counted copy.
    let shared_blocks = Arc::new(derivative_blocks.to_vec());
    evaluate_custom_family_hyper_internal_shared(
        family,
        specs,
        options,
        penalty_counts,
        rho_current,
        shared_blocks,
        warm_start,
        rho_prior,
        eval_mode,
    )
}
fn evaluate_custom_family_hyper_internal_shared<F: CustomFamily + Clone + Send + Sync + 'static>(
family: &F,
specs: &[ParameterBlockSpec],
options: &BlockwiseFitOptions,
penalty_counts: &[usize],
rho_current: &Array1<f64>,
derivative_blocks: SharedDerivativeBlocks,
warm_start: Option<&ConstrainedWarmStart>,
rho_prior: crate::types::RhoPrior,
eval_mode: EvalMode,
) -> Result<OuterObjectiveEvalResult, CustomFamilyError> {
if derivative_blocks.len() != specs.len() {
crate::bail_dim_custom!(
"joint hyper derivative block count mismatch: got {}, expected {}",
derivative_blocks.len(),
specs.len()
);
}
if penalty_counts.len() != specs.len() {
crate::bail_dim_custom!(
"joint hyper penalty-count block mismatch: got {}, expected {}",
penalty_counts.len(),
specs.len()
);
}
let rho_dim = penalty_counts.iter().sum::<usize>();
let psi_dim = derivative_blocks.iter().map(Vec::len).sum::<usize>();
if rho_current.len() != rho_dim {
crate::bail_dim_custom!(
"joint hyper rho dimension mismatch: got {}, expected {} (psi={})",
rho_current.len(),
rho_dim,
psi_dim
);
}
// ── Common setup: inner solve, ridge, refresh, ranges ──
let include_logdet_h = include_exact_newton_logdet_h(family, options);
let include_logdet_s = include_exact_newton_logdet_s(family, options);
let strict_spd = use_exact_newton_strict_spd(family);
let per_block = split_log_lambdas(rho_current, penalty_counts)?;
let psi_safe_warm_start =
warm_start_without_cached_inner_for_psi_derivatives(warm_start, psi_dim > 0);
let mut inner = inner_blockwise_fit(
family,
specs,
&per_block,
options,
psi_safe_warm_start.as_ref().or(warm_start),
)?;
if !inner.converged {
let theta_dim = rho_dim + psi_dim;
return Err(CustomFamilyError::UnsupportedConfiguration {
reason: format!(
"custom-family inner solve did not converge after {} cycle(s); \
refusing to expose profile objective derivatives for theta_dim={} \
(rho_dim={}, psi_dim={}). The analytic outer gradient/Hessian \
require the inner KKT equation F_beta(beta, theta)=0; returning \
a value with zero or shape-only derivatives is mathematically \
inconsistent.",
inner.cycles, theta_dim, rho_dim, psi_dim
),
});
}
let ridge = effective_solverridge(options.ridge_floor);
let moderidge = if options.ridge_policy.include_quadratic_penalty {
ridge
} else {
0.0
};
let extra_logdet_ridge = if options.ridge_policy.include_penalty_logdet
&& !options.ridge_policy.include_quadratic_penalty
{
ridge
} else {
0.0
};
refresh_all_block_etas(family, specs, &mut inner.block_states)?;
let ranges = block_param_ranges(specs);
let total = ranges.last().map(|(_, e)| *e).unwrap_or(0);
// ── Try to obtain a joint Hessian and route through the unified evaluator ──
//
// When psi_dim > 0, exact Newton is required because the ψ derivative
// callbacks use exact Newton trait methods. When psi_dim == 0,
// build_joint_hessian_closures handles both exact Newton and surrogate.
let cthf_internal_psi_branch_start = std::time::Instant::now();
if psi_dim > 0 {
log::info!(
"[STAGE] cthf_internal psi_dim={} eval_mode={:?} pre_unified elapsed={:.3}s",
psi_dim,
eval_mode,
cthf_internal_psi_branch_start.elapsed().as_secs_f64(),
);
// ψ coordinates present: require exact Newton Hessian for consistency
// with the psi derivative callbacks.
let beta_flat = flatten_state_betas(&inner.block_states, specs);
let synced_joint_states = Arc::new(synchronized_states_from_flat_beta(
family,
specs,
&inner.block_states,
&beta_flat,
)?);
let hessian_workspace = match inner.joint_workspace.clone() {
Some(workspace) => Some(workspace),
None => family.exact_newton_joint_hessian_workspace_with_options(
synced_joint_states.as_ref(),
specs,
options,
)?,
};
// Outer-eval entry: prime per-row jet caches before the ext-coord
// par_iter — see `warm_up_outer_caches` doc.
if let Some(workspace) = hessian_workspace.as_ref() {
workspace.warm_up_outer_caches()?;
}
let (
h_joint_unpen,
rho_curvature_scale,
hessian_logdet_correction,
use_outer_curvature_derivatives,
) = if let Some(curvature) = family.exact_newton_outer_curvature(&inner.block_states)? {
(
JointHessianSource::Dense(symmetrized_square_matrix(
curvature.hessian,
total,
"joint exact-newton Hessian shape mismatch in joint hyper evaluator (rescaled)",
)?),
curvature.rho_curvature_scale,
curvature.hessian_logdet_correction,
true,
)
} else {
let h_joint_unpen = if let Some(workspace) = hessian_workspace.as_ref() {
exact_newton_joint_hessian_source_from_workspace(
workspace,
total,
MaterializationIntent::OuterEvaluation,
"joint exact-newton operator mismatch in joint hyper evaluator",
)?
} else {
None
};
(
match h_joint_unpen {
Some(source) => Some(source),
None => exact_newton_joint_hessian_symmetrized(
family,
&inner.block_states,
specs,
total,
"joint exact-newton Hessian shape mismatch in joint hyper evaluator",
)
.map(|source| source.map(JointHessianSource::Dense))?,
}
.ok_or_else(|| -> CustomFamilyError {
"joint exact-newton Hessian unavailable for full [rho, psi] outer calculus"
.to_string()
.into()
})?,
1.0,
0.0,
false,
)
};
// Build the exact pseudologdet eigenspace for each penalty block so
// the value, ψ gradient, ψψ Hessian, and ρψ mixed block all
// differentiate the same log|S|_+ objective.
let s_logdet_blocks = if include_logdet_s {
use rayon::iter::{IntoParallelIterator, ParallelIterator};
let block_results: Vec<Result<PenaltyPseudologdet, String>> = (0..specs.len())
.into_par_iter()
.map(|b| {
let spec = &specs[b];
let p = spec.design.ncols();
let lambdas = per_block[b].mapv(f64::exp);
let mut s_lambda = Array2::<f64>::zeros((p, p));
for (k, s) in spec.penalties.iter().enumerate() {
s.add_scaled_to(lambdas[k], &mut s_lambda);
}
let ridge_hint = if options.ridge_policy.include_penalty_logdet {
for d in 0..p {
s_lambda[[d, d]] += ridge;
}
Some(ridge)
} else {
None
};
// No metadata-based structural-nullity hint: the
// PenaltyPseudologdet classifier derives the positive
// eigenspace from the assembled spectrum alone (issues
// #192/#318).
PenaltyPseudologdet::from_assembled(s_lambda, ridge_hint)
})
.collect();
let blocks: Result<Vec<_>, _> = block_results.into_iter().collect();
Some(blocks?)
} else {
None
};
// Build ψ HyperCoords, pair callbacks, and drift derivative callback.
let hessian_beta_independent = !family.exact_newton_joint_hessian_beta_dependent();
let psi_workspace = if eval_mode != EvalMode::ValueOnly
&& (eval_mode == EvalMode::ValueGradientHessian
|| family.exact_newton_joint_psi_workspace_for_first_order_terms())
{
family.exact_newton_joint_psi_workspace_with_options(
synced_joint_states.as_ref(),
specs,
derivative_blocks.as_ref(),
options,
)?
} else {
None
};
let rho_slice = rho_current
.as_slice()
.ok_or_else(|| "outer rho vector must be contiguous".to_string())?;
let ext_bundle = if eval_mode == EvalMode::ValueOnly {
None
} else {
let psi_coords = build_psi_hyper_coords(
family,
synced_joint_states.as_ref(),
specs,
derivative_blocks.as_ref(),
&beta_flat,
rho_slice,
penalty_counts,
s_logdet_blocks.as_deref(),
hessian_beta_independent,
psi_workspace.clone(),
)?;
let (ext_ext_fn, rho_ext_fn, drift_fn, contracted_psi_fn) =
if eval_mode == EvalMode::ValueGradientHessian {
let (ext_ext_fn, rho_ext_fn) = build_psi_pair_callbacks(
family,
synced_joint_states.as_ref(),
specs,
Arc::clone(&derivative_blocks),
&beta_flat,
rho_slice,
penalty_counts,
s_logdet_blocks.as_deref(),
psi_workspace.clone(),
)?;
// #740: build the direction-contracted ψψ hook from the same psi
// workspace + penalty data the per-pair `ext_ext_fn` uses, so the
// matrix-free outer-Hessian operator collapses the `K²` per-pair
// ψψ assembly to one combined-direction family row pass per
// matvec. `None` (no contracted family kernel) keeps the exact
// per-pair `ext_ext_fn` path. Built before the drift callback
// moves `psi_workspace`.
let contracted_psi_fn = build_contracted_psi_hook(
specs,
Arc::clone(&derivative_blocks),
&beta_flat,
rho_slice,
penalty_counts,
s_logdet_blocks.as_deref(),
psi_workspace.clone(),
)?;
let drift_fn = build_psi_drift_deriv_callback(
family,
synced_joint_states.as_ref(),
specs,
Arc::clone(&derivative_blocks),
hessian_beta_independent,
psi_workspace,
);
(
Some(ext_ext_fn),
Some(rho_ext_fn),
drift_fn,
contracted_psi_fn,
)
} else {
(None, None, None, None)
};
Some(ExtCoordBundle {
coords: psi_coords,
ext_ext_fn,
rho_ext_fn,
drift_fn,
contracted_psi_fn,
})
};
// Build derivative provider for the ρ coordinates (D_β H[v]).
let compute_dh = exact_newton_dh_closure(
family,
Arc::clone(&synced_joint_states),
specs,
total,
use_outer_curvature_derivatives,
if use_outer_curvature_derivatives {
1.0
} else {
rho_curvature_scale
},
hessian_workspace.clone(),
);
let compute_dh_many = if use_outer_curvature_derivatives {
None
} else {
exact_newton_dh_many_closure(rho_curvature_scale, hessian_workspace.clone())
};
let compute_d2h = exact_newton_d2h_closure(
family,
Arc::clone(&synced_joint_states),
specs,
total,
use_outer_curvature_derivatives,
if use_outer_curvature_derivatives {
1.0
} else {
rho_curvature_scale
},
hessian_workspace.clone(),
);
let owned_compute_dh = exact_newton_dh_closure_owned(
family.clone(),
Arc::clone(&synced_joint_states),
specs.to_vec(),
total,
use_outer_curvature_derivatives,
if use_outer_curvature_derivatives {
1.0
} else {
rho_curvature_scale
},
hessian_workspace.clone(),
);
let owned_compute_dh_many = if use_outer_curvature_derivatives {
None
} else {
exact_newton_dh_many_closure_owned(rho_curvature_scale, hessian_workspace.clone())
};
let owned_compute_d2h = exact_newton_d2h_closure_owned(
family.clone(),
Arc::clone(&synced_joint_states),
specs.to_vec(),
total,
use_outer_curvature_derivatives,
if use_outer_curvature_derivatives {
1.0
} else {
rho_curvature_scale
},
hessian_workspace.clone(),
);
let compute_d2h_many = if use_outer_curvature_derivatives {
None
} else {
exact_newton_d2h_many_closure(rho_curvature_scale, hessian_workspace.clone())
};
let owned_compute_d2h_many = if use_outer_curvature_derivatives {
None
} else {
exact_newton_d2h_many_closure_owned(rho_curvature_scale, hessian_workspace.clone())
};
// Route through the unified path (joint_outer_evaluate → reml_laml_evaluate).
let eval_result = joint_outer_evaluate(
&inner,
specs,
&per_block,
rho_current,
&beta_flat,
h_joint_unpen,
&ranges,
total,
ridge,
moderidge,
extra_logdet_ridge,
rho_curvature_scale,
hessian_logdet_correction,
include_logdet_h,
include_logdet_s,
strict_spd,
// ψ-bearing path (matern/duchon marginal-slope kernel length-scales):
// use the projected #752 generalized determinant for value AND
// gradient AND Hessian — all produced by this single call, so they are
// consistent by construction. This is the route the clustered-PC
// matern bernoulli/survival marginal-slope fits take, where the
// range(Sλ)-only determinant dropped the penalty-null trend likelihood
// determinant and froze the outer gradient (gam#808/#787). No batched
// override is possible here (it is gated to psi_dim==0).
family.use_projected_penalty_logdet(),
eval_mode,
options,
rho_prior.clone(),
family.pseudo_logdet_mode(),
&compute_dh,
compute_dh_many.as_deref(),
&compute_d2h,
compute_d2h_many.as_deref(),
Some(owned_compute_dh),
owned_compute_dh_many,
Some(owned_compute_d2h),
owned_compute_d2h_many,
ext_bundle,
None,
custom_family_batched_outer_hessian_operator(
family,
synced_joint_states.as_ref(),
specs,
derivative_blocks.as_ref(),
rho_current,
hessian_workspace.clone(),
eval_mode,
)?,
custom_family_outer_jeffreys_hphi(family, &inner.block_states, specs, &ranges)?,
custom_family_outer_jeffreys_hphi_drift(family, &inner.block_states, specs, &ranges)?,
)?;
// The unified evaluator produces gradient/Hessian of size (rho_dim + psi_dim),
// with ρ coordinates first and ψ coordinates appended — matching the expected
// output order of CustomFamilyJointHyperResult.
log::info!(
"[STAGE] cthf_internal psi_dim={} eval_mode={:?} post_unified elapsed={:.3}s",
psi_dim,
eval_mode,
cthf_internal_psi_branch_start.elapsed().as_secs_f64(),
);
return Ok(eval_result);
}
// ── ρ-only path (psi_dim == 0): route through unified evaluator ──
//
// Batched fast-path: if the family overrides `batched_outer_gradient_terms`,
// factor H once at the family level and amortize all K trace computations in
// a single streaming pass. Runs in both `ValueAndGradient` and
// `ValueGradientHessian` modes; in VGH the Hessian still flows through the
// standard joint_outer_evaluate path below and only the gradient is
// replaced. See `BatchedOuterGradientTerms`. The replacement is permitted
// only when it differentiates the same objective: if robust Jeffreys
// curvature is nonzero, the unified H_phi-aware evaluator owns the gradient.
let has_configured_rho_prior = !matches!(rho_prior, crate::types::RhoPrior::Flat);
let robust_jeffreys_hphi =
custom_family_outer_jeffreys_hphi(family, &inner.block_states, specs, &ranges)?;
let batched_gradient_contract_allows_override = batched_outer_gradient_contract_allows_override(
robust_jeffreys_hphi.as_ref().map(|(_phi, hphi)| hphi),
);
let mut batched_gradient_override: Option<Array1<f64>> = None;
if !has_configured_rho_prior
&& batched_gradient_contract_allows_override
&& (eval_mode == EvalMode::ValueAndGradient || eval_mode == EvalMode::ValueGradientHessian)
{
let beta_flat_for_batch = flatten_state_betas(&inner.block_states, specs);
let synced_states_for_batch = synchronized_states_from_flat_beta(
family,
specs,
&inner.block_states,
&beta_flat_for_batch,
)?;
let workspace_for_batch = match inner.joint_workspace.clone() {
Some(workspace) => Some(workspace),
None => family
.exact_newton_joint_hessian_workspace_with_options(
&synced_states_for_batch,
specs,
options,
)
.ok()
.flatten(),
};
let derivative_blocks_for_batch =
vec![Vec::<CustomFamilyBlockPsiDerivative>::new(); specs.len()];
if let Ok(Some(batch)) = family.batched_outer_gradient_terms(
&synced_states_for_batch,
specs,
&derivative_blocks_for_batch,
rho_current,
options,
workspace_for_batch.clone(),
) {
// Sanity check: batched output must match (rho_dim + psi_dim).
let expected = rho_dim + psi_dim;
if batch.objective_theta.len() == expected
&& batch.trace_h_inv_hdot.len() == expected
&& batch.trace_s_pinv_sdot.len() == expected
&& let Some(joint_bundle_value_only) = build_joint_hessian_closures(
family,
&inner.block_states,
specs,
total,
options,
inner.joint_workspace.clone(),
)?
{
let mut gradient = Array1::<f64>::zeros(expected);
for j in 0..expected {
let trace_term = if include_logdet_h {
0.5 * batch.trace_h_inv_hdot[j]
} else {
0.0
};
let det_term = if include_logdet_s {
0.5 * batch.trace_s_pinv_sdot[j]
} else {
0.0
};
gradient[j] = batch.objective_theta[j] + trace_term - det_term;
}
if eval_mode == EvalMode::ValueGradientHessian {
batched_gradient_override = Some(gradient);
} else {
let JointHessianBundle {
source: h_joint_unpen,
beta_flat,
compute_dh,
compute_dh_many,
compute_d2h,
compute_d2h_many,
owned_compute_dh: _,
owned_compute_dh_many: _,
owned_compute_d2h: _,
owned_compute_d2h_many: _,
rho_curvature_scale,
hessian_logdet_correction,
} = joint_bundle_value_only;
let value_only = joint_outer_evaluate(
&inner,
specs,
&per_block,
rho_current,
&beta_flat,
h_joint_unpen,
&ranges,
total,
ridge,
moderidge,
extra_logdet_ridge,
rho_curvature_scale,
hessian_logdet_correction,
include_logdet_h,
include_logdet_s,
strict_spd,
// VALUE/GRADIENT CONSISTENCY: this `value_only` is paired
// with the family's BATCHED gradient (computed just above),
// which evaluates the logdet derivative through the
// family's `pseudo_logdet_mode` spectral operator (Smooth
// `r_ε` for BMS) — an internally exact antiderivative pair
// (value `log r_ε`, gradient `φ'=r_ε'/r_ε`). The value must
// therefore use the SAME spectral convention, NOT the
// projected #752 generalized determinant, or value and the
// batched gradient would describe different objectives under
// rank deficiency. The projected determinant is used on the
// non-batched path (the ψ-bearing matern marginal-slope
// route, gam#808/#787), where joint_outer_evaluate produces
// a matched projected value AND gradient in one call.
false,
EvalMode::ValueOnly,
options,
crate::types::RhoPrior::Flat,
family.pseudo_logdet_mode(),
compute_dh.as_ref(),
compute_dh_many.as_deref(),
compute_d2h.as_ref(),
compute_d2h_many.as_deref(),
None,
None,
None,
None,
None,
None,
None,
robust_jeffreys_hphi.clone(),
// ValueOnly: the gradient is supplied separately below, so
// the H_Φ mode-response drift (a gradient-only term) is not
// needed here.
None,
)?;
return Ok(OuterObjectiveEvalResult {
objective: value_only.objective,
gradient,
outer_hessian: crate::solver::outer_strategy::HessianResult::Unavailable,
warm_start: value_only.warm_start,
inner_converged: inner.converged,
});
}
}
}
}
// Try build_joint_hessian_closures which handles both exact Newton and
// surrogate Hessian sources, then call joint_outer_evaluate with no
// extended coordinates.
if let Some(joint_bundle) = build_joint_hessian_closures(
family,
&inner.block_states,
specs,
total,
options,
inner.joint_workspace.clone(),
)? {
let JointHessianBundle {
source: h_joint_unpen,
beta_flat,
compute_dh,
compute_dh_many,
compute_d2h,
compute_d2h_many,
owned_compute_dh,
owned_compute_dh_many,
owned_compute_d2h,
owned_compute_d2h_many,
rho_curvature_scale,
hessian_logdet_correction,
} = joint_bundle;
let eval_result = joint_outer_evaluate(
&inner,
specs,
&per_block,
rho_current,
&beta_flat,
h_joint_unpen,
&ranges,
total,
ridge,
moderidge,
extra_logdet_ridge,
rho_curvature_scale,
hessian_logdet_correction,
include_logdet_h,
include_logdet_s,
strict_spd,
// VALUE/GRADIENT CONSISTENCY: when a batched (Smooth-mode) gradient
// override is pending, it will replace `eval_result.gradient` below,
// so the value (and outer Hessian) here must use the SAME spectral
// convention as that gradient — the family's `pseudo_logdet_mode`
// (Smooth `r_ε`), NOT the projected #752 generalized determinant. The
// projected determinant is used only when no batched override is
// active (the ψ-bearing matern marginal-slope route, gam#808/#787),
// where this call produces a matched projected value+gradient+Hessian.
if batched_gradient_override.is_some() {
false
} else {
family.use_projected_penalty_logdet()
},
eval_mode,
options,
rho_prior.clone(),
family.pseudo_logdet_mode(),
compute_dh.as_ref(),
compute_dh_many.as_deref(),
compute_d2h.as_ref(),
compute_d2h_many.as_deref(),
owned_compute_dh,
owned_compute_dh_many,
owned_compute_d2h,
owned_compute_d2h_many,
None, // no ext_coords when psi_dim == 0
None,
custom_family_batched_outer_hessian_operator(
family,
&inner.block_states,
specs,
derivative_blocks.as_ref(),
rho_current,
inner.joint_workspace.clone(),
eval_mode,
)?,
custom_family_outer_jeffreys_hphi(family, &inner.block_states, specs, &ranges)?,
custom_family_outer_jeffreys_hphi_drift(family, &inner.block_states, specs, &ranges)?,
)?;
let mut eval_result = eval_result;
if let Some(batched_grad) = batched_gradient_override.take()
&& batched_grad.len() == eval_result.gradient.len()
{
eval_result.gradient = batched_grad;
}
return Ok(eval_result);
}
// Joint Hessian unavailable via either exact Newton or surrogate.
// The generic fallback is only mathematically defensible for single-block
// families — multi-block families with coupled likelihood curvature require
// the joint path.
if family.requires_joint_outer_hyper_path() {
return Err(
"outer hyper-derivative evaluation requires a joint exact path for this family"
.to_string()
.into(),
);
}
// Generic fallback: single-block only. Extract the per-block Hessian and
// route through joint_outer_evaluate with the single block as the "joint"
// system.
if specs.len() != 1 {
return Err(
"generic outer fallback is only valid for single-block families; multi-block families must provide a joint outer path"
.to_string()
.into(),
);
}
let eval = family.evaluate(&inner.block_states)?;
let b = 0;
let spec = &specs[b];
let work = &eval.blockworking_sets[b];
let p = spec.design.ncols();
let mut diagonal_design = None::<DesignMatrix>;
let h_joint_unpen = match work {
BlockWorkingSet::Diagonal {
working_response: _,
working_weights,
} => with_block_geometry(family, &inner.block_states, spec, b, |x_dyn, _| {
let w = floor_positiveworking_weights(working_weights, options.minweight);
let (xtwx, _) = weighted_normal_equations(x_dyn, &w, None)?;
diagonal_design = Some(x_dyn.clone());
Ok(xtwx)
})?,
BlockWorkingSet::ExactNewton {
gradient: _,
hessian,
} => {
if hessian.nrows() != p || hessian.ncols() != p {
crate::bail_dim_custom!(
"block {b} exact-newton Hessian shape mismatch in outer gradient: got {}x{}, expected {}x{}",
hessian.nrows(),
hessian.ncols(),
p,
p
);
}
hessian.to_dense()
}
};
let beta_flat = inner.block_states[b].beta.clone();
// Build a derivative provider that computes D_β H_L[direction] on demand.
let compute_dh = |direction: &Array1<f64>| -> Result<Option<DriftDerivResult>, String> {
if !include_logdet_h {
return Ok(None);
}
match work {
BlockWorkingSet::ExactNewton { .. } => {
match family.exact_newton_hessian_directional_derivative(
&inner.block_states,
b,
direction,
)? {
Some(h_exact) => Ok(Some(DriftDerivResult::Dense(symmetrized_square_matrix(
h_exact,
p,
&format!("block {b} exact-newton dH shape mismatch"),
)?))),
None => Err(CustomFamilyError::UnsupportedConfiguration { reason: format!(
"missing exact-newton dH callback for block {b} while REML gradient requires H_beta term"
) }.into()),
}
}
BlockWorkingSet::Diagonal {
working_response: _,
working_weights,
} => {
let x_dyn = diagonal_design.as_ref().ok_or_else(|| {
format!("missing dynamic design for block {b} diagonal correction")
})?;
let wwork = floor_positiveworking_weights(working_weights, options.minweight);
let x_dense = x_dyn.to_dense();
let n = x_dense.nrows();
let mut d_eta = x_dyn.matrixvectormultiply(direction);
let geom = family.block_geometry_directional_derivative(
&inner.block_states,
b,
spec,
direction,
)?;
let mut correction_mat = Array2::<f64>::zeros((p, p));
if let Some(geom_dir) = geom {
d_eta += &geom_dir.d_offset;
if let Some(dx) = geom_dir.d_design {
d_eta += &dx.dot(&beta_flat);
let mut wx = x_dense.clone();
let mut wdx = dx.clone();
ndarray::Zip::from(wx.rows_mut())
.and(wdx.rows_mut())
.and(wwork.view())
.par_for_each(|mut wxr, mut wdxr, &wi| {
if wi != 1.0 {
wxr.mapv_inplace(|v| v * wi);
wdxr.mapv_inplace(|v| v * wi);
}
});
// Same X'(W·Y) pattern as the parallel sibling at
// line ~9258; route through faer for SIMD GEMM
// (n × p² flops at large-scale moderate scale).
correction_mat += &fast_atb(&dx, &wx);
correction_mat += &fast_atb(&x_dense, &wdx);
}
}
let dw = family
.diagonalworking_weights_directional_derivative(
&inner.block_states,
b,
&d_eta,
)?
.ok_or_else(|| {
format!(
"missing diagonal dW callback for block {b} while REML gradient requires H_beta term"
)
})?;
if dw.len() != n {
return Err(CustomFamilyError::DimensionMismatch {
reason: format!(
"block {b} diagonal dW length mismatch: got {}, expected {}",
dw.len(),
n
),
}
.into());
}
let mut scaled_x = x_dense.clone();
ndarray::Zip::from(scaled_x.rows_mut())
.and(&dw)
.par_for_each(|mut sr, &dwi| sr.mapv_inplace(|v| v * dwi));
// X'(diag(dW)·X) outer correction term — faer route, same
// rationale as above.
correction_mat += &fast_atb(&x_dense, &scaled_x);
Ok(Some(DriftDerivResult::Dense(correction_mat)))
}
}
};
// Build a derivative provider that computes D²_β H_L[u, v] on demand.
let compute_d2h = |u: &Array1<f64>,
v: &Array1<f64>|
-> Result<Option<DriftDerivResult>, String> {
if !include_logdet_h {
return Ok(None);
}
match work {
BlockWorkingSet::ExactNewton { .. } => {
match family.exact_newton_hessian_second_directional_derivative(
&inner.block_states,
b,
u,
v,
)? {
Some(h_exact) => Ok(Some(DriftDerivResult::Dense(symmetrized_square_matrix(
h_exact,
p,
&format!("block {b} exact-newton d2H shape mismatch"),
)?))),
None => Err(CustomFamilyError::UnsupportedConfiguration { reason: format!(
"missing exact-newton d2H callback for block {b} while REML Hessian requires H_beta_beta term"
) }.into()),
}
}
BlockWorkingSet::Diagonal {
working_response: _,
working_weights: _,
} => {
let x_dyn = diagonal_design.as_ref().ok_or_else(|| {
format!("missing dynamic design for block {b} diagonal second correction")
})?;
let x_dense = x_dyn.to_dense();
let n = x_dense.nrows();
let reject_second_order_geometry = |label: &str,
geom: Option<
BlockGeometryDirectionalDerivative,
>|
-> Result<(), String> {
if let Some(geom_dir) = geom {
let has_offset = geom_dir.d_offset.iter().any(|value| *value != 0.0);
if geom_dir.d_design.is_some() || has_offset {
return Err(CustomFamilyError::UnsupportedConfiguration { reason: format!(
"block {b} diagonal d2H requires second-order block-geometry derivatives for {label}; use an exact-newton or joint outer path"
) }.into());
}
}
Ok(())
};
reject_second_order_geometry(
"first direction",
family.block_geometry_directional_derivative(
&inner.block_states,
b,
spec,
u,
)?,
)?;
reject_second_order_geometry(
"second direction",
family.block_geometry_directional_derivative(
&inner.block_states,
b,
spec,
v,
)?,
)?;
let d_eta_u = x_dyn.matrixvectormultiply(u);
let d_eta_v = x_dyn.matrixvectormultiply(v);
let d2w = family
.diagonalworking_weights_second_directional_derivative(
&inner.block_states,
b,
&d_eta_u,
&d_eta_v,
)?
.ok_or_else(|| {
format!(
"missing diagonal d2W callback for block {b} while REML Hessian requires H_beta_beta term"
)
})?;
if d2w.len() != n {
return Err(CustomFamilyError::DimensionMismatch {
reason: format!(
"block {b} diagonal d2W length mismatch: got {}, expected {}",
d2w.len(),
n
),
}
.into());
}
let mut scaled_x = x_dense.clone();
ndarray::Zip::from(scaled_x.rows_mut())
.and(&d2w)
.par_for_each(|mut sr, &d2wi| sr.mapv_inplace(|value| value * d2wi));
Ok(Some(DriftDerivResult::Dense(fast_atb(&x_dense, &scaled_x))))
}
}
};
let eval_result = joint_outer_evaluate(
&inner,
specs,
&per_block,
rho_current,
&beta_flat,
JointHessianSource::Dense(h_joint_unpen),
&ranges,
total,
ridge,
moderidge,
extra_logdet_ridge,
1.0,
0.0,
include_logdet_h,
include_logdet_s,
strict_spd,
family.use_projected_penalty_logdet(),
eval_mode,
options,
rho_prior,
family.pseudo_logdet_mode(),
&compute_dh,
None,
&compute_d2h,
None,
None,
None,
None,
None,
None, // no ext_coords for generic single-block fallback
None,
custom_family_batched_outer_hessian_operator(
family,
&inner.block_states,
specs,
derivative_blocks.as_ref(),
rho_current,
inner.joint_workspace.clone(),
eval_mode,
)?,
robust_jeffreys_hphi,
custom_family_outer_jeffreys_hphi_drift(family, &inner.block_states, specs, &ranges)?,
)?;
Ok(eval_result)
}
/// Evaluates the joint hyper-parameter objective for a custom family from
/// borrowed ψ-derivative blocks.
///
/// The block specs are validated to obtain the per-block penalty counts, the
/// fit options and warm start are passed through
/// `derivative_quality_options_and_warm_start`, and the evaluation itself is
/// delegated to `evaluate_custom_family_hyper_internal` with a flat rho prior.
/// The internal result is converted with
/// `outer_eval_result_to_joint_hyper_result`.
///
/// # Errors
///
/// Propagates any error from block-spec validation or from the internal
/// evaluator.
pub fn evaluate_custom_family_joint_hyper<F: CustomFamily + Clone + Send + Sync + 'static>(
    family: &F,
    specs: &[ParameterBlockSpec],
    options: &BlockwiseFitOptions,
    rho_current: &Array1<f64>,
    derivative_blocks: &[Vec<CustomFamilyBlockPsiDerivative>],
    warm_start: Option<&CustomFamilyWarmStart>,
    eval_mode: EvalMode,
) -> Result<CustomFamilyJointHyperResult, CustomFamilyError> {
    let penalty_counts = validate_blockspecs(specs)?;
    // ψ coordinates are present as soon as one block carries a derivative.
    let psi_present = !derivative_blocks.iter().all(|block| block.is_empty());
    let (tuned_options, sanitized_warm_start) =
        derivative_quality_options_and_warm_start(options, warm_start, psi_present);
    // Prefer the warm start produced by the options helper; otherwise fall
    // back to the caller's warm start unchanged.
    let inner_warm_start = match sanitized_warm_start.as_ref() {
        Some(sanitized) => Some(&sanitized.inner),
        None => warm_start.map(|caller| &caller.inner),
    };
    let internal_result = evaluate_custom_family_hyper_internal(
        family,
        specs,
        &tuned_options,
        &penalty_counts,
        rho_current,
        derivative_blocks,
        inner_warm_start,
        crate::types::RhoPrior::Flat,
        eval_mode,
    )?;
    Ok(outer_eval_result_to_joint_hyper_result(internal_result))
}
/// Shared-ownership variant of the joint hyper-parameter evaluation: the
/// ψ-derivative blocks arrive as `SharedDerivativeBlocks` and are handed by
/// value to `evaluate_custom_family_hyper_internal_shared`.
///
/// The block specs are validated to obtain the per-block penalty counts, the
/// fit options and warm start are passed through
/// `derivative_quality_options_and_warm_start`, and the evaluation uses a flat
/// rho prior. The internal result is converted with
/// `outer_eval_result_to_joint_hyper_result`.
///
/// # Errors
///
/// Propagates any error from block-spec validation or from the internal
/// evaluator.
pub(crate) fn evaluate_custom_family_joint_hyper_shared<
    F: CustomFamily + Clone + Send + Sync + 'static,
>(
    family: &F,
    specs: &[ParameterBlockSpec],
    options: &BlockwiseFitOptions,
    rho_current: &Array1<f64>,
    derivative_blocks: SharedDerivativeBlocks,
    warm_start: Option<&CustomFamilyWarmStart>,
    eval_mode: EvalMode,
) -> Result<CustomFamilyJointHyperResult, CustomFamilyError> {
    let penalty_counts = validate_blockspecs(specs)?;
    // ψ coordinates are present as soon as one block carries a derivative.
    let psi_present = !derivative_blocks.iter().all(|block| block.is_empty());
    let (tuned_options, sanitized_warm_start) =
        derivative_quality_options_and_warm_start(options, warm_start, psi_present);
    // Prefer the warm start produced by the options helper; otherwise fall
    // back to the caller's warm start unchanged.
    let inner_warm_start = match sanitized_warm_start.as_ref() {
        Some(sanitized) => Some(&sanitized.inner),
        None => warm_start.map(|caller| &caller.inner),
    };
    let internal_result = evaluate_custom_family_hyper_internal_shared(
        family,
        specs,
        &tuned_options,
        &penalty_counts,
        rho_current,
        derivative_blocks,
        inner_warm_start,
        crate::types::RhoPrior::Flat,
        eval_mode,
    )?;
    Ok(outer_eval_result_to_joint_hyper_result(internal_result))
}
/// Derives the fit options and warm start used by a direct joint-hyper
/// evaluation.
///
/// * Without ψ derivatives the options are returned as an unmodified clone and
///   no replacement warm start is produced.
/// * With ψ derivatives the warm start is passed through
///   `warm_start_without_cached_inner_for_psi_derivatives`, and — provided more
///   than one inner cycle is allowed and the tolerances differ — the inner
///   tolerance is set to `max(outer_tol, 1e-10)`. When that change tightens the
///   inner tolerance, the inner cycle budget is raised to at least 200.
///
/// Returns `(options, replacement_warm_start)`.
fn derivative_quality_options_and_warm_start(
    options: &BlockwiseFitOptions,
    warm_start: Option<&CustomFamilyWarmStart>,
    has_psi_derivatives: bool,
) -> (BlockwiseFitOptions, Option<CustomFamilyWarmStart>) {
    const DIRECT_JOINT_HYPER_INNER_TOL_FLOOR: f64 = 1e-10;
    const DIRECT_JOINT_HYPER_MIN_CYCLES: usize = 200;

    let mut tuned = options.clone();

    // Tolerance alignment only serves evaluations with real ψ coordinates,
    // where the inner solve must be resolved at the outer optimizer's
    // derivative scale. With no ψ-derivative blocks this is the rho-only outer
    // surface, and changing its inner tolerance would make the direct
    // joint-hyper path evaluate a different function than the rho-only path.
    if !has_psi_derivatives {
        return (tuned, None);
    }

    // The inner target follows the outer target instead of a fixed
    // f64-precision KKT tolerance: large-scale survival marginal-slope fits
    // have row-summed objectives around 1e5-1e6, so `1e-10 * objective` would
    // demand gradient components far below the optimizer's own `outer_tol`.
    // Matching the two keeps IFT gradient noise below the requested accuracy
    // without rejecting every startup seed after hundreds of accepted but
    // numerically flat Newton steps.
    let target_inner_tol = tuned.outer_tol.max(DIRECT_JOINT_HYPER_INNER_TOL_FLOOR);

    let sanitized_warm_start = warm_start_without_cached_inner_for_psi_derivatives(
        warm_start.map(|warm| &warm.inner),
        true,
    )
    .map(|inner| CustomFamilyWarmStart { inner });

    let should_align = tuned.inner_max_cycles > 1 && tuned.inner_tol != target_inner_tol;
    if should_align {
        // Decide "tightening" against the tolerance as supplied, before it is
        // overwritten below.
        let tightening = tuned.inner_tol > target_inner_tol;
        tuned.inner_tol = target_inner_tol;
        if tightening {
            tuned.inner_max_cycles = tuned.inner_max_cycles.max(DIRECT_JOINT_HYPER_MIN_CYCLES);
        }
    }

    (tuned, sanitized_warm_start)
}
/// Returns a copy of `options` whose `outer_tol` is the larger of the existing
/// value and the supplied `outer_tol`; all other fields are cloned unchanged.
pub(crate) fn joint_hyper_options_for_outer_tolerance(
    options: &BlockwiseFitOptions,
    outer_tol: f64,
) -> BlockwiseFitOptions {
    let widened_tol = options.outer_tol.max(outer_tol);
    let mut widened = options.clone();
    widened.outer_tol = widened_tol;
    widened
}
fn evaluate_custom_family_joint_hyper_efs_internal_shared<
F: CustomFamily + Clone + Send + Sync + 'static,
>(
family: &F,
specs: &[ParameterBlockSpec],
options: &BlockwiseFitOptions,
penalty_counts: &[usize],
rho_current: &Array1<f64>,
derivative_blocks: SharedDerivativeBlocks,
warm_start: Option<&ConstrainedWarmStart>,
) -> Result<
(
crate::solver::outer_strategy::EfsEval,
ConstrainedWarmStart,
bool,
),
CustomFamilyError,
> {
if derivative_blocks.len() != specs.len() {
crate::bail_dim_custom!(
"joint hyper derivative block count mismatch: got {}, expected {}",
derivative_blocks.len(),
specs.len()
);
}
if penalty_counts.len() != specs.len() {
crate::bail_dim_custom!(
"joint hyper penalty-count block mismatch: got {}, expected {}",
penalty_counts.len(),
specs.len()
);
}
let rho_dim = penalty_counts.iter().sum::<usize>();
let psi_dim = derivative_blocks.iter().map(Vec::len).sum::<usize>();
if psi_dim == 0 {
return Err(CustomFamilyError::InvalidInput {
context: "evaluate_custom_family_joint_hyper_efs",
reason: "joint hyper EFS requires at least one ψ coordinate".to_string(),
});
}
if rho_current.len() != rho_dim {
crate::bail_dim_custom!(
"joint hyper rho dimension mismatch: got {}, expected {} (psi={})",
rho_current.len(),
rho_dim,
psi_dim
);
}
let include_logdet_h = include_exact_newton_logdet_h(family, options);
let include_logdet_s = include_exact_newton_logdet_s(family, options);
let strict_spd = use_exact_newton_strict_spd(family);
let per_block = split_log_lambdas(rho_current, penalty_counts)?;
let psi_safe_warm_start = warm_start_without_cached_inner_for_psi_derivatives(warm_start, true);
let mut inner = inner_blockwise_fit(
family,
specs,
&per_block,
options,
psi_safe_warm_start.as_ref().or(warm_start),
)?;
if !inner.converged {
let theta_dim = rho_dim + psi_dim;
log::warn!(
"[OUTER] custom-family joint-hyper EFS inner solve did not converge after {} cycle(s); \
skipping joint-hyper EFS derivative assembly for theta_dim={} (rho_dim={}, psi_dim={})",
inner.cycles,
theta_dim,
rho_dim,
psi_dim,
);
return nonconverged_outer_efs_result(
&inner,
rho_current,
theta_dim,
include_logdet_h,
include_logdet_s,
"custom-family joint-hyper EFS non-converged inner solve",
)
.map_err(CustomFamilyError::from);
}
let ridge = effective_solverridge(options.ridge_floor);
let moderidge = if options.ridge_policy.include_quadratic_penalty {
ridge
} else {
0.0
};
let extra_logdet_ridge = if options.ridge_policy.include_penalty_logdet
&& !options.ridge_policy.include_quadratic_penalty
{
ridge
} else {
0.0
};
refresh_all_block_etas(family, specs, &mut inner.block_states)?;
let ranges = block_param_ranges(specs);
let total = ranges.last().map(|(_, e)| *e).unwrap_or(0);
let beta_flat = flatten_state_betas(&inner.block_states, specs);
let synced_joint_states = Arc::new(synchronized_states_from_flat_beta(
family,
specs,
&inner.block_states,
&beta_flat,
)?);
let hessian_workspace = family.exact_newton_joint_hessian_workspace_with_options(
synced_joint_states.as_ref(),
specs,
options,
)?;
// Outer-eval entry: prime per-row jet caches before the ext-coord
// par_iter — see `warm_up_outer_caches` doc.
if let Some(workspace) = hessian_workspace.as_ref() {
workspace.warm_up_outer_caches()?;
}
let (
h_joint_unpen,
rho_curvature_scale,
hessian_logdet_correction,
use_outer_curvature_derivatives,
) = if let Some(curvature) = family.exact_newton_outer_curvature(&inner.block_states)? {
(
JointHessianSource::Dense(symmetrized_square_matrix(
curvature.hessian,
total,
"joint exact-newton Hessian shape mismatch in joint hyper EFS evaluator (rescaled)",
)?),
curvature.rho_curvature_scale,
curvature.hessian_logdet_correction,
true,
)
} else {
let h_joint_unpen = if let Some(workspace) = hessian_workspace.as_ref() {
exact_newton_joint_hessian_source_from_workspace(
workspace,
total,
MaterializationIntent::OuterEvaluation,
"joint exact-newton operator mismatch in joint hyper EFS evaluator",
)?
} else {
None
};
(
match h_joint_unpen {
Some(source) => Some(source),
None => exact_newton_joint_hessian_symmetrized(
family,
&inner.block_states,
specs,
total,
"joint exact-newton Hessian shape mismatch in joint hyper EFS evaluator",
)
.map(|source| source.map(JointHessianSource::Dense))?,
}
.ok_or_else(|| -> CustomFamilyError {
"joint exact-newton Hessian unavailable for full [rho, psi] fixed-point outer calculus"
.to_string()
.into()
})?,
1.0,
0.0,
false,
)
};
let s_logdet_blocks = if include_logdet_s {
use rayon::iter::{IntoParallelIterator, ParallelIterator};
let block_results: Vec<Result<PenaltyPseudologdet, String>> = (0..specs.len())
.into_par_iter()
.map(|b| {
let spec = &specs[b];
let p = spec.design.ncols();
let lambdas = per_block[b].mapv(f64::exp);
let mut s_lambda = Array2::<f64>::zeros((p, p));
for (k, s) in spec.penalties.iter().enumerate() {
s.add_scaled_to(lambdas[k], &mut s_lambda);
}
let ridge_hint = if options.ridge_policy.include_penalty_logdet {
for d in 0..p {
s_lambda[[d, d]] += ridge;
}
Some(ridge)
} else {
None
};
// No metadata-based structural-nullity hint: the
// PenaltyPseudologdet classifier derives the positive
// eigenspace from the assembled spectrum alone (issues
// #192/#318).
PenaltyPseudologdet::from_assembled(s_lambda, ridge_hint)
})
.collect();
let blocks: Result<Vec<_>, _> = block_results.into_iter().collect();
Some(blocks?)
} else {
None
};
let hessian_beta_independent = !family.exact_newton_joint_hessian_beta_dependent();
let psi_workspace = if family.exact_newton_joint_psi_workspace_for_first_order_terms() {
family.exact_newton_joint_psi_workspace_with_options(
synced_joint_states.as_ref(),
specs,
derivative_blocks.as_ref(),
options,
)?
} else {
None
};
let rho_slice = rho_current
.as_slice()
.ok_or_else(|| "outer rho vector must be contiguous".to_string())?;
let psi_coords = build_psi_hyper_coords(
family,
synced_joint_states.as_ref(),
specs,
derivative_blocks.as_ref(),
&beta_flat,
rho_slice,
penalty_counts,
s_logdet_blocks.as_deref(),
hessian_beta_independent,
psi_workspace.clone(),
)?;
let ext_bundle = ExtCoordBundle {
coords: psi_coords,
ext_ext_fn: None,
rho_ext_fn: None,
drift_fn: None,
contracted_psi_fn: None,
};
let compute_dh = exact_newton_dh_closure(
family,
Arc::clone(&synced_joint_states),
specs,
total,
use_outer_curvature_derivatives,
if use_outer_curvature_derivatives {
1.0
} else {
rho_curvature_scale
},
hessian_workspace.clone(),
);
let compute_dh_many = if use_outer_curvature_derivatives {
None
} else {
exact_newton_dh_many_closure(rho_curvature_scale, hessian_workspace.clone())
};
let compute_d2h = exact_newton_d2h_closure(
family,
Arc::clone(&synced_joint_states),
specs,
total,
use_outer_curvature_derivatives,
if use_outer_curvature_derivatives {
1.0
} else {
rho_curvature_scale
},
hessian_workspace.clone(),
);
let owned_compute_dh = exact_newton_dh_closure_owned(
family.clone(),
Arc::clone(&synced_joint_states),
specs.to_vec(),
total,
use_outer_curvature_derivatives,
if use_outer_curvature_derivatives {
1.0
} else {
rho_curvature_scale
},
hessian_workspace.clone(),
);
let owned_compute_dh_many = if use_outer_curvature_derivatives {
None
} else {
exact_newton_dh_many_closure_owned(rho_curvature_scale, hessian_workspace.clone())
};
let owned_compute_d2h = exact_newton_d2h_closure_owned(
family.clone(),
Arc::clone(&synced_joint_states),
specs.to_vec(),
total,
use_outer_curvature_derivatives,
if use_outer_curvature_derivatives {
1.0
} else {
rho_curvature_scale
},
hessian_workspace.clone(),
);
let compute_d2h_many = if use_outer_curvature_derivatives {
None
} else {
exact_newton_d2h_many_closure(rho_curvature_scale, hessian_workspace.clone())
};
let owned_compute_d2h_many = if use_outer_curvature_derivatives {
None
} else {
exact_newton_d2h_many_closure_owned(rho_curvature_scale, hessian_workspace.clone())
};
let efs_eval = joint_outer_evaluate_efs(
&inner,
specs,
&per_block,
rho_current,
&beta_flat,
h_joint_unpen,
&ranges,
total,
ridge,
moderidge,
extra_logdet_ridge,
rho_curvature_scale,
hessian_logdet_correction,
include_logdet_h,
include_logdet_s,
strict_spd,
// ψ-bearing EFS path: projected #752 generalized determinant for value
// and gradient (matched in this single _efs call). Same root-cause fix as
// the VGH ψ path (gam#808/#787); no batched override here.
family.use_projected_penalty_logdet(),
options,
crate::types::RhoPrior::Flat,
family.pseudo_logdet_mode(),
&compute_dh,
compute_dh_many.as_deref(),
&compute_d2h,
compute_d2h_many.as_deref(),
Some(owned_compute_dh),
owned_compute_dh_many,
Some(owned_compute_d2h),
owned_compute_d2h_many,
Some(ext_bundle),
)
.map_err(CustomFamilyError::from)?;
let warm = ConstrainedWarmStart {
rho: rho_current.clone(),
block_beta: inner
.block_states
.iter()
.map(|state| state.beta.clone())
.collect(),
active_sets: inner.active_sets.clone(),
cached_inner: Some(cached_inner_mode_from_result(&inner)),
};
Ok((efs_eval, warm, inner.converged))
}
/// Fixed-point evaluation of the joint custom-family hyper-surface for the
/// outer EFS / hybrid-EFS planners, taking the derivative blocks by borrow.
///
/// This is a thin adapter. The borrowed `derivative_blocks` are copied into a
/// freshly allocated `Arc` and forwarded to
/// `evaluate_custom_family_joint_hyper_efs_shared`, which is the single place
/// that performs validation, the empty-block fast path, and the evaluator
/// dispatch.
///
/// # Errors
///
/// Propagates whatever error the shared entry point returns.
pub fn evaluate_custom_family_joint_hyper_efs<F: CustomFamily + Clone + Send + Sync + 'static>(
    family: &F,
    specs: &[ParameterBlockSpec],
    options: &BlockwiseFitOptions,
    rho_current: &Array1<f64>,
    derivative_blocks: &[Vec<CustomFamilyBlockPsiDerivative>],
    warm_start: Option<&CustomFamilyWarmStart>,
) -> Result<CustomFamilyJointHyperEfsResult, CustomFamilyError> {
    // One deep copy of the per-block derivative lists; everything downstream
    // shares it through the `Arc`.
    let shared_blocks = Arc::new(derivative_blocks.to_vec());
    evaluate_custom_family_joint_hyper_efs_shared(
        family,
        specs,
        options,
        rho_current,
        shared_blocks,
        warm_start,
    )
}
/// Shared-ownership entry point for the joint hyper-surface EFS evaluation.
///
/// Steps, in order:
/// 1. validate the block specs (this also yields the per-block penalty counts);
/// 2. require exactly one derivative list per block spec;
/// 3. if every derivative list is empty, evaluate through `outerobjectiveefs`
///    with a flat rho prior; otherwise evaluate through
///    `evaluate_custom_family_joint_hyper_efs_internal_shared`;
/// 4. repackage the evaluation, the refreshed warm start, and the inner
///    convergence flag into a `CustomFamilyJointHyperEfsResult`.
///
/// # Errors
///
/// Fails when spec validation fails, when the derivative block count differs
/// from the spec count, or when the selected evaluator fails.
pub(crate) fn evaluate_custom_family_joint_hyper_efs_shared<
    F: CustomFamily + Clone + Send + Sync + 'static,
>(
    family: &F,
    specs: &[ParameterBlockSpec],
    options: &BlockwiseFitOptions,
    rho_current: &Array1<f64>,
    derivative_blocks: SharedDerivativeBlocks,
    warm_start: Option<&CustomFamilyWarmStart>,
) -> Result<CustomFamilyJointHyperEfsResult, CustomFamilyError> {
    let penalty_counts = validate_blockspecs(specs)?;

    let (got_blocks, expected_blocks) = (derivative_blocks.len(), specs.len());
    if got_blocks != expected_blocks {
        crate::bail_dim_custom!(
            "joint hyper derivative block count mismatch: got {}, expected {}",
            got_blocks,
            expected_blocks
        );
    }

    // Both evaluators consume only the inner part of the caller's warm start.
    let inner_warm_start = warm_start.map(|outer| &outer.inner);
    let has_psi_derivatives = derivative_blocks.iter().any(|block| !block.is_empty());

    let (efs_eval, next_warm_start, inner_converged) = if has_psi_derivatives {
        evaluate_custom_family_joint_hyper_efs_internal_shared(
            family,
            specs,
            options,
            &penalty_counts,
            rho_current,
            derivative_blocks,
            inner_warm_start,
        )?
    } else {
        // No derivative entries anywhere: the rho-only objective applies.
        outerobjectiveefs(
            family,
            specs,
            options,
            &penalty_counts,
            rho_current,
            inner_warm_start,
            crate::types::RhoPrior::Flat,
        )
        .map_err(CustomFamilyError::from)?
    };

    Ok(outer_efs_result_to_joint_hyper_efs_result(
        efs_eval,
        next_warm_start,
        inner_converged,
    ))
}
/// Converts the per-block offsets from `block_offsets_from_specs` into plain
/// `(start, end)` pairs, preserving their order.
fn block_param_ranges(specs: &[ParameterBlockSpec]) -> Vec<(usize, usize)> {
    let offsets = block_offsets_from_specs(specs);
    let mut pairs = Vec::with_capacity(specs.len());
    for span in offsets.iter() {
        pairs.push((span.start, span.end));
    }
    pairs
}
/// Build the joint Jeffreys/Firth basis `Z_J` (block-diagonal stack of each
/// block's per-block span) for the universal robustness term.
///
/// Each block contributes its FULL reduced coefficient span (`I_p` per block) —
/// the principled cure. Because the Jeffreys score is `O(1)` against the data's
/// `O(n)` Fisher information, applying it on the full span is the `O(1/n)` Firth
/// bias correction on data-identified directions (no bias on genuine smooth
/// fits) and the missing `O(1)`-bounding curvature on ANY near-separating
/// direction — penalized (`range(S)`) or not (`ker(S)`) — so the inner objective
/// becomes coercive with a finite unique minimizer. The previous `ker(S)`-only
/// scoping could not reach a near-separation on a penalized spline direction,
/// which was the residual BMS-probit pathology.
///
/// The per-block bases are embedded block-diagonally into the joint
/// `total_p x m_total` matrix. Returns `None` only for an empty system.
///
/// The Jeffreys conditioning gate, not the smoothing penalty null space,
/// decides whether this basis contributes at the current iterate.
fn build_joint_jeffreys_subspace(
specs: &[ParameterBlockSpec],
ranges: &[(usize, usize)],
) -> Result<Option<Array2<f64>>, String> {
let total_p = ranges.last().map(|(_, e)| *e).unwrap_or(0);
if total_p == 0 {
return Ok(None);
}
let mut per_block: Vec<Array2<f64>> = Vec::with_capacity(specs.len());
let mut m_total = 0usize;
for (b, _spec) in specs.iter().enumerate() {
let (start, end) = ranges[b];
let p_block = end - start;
// Full identifiable-span Jeffreys: `Z_J = I_{p_block}` over the entire
// reduced block coefficient space. The aggregate penalty only fixes the
// block dimension; the span no longer depends on `ker(S)`.
let aggregate = Array2::<f64>::zeros((p_block, p_block));
let subspace = crate::estimate::reml::jeffreys_subspace::jeffreys_subspace_from_penalty(
aggregate.view(),
)?;
m_total += subspace.span_dim();
per_block.push(subspace.columns);
}
if m_total == 0 {
return Ok(None);
}
let mut z_joint = Array2::<f64>::zeros((total_p, m_total));
let mut col_cursor = 0usize;
for (b, columns) in per_block.iter().enumerate() {
let (start, _) = ranges[b];
let m_block = columns.ncols();
let p_block = columns.nrows();
for j in 0..m_block {
for i in 0..p_block {
z_joint[[start + i, col_cursor + j]] = columns[[i, j]];
}
}
col_cursor += m_block;
}
Ok(Some(z_joint))
}
/// Cheap, matrix-free conditioning pre-check: can the always-on Jeffreys term
/// be PROVABLY skipped at this working point without forming the dense joint
/// Hessian `H` or running the `O(p³)` reduced eigendecomposition?
///
/// This is the performance gate in front of the expensive
/// `custom_family_joint_jeffreys_*` formation. On the full span (`Z_J = I`) the
/// reduced information is `H_id = H`, so the conditioning gate only needs the
/// extreme eigenvalues of `H`, which `jeffreys_term_skippable_via_matvec`
/// bounds conservatively from a few Hessian-vector products against the same
/// `JointHessianSource` the inner Newton already built. When those bounds clear
/// the gates with margin, the exact gate is certain to return the zero term,
/// so the caller can skip the dense materialization, the `Z_JᵀHZ_J` build, the
/// eigendecomposition, the `∇Φ`/`H_Φ` assembly, and the outer drift.
///
/// # Returns
///
/// `Ok(false)` (never skip) when `total_p` is below
/// `CHEAP_CONDITIONING_PRECHECK_MIN_DIM`, so small fits pay nothing for the
/// pre-check; otherwise the verdict of `jeffreys_term_skippable_via_matvec`,
/// which is `false` whenever the cheap bounds are unresolved or near the gate.
///
/// # Errors
///
/// Propagates failures of the operator's `apply` or of the matvec pre-check.
fn jeffreys_term_skippable_for_source(
    source: &JointHessianSource,
    total_p: usize,
) -> Result<bool, String> {
    let precheck_min_dim =
        crate::estimate::reml::jeffreys_subspace::CHEAP_CONDITIONING_PRECHECK_MIN_DIM;
    if total_p < precheck_min_dim {
        return Ok(false);
    }

    // Hessian-vector product against the same observed information the exact
    // gate sees: with `Z_J = I` the reduced information is the unridged joint
    // likelihood Hessian itself, so the skip decision and the exact gate look
    // at one matrix and there is no ridge discrepancy between them.
    let hessian_times = |direction: &Array1<f64>| -> Result<Array1<f64>, String> {
        match source {
            JointHessianSource::Operator { apply, .. } => apply(direction),
            JointHessianSource::Dense(matrix) => Ok(matrix.dot(direction)),
        }
    };

    crate::estimate::reml::jeffreys_subspace::jeffreys_term_skippable_via_matvec(
        hessian_times,
        total_p,
    )
}
/// Value-only evaluation of the Jeffreys objective
/// `Phi = 1/2 log|Z_J^T H Z_J|` at the current working point.
///
/// No directional derivatives are requested (the derivative callback always
/// answers `Ok(None)`), which makes this cheaper than the full term. It keeps
/// the trust-region accept/reject objective consistent with the
/// Jeffreys-modified Newton step.
///
/// # Returns
///
/// `0.0` in every case where the value cannot be produced: no coefficient
/// system, an empty basis, a family without an exact joint Hessian (or one
/// that errors), a Hessian whose shape is not `total_p x total_p`, or a failed
/// term evaluation. The contribution is then simply omitted for that trial
/// point; errors are deliberately not surfaced here.
fn custom_family_joint_jeffreys_value<F: CustomFamily + Clone + Send + Sync + 'static>(
    family: &F,
    states: &[ParameterBlockState],
    specs: &[ParameterBlockSpec],
    ranges: &[(usize, usize)],
    z_joint: &Array2<f64>,
) -> f64 {
    let total_p = ranges.last().map_or(0, |&(_, end)| end);
    if total_p == 0 || z_joint.ncols() == 0 {
        return 0.0;
    }

    let Ok(Some(h_joint)) = family.exact_newton_joint_hessian_with_specs(states, specs) else {
        return 0.0;
    };
    if h_joint.nrows() != total_p || h_joint.ncols() != total_p {
        return 0.0;
    }

    crate::estimate::reml::jeffreys_subspace::joint_jeffreys_term(
        h_joint.view(),
        z_joint.view(),
        |_direction: &Array1<f64>| Ok(None),
    )
    .map_or(0.0, |(phi, _grad, _hphi)| phi)
}
/// Full family-general Jeffreys term `(Phi, grad, H_Phi)` at the current
/// working point, built from the coupled joint Hessian (Tier-B path).
///
/// Directional derivatives of the joint Hessian are obtained on demand from
/// `exact_newton_joint_hessian_directional_derivative_with_specs`.
///
/// # Returns
///
/// `Ok(None)` when the term is inapplicable and the caller should proceed
/// unchanged: no coefficient system, an empty basis, a family that exposes no
/// exact joint Hessian, or a Hessian whose shape is not `total_p x total_p`.
///
/// # Errors
///
/// Propagates failures from the family's Hessian evaluation and from
/// `joint_jeffreys_term`.
fn custom_family_joint_jeffreys_term<F: CustomFamily + Clone + Send + Sync + 'static>(
    family: &F,
    states: &[ParameterBlockState],
    specs: &[ParameterBlockSpec],
    ranges: &[(usize, usize)],
    z_joint: &Array2<f64>,
) -> Result<Option<(f64, Array1<f64>, Array2<f64>)>, String> {
    let total_p = ranges.last().map_or(0, |&(_, end)| end);
    if total_p == 0 || z_joint.ncols() == 0 {
        return Ok(None);
    }

    let Some(h_joint) = family.exact_newton_joint_hessian_with_specs(states, specs)? else {
        return Ok(None);
    };
    let shape_matches = h_joint.nrows() == total_p && h_joint.ncols() == total_p;
    if !shape_matches {
        return Ok(None);
    }

    let hessian_derivative_along = |direction: &Array1<f64>| {
        family.exact_newton_joint_hessian_directional_derivative_with_specs(
            states, specs, direction,
        )
    };
    let term = crate::estimate::reml::jeffreys_subspace::joint_jeffreys_term(
        h_joint.view(),
        z_joint.view(),
        hessian_derivative_along,
    )?;
    Ok(Some(term))
}
/// Outer-REML full-span Jeffreys value and curvature `(Φ, H_Φ)` for the coupled
/// joint Hessian.
///
/// This is the OUTER-path companion to the inner-Newton wiring: the LAML score
/// uses `log|H + S_λ + H_Φ|` and its analytic ρ-derivatives
/// `tr((H+S_λ+H_Φ)⁻¹ ∂_ρ(H+S_λ+H_Φ))`.
///
/// # Correctness note
///
/// `H_Φ` has no explicit ρ-dependence but depends on ρ implicitly through the
/// mode β̂(ρ), because it is built from `H_id = Z_Jᵀ H Z_J` and
/// `D_a = Z_Jᵀ ∂_a H Z_J`. The exact outer gradient of `½ log|H+S_λ+H_Φ|`
/// therefore carries a `½ tr[(·)⁻¹ D_β H_Φ[v_k]]` drift term alongside the
/// likelihood drift `D_β H[v_k]`. Folding `H_Φ` into the `HessianOperator` is
/// necessary but not sufficient; the drift comes from the companion
/// `custom_family_outer_jeffreys_hphi_drift`. Without it the analytic gradient
/// describes a different objective than the value, which breaks the line
/// search / KKT certification in the near-separating regime where the
/// Jeffreys term is active.
///
/// # Returns
///
/// `Ok(None)` when the family does not require the joint Jeffreys term, when
/// no joint basis can be built, or when the term itself is inapplicable.
///
/// # Errors
///
/// Propagates failures from basis construction and term evaluation.
fn custom_family_outer_jeffreys_hphi<F: CustomFamily + Clone + Send + Sync + 'static>(
    family: &F,
    states: &[ParameterBlockState],
    specs: &[ParameterBlockSpec],
    ranges: &[(usize, usize)],
) -> Result<Option<(f64, Array2<f64>)>, String> {
    if !family.joint_jeffreys_term_required() {
        return Ok(None);
    }
    let Some(z_joint) = build_joint_jeffreys_subspace(specs, ranges)? else {
        return Ok(None);
    };

    // The gated VALUE is returned together with the curvature: the outer LAML
    // folds `−Φ(β̂)` into its cost (the inner mode is Φ-augmented-stationary,
    // so the envelope identity only holds for the Φ-folded criterion —
    // gam#979), and both must come from the SAME term evaluation.
    let term = custom_family_joint_jeffreys_term(family, states, specs, ranges, &z_joint)?;
    Ok(term.map(|(phi, _grad, hphi)| (phi, hphi)))
}
/// Decides whether the batched outer-gradient override is permitted given the
/// robust Jeffreys curvature.
///
/// Returns `true` when no curvature matrix is supplied, or when every entry of
/// the supplied matrix compares equal to `0.0`. Any non-zero entry, and any
/// NaN (which never compares equal to zero), yields `false`.
fn batched_outer_gradient_contract_allows_override(
    robust_jeffreys_hphi: Option<&Array2<f64>>,
) -> bool {
    robust_jeffreys_hphi.map_or(true, |hphi| !hphi.iter().any(|&entry| entry != 0.0))
}
/// Build the Tier-B Jeffreys-curvature drift closure `D_β H_Φ[δβ]` for the outer
/// gradient, evaluated at the current outer point (states = β̂(ρ)).
///
/// The outer LAML objective folds `H_Φ` into `½ log|H + S_λ + H_Φ|`; because
/// `H_Φ` depends on ρ through β̂, the exact gradient's trace contraction must
/// include `½ tr[(H+S_λ+H_Φ)⁻¹ D_β H_Φ[v_k]]`. Supplying only the
/// likelihood-Hessian drift `D_β H[v_k]` omits `H_Φ`'s mode-response drift,
/// which is wrong precisely when the Jeffreys term is active. This function
/// returns that missing drift as a `Send + Sync + 'static` closure that the
/// `JeffreysHphiAwareJointDerivatives` wrapper folds into the first-order
/// trace, mirroring Tier-A's `FirthAwareGlmDerivatives` `−D(Hφ)[B_k]` term.
///
/// The closure takes the mode-response direction `δβ = dβ̂/dρ_k` (the wrapper
/// performs `v_k → δβ = −v_k`) and returns `D_β H_Φ[δβ]`. The per-direction
/// conditioning gate and floored pseudo-inverse inside
/// `joint_jeffreys_hphi_directional_derivative` reproduce the value path's, so
/// when the value's `H_Φ` is zero (gated/clean fit) the drift is zero too.
///
/// # Returns
///
/// `Ok(None)`, meaning no drift is installed, when any of the following holds:
/// * the family does not require the joint Jeffreys term;
/// * no joint basis can be built, or the basis / coefficient system is empty;
/// * the family exposes no exact joint Hessian;
/// * the joint Hessian is not `total_p x total_p`.
///
/// # Errors
///
/// Propagates failures from basis construction and from the family's joint
/// Hessian evaluation. Failures inside the returned closure surface when the
/// closure is invoked.
fn custom_family_outer_jeffreys_hphi_drift<F: CustomFamily + Clone + Send + Sync + 'static>(
    family: &F,
    states: &[ParameterBlockState],
    specs: &[ParameterBlockSpec],
    ranges: &[(usize, usize)],
) -> Result<Option<JeffreysHphiDriftFn>, String> {
    if !family.joint_jeffreys_term_required() {
        return Ok(None);
    }
    let z_joint = match build_joint_jeffreys_subspace(specs, ranges)? {
        Some(z) => z,
        None => return Ok(None),
    };
    let total_p = ranges.last().map(|(_, e)| *e).unwrap_or(0);
    if total_p == 0 || z_joint.ncols() == 0 {
        return Ok(None);
    }
    // Snapshot the joint Hessian H(β̂) at the current outer point. If the family
    // exposes no exact joint Hessian the Jeffreys term is inapplicable (matching
    // `custom_family_joint_jeffreys_term`), so no drift is installed.
    let h_joint = match family.exact_newton_joint_hessian_with_specs(states, specs)? {
        Some(h) => h,
        None => return Ok(None),
    };
    if h_joint.nrows() != total_p || h_joint.ncols() != total_p {
        return Ok(None);
    }
    // Own everything the closure needs so it is `'static + Send + Sync`. β̂ is
    // fixed across the single outer evaluation, so capturing the snapshot states
    // is correct; the closure recomputes the exact directional derivatives of the
    // joint Hessian at that point for each mode-response direction.
    //
    // `z_joint` and `h_joint` are already owned and unused after this point, so
    // they are moved into the closure as-is; only the borrowed inputs are copied.
    let family_owned = family.clone();
    let states_owned: Vec<ParameterBlockState> = states.to_vec();
    let specs_owned: Vec<ParameterBlockSpec> = specs.to_vec();
    let drift: JeffreysHphiDriftFn = Arc::new(move |delta: &Array1<f64>| {
        crate::estimate::reml::jeffreys_subspace::joint_jeffreys_hphi_directional_derivative(
            h_joint.view(),
            z_joint.view(),
            delta,
            |direction: &Array1<f64>| {
                family_owned.exact_newton_joint_hessian_directional_derivative_with_specs(
                    &states_owned,
                    &specs_owned,
                    direction,
                )
            },
            |u: &Array1<f64>, v: &Array1<f64>| {
                family_owned.exact_newton_joint_hessian_second_directional_derivative_with_specs(
                    &states_owned,
                    &specs_owned,
                    u,
                    v,
                )
            },
        )
        .map(Some)
    });
    Ok(Some(drift))
}
/// Coefficient count at or above which `use_joint_matrix_free_path` selects the
/// matrix-free route regardless of the row count.
const JOINT_MATRIX_FREE_MIN_DIM: usize = 512;
/// Row count at or above which `use_joint_matrix_free_path` accepts the lower
/// coefficient threshold `JOINT_MATRIX_FREE_MIN_DIM_AT_LARGE_N`.
const JOINT_MATRIX_FREE_MIN_ROWS: usize = 50_000;
/// Minimum coefficient count required by both size-dependent clauses of
/// `use_joint_matrix_free_path` (the large-row clause and the linear-work clause).
const JOINT_MATRIX_FREE_MIN_DIM_AT_LARGE_N: usize = 128;
/// Threshold on the saturating product `total_n * total_p` used by the
/// linear-work clause of `use_joint_matrix_free_path`.
const JOINT_MATRIX_FREE_MIN_LINEAR_WORK: usize = 4_000_000;
// NOTE(review): not referenced in the nearby code; by its name, a small ridge
// used to stabilize joint trace computations — confirm at the use sites.
const JOINT_TRACE_STABILITY_RIDGE: f64 = 1e-10;
// NOTE(review): not referenced in the nearby code; by its name, a multiplier
// that sets the joint PCG iteration cap — confirm at the use sites.
const JOINT_PCG_MAX_ITER_MULTIPLIER: usize = 4;
/// Capability flag for the joint path; going by the name, it advertises that
/// the exact analytic outer Hessian can be used.
///
/// The answer is unconditional: this always returns `true`.
pub(crate) fn joint_exact_analytic_outer_hessian_available() -> bool {
    true
}
/// Largest linear-predictor length (`eta.len()`) across all block states, or
/// `0` when `states` is empty.
fn joint_observation_count(states: &[ParameterBlockState]) -> usize {
    states
        .iter()
        .fold(0usize, |largest, state| largest.max(state.eta.len()))
}
/// Predicts whether the unified evaluator takes the matrix-free joint Hessian
/// path for a problem with `total_p` coefficients and `total_n` rows.
///
/// Exposed at crate scope so families with matrix-free operators can branch
/// their `coefficient_hessian_cost` estimate on the same predicate the
/// evaluator uses at fit time.
///
/// The matrix-free route is chosen when any of these holds:
/// * `total_p >= JOINT_MATRIX_FREE_MIN_DIM`;
/// * `total_p >= JOINT_MATRIX_FREE_MIN_DIM_AT_LARGE_N` and
///   `total_n >= JOINT_MATRIX_FREE_MIN_ROWS`;
/// * `total_p >= JOINT_MATRIX_FREE_MIN_DIM_AT_LARGE_N` and the saturating
///   product `total_n * total_p` reaches `JOINT_MATRIX_FREE_MIN_LINEAR_WORK`.
///
/// Rationale: for large row counts with only tens of coefficients, exact
/// materialization costs `total_p` Hessian-vector products plus a tiny dense
/// factorization. That is cheaper and more predictable than PCG when each
/// matrix-free product streams all rows through expensive FLEX marginal-slope
/// kernels and the initial joint Hessian is ill-conditioned. The matrix-free
/// route is kept for genuinely wide joint systems, where `total_p` dense
/// products and the factorization dominate.
pub(crate) fn use_joint_matrix_free_path(total_p: usize, total_n: usize) -> bool {
    // Wide systems go matrix-free no matter how many rows there are.
    if total_p >= JOINT_MATRIX_FREE_MIN_DIM {
        return true;
    }
    // Both remaining clauses need at least the reduced coefficient threshold.
    if total_p < JOINT_MATRIX_FREE_MIN_DIM_AT_LARGE_N {
        return false;
    }
    let many_rows = total_n >= JOINT_MATRIX_FREE_MIN_ROWS;
    let heavy_linear_work = total_n.saturating_mul(total_p) >= JOINT_MATRIX_FREE_MIN_LINEAR_WORK;
    many_rows || heavy_linear_work
}
/// Allocating convenience wrapper around `apply_joint_block_penalty_into`.
///
/// Creates a zeroed output vector with the same length as `vector`, lets the
/// in-place variant fill it, and returns it.
fn apply_joint_block_penalty(
    ranges: &[(usize, usize)],
    s_lambdas: &[Array2<f64>],
    vector: &Array1<f64>,
    diagonal_ridge: f64,
    joint_full_width: Option<&crate::families::joint_penalty::JointPenaltyBundle>,
) -> Array1<f64> {
    let mut penalized = Array1::<f64>::zeros(vector.raw_dim());
    apply_joint_block_penalty_into(
        ranges,
        s_lambdas,
        vector,
        diagonal_ridge,
        &mut penalized,
        joint_full_width,
    );
    penalized
}
/// In-place variant of [`apply_joint_block_penalty`]: the caller supplies the
/// output buffer, which removes the per-call allocation.
///
/// `out` is overwritten with the block-diagonal penalty product (block `b` of
/// `s_lambdas` applied to `vector[ranges[b]]`), plus `diagonal_ridge * vector`
/// when the ridge is positive, plus the contribution of a non-empty
/// `joint_full_width` bundle.
///
/// `fast_av_view_into` writes straight into the per-block slice of `out`, so no
/// per-block intermediate vector is created. At large scale this runs inside
/// the PCG matvec closure, i.e. once per CG iteration.
///
/// Two routes produce the same values:
/// * serial — used when there is at most one penalty block or `out` has no
///   contiguous slice; the ridge is added with `scaled_add`;
/// * parallel — `out` is split into disjoint per-block slices processed by
///   rayon, and the ridge is added in parallel when `vector` is contiguous too.
///
/// # Panics
///
/// Panics if `out` and `vector` differ in length, if there are more penalty
/// blocks than ranges, or (parallel route) if the used ranges are not ordered
/// and non-overlapping.
fn apply_joint_block_penalty_into(
    ranges: &[(usize, usize)],
    s_lambdas: &[Array2<f64>],
    vector: &Array1<f64>,
    diagonal_ridge: f64,
    out: &mut Array1<f64>,
    joint_full_width: Option<&crate::families::joint_penalty::JointPenaltyBundle>,
) {
    assert_eq!(out.len(), vector.len());
    assert!(s_lambdas.len() <= ranges.len());
    out.fill(0.0);

    // The contiguity probe is only evaluated when there are several blocks.
    let run_serial = s_lambdas.len() <= 1 || out.as_slice_mut().is_none();

    if run_serial {
        for (b, s_lambda) in s_lambdas.iter().enumerate() {
            let (start, end) = ranges[b];
            let input_block = vector.slice(s![start..end]);
            let mut output_block = out.slice_mut(s![start..end]);
            crate::linalg::faer_ndarray::fast_av_view_into(
                s_lambda,
                &input_block,
                output_block.view_mut(),
            );
        }
    } else {
        use rayon::prelude::*;
        let out_values = out
            .as_slice_mut()
            .expect("joint penalty output should be contiguous");
        // Carve the buffer into one disjoint mutable slice per penalty block
        // so the blocks can be written concurrently.
        let mut block_slices = Vec::with_capacity(s_lambdas.len());
        let mut tail = out_values;
        let mut consumed = 0usize;
        for &(start, end) in ranges.iter().take(s_lambdas.len()) {
            assert!(start >= consumed);
            assert!(end >= start);
            let (_, from_start) = tail.split_at_mut(start - consumed);
            let (this_block, rest) = from_start.split_at_mut(end - start);
            block_slices.push(this_block);
            tail = rest;
            consumed = end;
        }
        block_slices
            .into_par_iter()
            .enumerate()
            .for_each(|(b, this_block)| {
                let (start, end) = ranges[b];
                let input_block = vector.slice(s![start..end]);
                let output_view = ArrayViewMut1::from(this_block);
                crate::linalg::faer_ndarray::fast_av_view_into(
                    &s_lambdas[b],
                    &input_block,
                    output_view,
                );
            });
    }

    if diagonal_ridge > 0.0 {
        if run_serial {
            out.scaled_add(diagonal_ridge, vector);
        } else if let (Some(out_values), Some(vector_values)) =
            (out.as_slice_mut(), vector.as_slice())
        {
            use rayon::prelude::*;
            out_values
                .par_iter_mut()
                .zip(vector_values.par_iter())
                .for_each(|(out_value, vector_value)| {
                    *out_value += diagonal_ridge * *vector_value;
                });
        } else {
            out.scaled_add(diagonal_ridge, vector);
        }
    }

    if let Some(bundle) = joint_full_width.filter(|bundle| !bundle.is_empty()) {
        bundle.add_apply_into(vector.view(), out);
    }
}
/// Floor applied to each entry of the joint preconditioner diagonal.
///
/// Returns `value` unchanged when it is finite and strictly greater than
/// `1.0e-10`. Everything else — NaN, `±∞`, zero, negatives, and values at or
/// below the floor — maps to `1.0e-10`, so the result is always finite and
/// strictly positive.
fn positive_joint_diagonal_entry(value: f64) -> f64 {
    const FLOOR: f64 = 1.0e-10;
    if value.is_finite() && value > FLOOR {
        value
    } else {
        FLOOR
    }
}

/// Penalty-aware Jacobi preconditioner used by every matrix-free PCG path
/// in the inner coefficient solve.
///
/// Builds `diag(H) + Σ_k gershgorin(S_k(λ)) + ridge`, clamped at 1e-10, where
/// `gershgorin(S)[i] = Σ_j |S[i,j]|` is the absolute row-sum (Gershgorin
/// radius) of each penalty block. This strictly dominates `diag(S)` for any
/// penalty with off-diagonal mass — the high-order difference / thin-plate
/// smooths (the cubic-Duchon `[mass, tension, stiffness]` triple, orders
/// [1,2,3] in `WigglePenaltyConfig::cubic_triple_operator_default`) are
/// strongly off-diagonal-dominant, so `S[i,i]` alone understates the
/// operator's true row scale by orders of magnitude there.
///
/// Why the row-sum and not just the diagonal: a plain Jacobi (diagonal-only)
/// preconditioner collapses to `diag(S_λ)` exactly in the saturated-softmax
/// regime, where the data Fisher weight `W = diag(p) − ppᵀ → 0` near the
/// simplex boundary and the data part of `diag(H)` vanishes. When the penalty
/// is off-diagonal-dominant, `diag(S_λ)` is a poor spectral match for
/// `H + S_λ`, leaving PCG with a large effective condition number and only
/// geometric (linear) convergence — the multinomial-penguins grind in #715.
/// The Gershgorin row-sum diagonal tracks the operator's per-coordinate scale
/// (`|S| 𝟙` bounds `S`'s action), tightening the preconditioned spectrum and
/// cutting CG iterations sharply in that regime. It is `≥ diag(S)` entrywise
/// for SPD `S`, so it stays strictly positive and SPD: it changes only the
/// PCG trajectory, never the converged Newton step or the KKT certificate
/// (PCG converges to the same `(H + S_λ)⁻¹ rhs` under any SPD preconditioner).
/// Design docs sometimes call this the "triple-operator penalty
/// preconditioner"; in code it is the single, unified preconditioner shared by
/// all PCG callsites.
///
/// Callers in the PIRLS inner Newton PCG path feed the result as the diagonal
/// rescale every CG iteration: PCG applies `M^{-1}` to residuals directly.
/// Do not square-root or trace-normalize these entries, and do not apply a
/// second preconditioner-side rescale to the returned Newton step.
///
/// # Arguments
///
/// * `base_diagonal` — starting diagonal (the `diag(H)` part); it is cloned,
///   not modified.
/// * `ranges` / `s_lambdas` — block `b` of `s_lambdas` contributes to the
///   coordinates `ranges[b].0..ranges[b].1`.
/// * `diagonal_ridge` — added to every entry when strictly positive.
/// * `joint_full_width` — when present and non-empty, its `add_diag`
///   contribution is added as well.
///
/// # Panics
///
/// Panics if there are more penalty blocks than ranges, or if a penalty block
/// is not square with side `end - start` for its range.
fn joint_penalty_preconditioner_diag(
    base_diagonal: &Array1<f64>,
    ranges: &[(usize, usize)],
    s_lambdas: &[Array2<f64>],
    diagonal_ridge: f64,
    joint_full_width: Option<&crate::families::joint_penalty::JointPenaltyBundle>,
) -> Array1<f64> {
    assert!(s_lambdas.len() <= ranges.len());
    let mut diag = base_diagonal.clone();
    for (b, s_lambda) in s_lambdas.iter().enumerate() {
        let (start, end) = ranges[b];
        assert_eq!(s_lambda.nrows(), end - start);
        assert_eq!(s_lambda.ncols(), end - start);
        // Gershgorin radius: the absolute row-sum `Σ_j |S[i,j]|` of the penalty
        // block, not just its diagonal `S[i,i]`. For an off-diagonal-dominant
        // smooth penalty (high-order difference / thin-plate) this tracks the
        // operator's true per-coordinate scale, where `S[i,i]` understates it.
        // For SPD `S` the row-sum is `≥ |S[i,i]| = S[i,i]`, so the result still
        // strictly dominates the plain-diagonal preconditioner and stays SPD.
        for (local_idx, global_idx) in (start..end).enumerate() {
            let row_abs_sum: f64 = s_lambda
                .row(local_idx)
                .iter()
                .map(|value| value.abs())
                .sum();
            diag[global_idx] += row_abs_sum;
        }
    }
    if diagonal_ridge > 0.0 {
        for value in &mut diag {
            *value += diagonal_ridge;
        }
    }
    if let Some(bundle) = joint_full_width
        && !bundle.is_empty()
    {
        bundle.add_diag(&mut diag);
    }
    // Clamp in place: `diag` is already an owned working copy, so a second
    // allocation for the clamped result is unnecessary.
    diag.mapv_inplace(positive_joint_diagonal_entry);
    diag
}
/// Emits one `info`-level log line summarizing a joint-Newton PCG solve.
///
/// Besides the iteration and residual figures taken from `info`, the line
/// reports the spread of the preconditioner diagonal as `max / min` over its
/// finite entries. The ratio is printed as `NA` when no finite entry exists or
/// the smallest finite entry is not strictly positive; the condition estimate
/// is printed as `NA` when `info` carries none.
fn log_joint_pcg_diagnostics(
    cycle: usize,
    total_p: usize,
    total_n: usize,
    preconditioner_diag: &Array1<f64>,
    info: &crate::linalg::utils::PcgSolveInfo,
) {
    // Track extremes over finite entries only. The maximum starts at zero, the
    // minimum at +∞ (which is what marks "no finite entry seen").
    let mut diag_min = f64::INFINITY;
    let mut diag_max = 0.0_f64;
    for &entry in preconditioner_diag
        .iter()
        .filter(|entry| entry.is_finite())
    {
        diag_min = diag_min.min(entry);
        diag_max = diag_max.max(entry);
    }
    let ratio_defined = diag_min.is_finite() && diag_min > 0.0 && diag_max.is_finite();
    let diag_ratio = ratio_defined.then(|| diag_max / diag_min);

    log::info!(
        "[PIRLS/blockwise joint-Newton/PCG] cycle={} p={} n={} iters={} rel_res={:.3e} res0={:.3e} res_final={:.3e} res_ratio={:.3e} ritz_cond~{} jacobi_diag_ratio~{}",
        cycle,
        total_p,
        total_n,
        info.iterations,
        info.relative_residual_norm,
        info.initial_residual_norm,
        info.final_residual_norm,
        info.residual_reduction,
        info.condition_estimate
            .map_or_else(|| "NA".to_string(), |value| format!("{value:.3e}")),
        diag_ratio.map_or_else(|| "NA".to_string(), |value| format!("{value:.3e}")),
    );
}
/// Adds the joint penalty to a dense matrix in place.
///
/// Block `b` of `s_lambdas` is added to the diagonal sub-block of `matrix`
/// covering `ranges[b]`; a strictly positive `diagonal_ridge` is added to every
/// diagonal entry; and a present, non-empty `joint_full_width` bundle adds its
/// own contribution through `add_to_matrix`.
///
/// # Panics
///
/// Panics if there are more penalty blocks than ranges or if a block does not
/// fit the sub-block selected by its range.
fn add_joint_penalty_to_matrix(
    matrix: &mut Array2<f64>,
    ranges: &[(usize, usize)],
    s_lambdas: &[Array2<f64>],
    diagonal_ridge: f64,
    joint_full_width: Option<&crate::families::joint_penalty::JointPenaltyBundle>,
) {
    for (block_idx, s_lambda) in s_lambdas.iter().enumerate() {
        let (start, end) = ranges[block_idx];
        let mut diagonal_block = matrix.slice_mut(s![start..end, start..end]);
        diagonal_block += s_lambda;
    }

    if diagonal_ridge > 0.0 {
        let n_rows = matrix.nrows();
        (0..n_rows).for_each(|d| matrix[[d, d]] += diagonal_ridge);
    }

    if let Some(bundle) = joint_full_width.filter(|bundle| !bundle.is_empty()) {
        bundle.add_to_matrix(matrix);
    }
}
/// Concatenates the per-block coefficient vectors into one flat vector.
///
/// The result has one entry per design column summed over all specs, and block
/// `b`'s `beta` is written into the range that `block_param_ranges` assigns to
/// it.
///
/// # Panics
///
/// Panics if `states` has fewer entries than `specs`, or if a block's `beta`
/// cannot be assigned into its range.
fn flatten_state_betas(
    states: &[ParameterBlockState],
    specs: &[ParameterBlockSpec],
) -> Array1<f64> {
    let total_cols: usize = specs.iter().map(|spec| spec.design.ncols()).sum();
    let mut flat = Array1::<f64>::zeros(total_cols);
    for (block_idx, (start, end)) in block_param_ranges(specs).into_iter().enumerate() {
        let mut target = flat.slice_mut(ndarray::s![start..end]);
        target.assign(&states[block_idx].beta);
    }
    flat
}
/// Scatters a flat coefficient vector back into the per-block states.
///
/// Block `b` receives `beta_flat[start..end]` for the range that
/// `block_param_ranges` assigns to it. Only `beta` is updated; nothing else in
/// the states is touched by this function.
///
/// # Errors
///
/// Returns a dimension-mismatch error (rendered as `String`) when `beta_flat`
/// does not have the total coefficient length, or when `states` holds fewer
/// entries than there are blocks. In both cases no state is modified.
///
/// # Panics
///
/// Panics if a block's existing `beta` cannot take an assignment of its
/// range's length.
fn set_states_from_flat_beta(
    states: &mut [ParameterBlockState],
    specs: &[ParameterBlockSpec],
    beta_flat: &Array1<f64>,
) -> Result<(), String> {
    let ranges = block_param_ranges(specs);
    let total = ranges.last().map(|(_, e)| *e).unwrap_or(0);
    if beta_flat.len() != total {
        return Err(CustomFamilyError::DimensionMismatch {
            reason: format!(
                "flat beta length mismatch: got {}, expected {}",
                beta_flat.len(),
                total
            ),
        }
        .into());
    }
    // Report a short `states` slice through the same error channel instead of
    // panicking on the index below.
    if states.len() < ranges.len() {
        return Err(CustomFamilyError::DimensionMismatch {
            reason: format!(
                "block state count mismatch: got {}, expected at least {}",
                states.len(),
                ranges.len()
            ),
        }
        .into());
    }
    for (b, (start, end)) in ranges.into_iter().enumerate() {
        // `assign` accepts a borrowed view directly, so no temporary owned
        // copy of the slice is needed.
        states[b]
            .beta
            .assign(&beta_flat.slice(ndarray::s![start..end]));
    }
    Ok(())
}
/// Returns a copy of `states` whose coefficients come from `beta_flat`, with
/// the block etas refreshed afterwards.
///
/// The input `states` are not modified: they are cloned, overwritten through
/// `set_states_from_flat_beta`, and then handed to `refresh_all_block_etas`.
/// A failure in either step is propagated.
fn synchronized_states_from_flat_beta<F: CustomFamily + Clone + Send + Sync + 'static>(
    family: &F,
    specs: &[ParameterBlockSpec],
    states: &[ParameterBlockState],
    beta_flat: &Array1<f64>,
) -> Result<Vec<ParameterBlockState>, String> {
    let mut updated = Vec::with_capacity(states.len());
    updated.extend_from_slice(states);
    set_states_from_flat_beta(&mut updated, specs, beta_flat)?;
    refresh_all_block_etas(family, specs, &mut updated)?;
    Ok(updated)
}
/// Inf-norm of the penalized stationarity residual after removing the part
/// that valid KKT multipliers at active linear constraints account for.
///
/// For a linearly constrained convex quadratic with constraints `Aβ ≥ b`, the
/// KKT conditions at β̂ are
///
///   S·β̂ − ∇ℓ(β̂) = A_activeᵀ λ
///   Aβ̂ − b ≥ 0
///   λ ≥ 0
///   λᵢ(Aᵢβ̂ − bᵢ) = 0
///
/// so the residual mass carried by nonnegative active multipliers is not a
/// convergence defect. That normal-cone component is projected out before the
/// inf-norm is taken. Axis-aligned lower bounds are a special case; coupled
/// derivative-guard rows go through the same KKT geometry.
///
/// `known_active_rows`, when given, seeds the working set with the QP
/// solver's authoritative active rows. Trust-region damping and finite
/// precision can leave the committed β with slacks slightly above the slack
/// tolerance on rows the QP identified as binding; slack detection alone
/// would miss those rows and leave their multiplier mass in the projected
/// residual. The downstream non-negative-multiplier projection drops any
/// seeded row whose least-squares multiplier is strictly negative, so the
/// union (QP active) ∪ (slack-detected) never declares false convergence.
///
/// Without constraints, or when the projection helper returns `None`, the
/// plain (unprojected) inf-norm of `residual` is returned.
///
/// # Panics
/// Panics if `residual` and `beta` differ in length.
fn projected_stationarity_inf_norm(
    residual: &Array1<f64>,
    beta: &Array1<f64>,
    constraints: Option<&LinearInequalityConstraints>,
    known_active_rows: Option<&[usize]>,
) -> f64 {
    assert_eq!(residual.len(), beta.len());
    let unprojected = || residual.iter().map(|v| v.abs()).fold(0.0_f64, f64::max);
    match constraints {
        None => unprojected(),
        Some(rows) => projected_linear_constraint_stationarity_inf_norm(
            residual,
            beta,
            rows,
            known_active_rows,
        )
        .unwrap_or_else(unprojected),
    }
}
/// Combined KKT defect for one linearly constrained block: the larger of the
/// cone-projected stationarity inf-norm and the worst primal constraint
/// violation.
///
/// Returns `None` when either the projection or the primal-violation helper
/// returns `None`.
fn projected_linear_constraint_stationarity_inf_norm(
    residual: &Array1<f64>,
    beta: &Array1<f64>,
    constraints: &LinearInequalityConstraints,
    known_active_rows: Option<&[usize]>,
) -> Option<f64> {
    let stationarity = projected_linear_constraint_stationarity_vector(
        residual,
        beta,
        constraints,
        known_active_rows,
    )?;
    let feasibility_gap = linear_constraint_primal_violation(beta, constraints)?;
    let stationarity_inf = stationarity
        .iter()
        .map(|v| v.abs())
        .fold(0.0_f64, f64::max);
    Some(stationarity_inf.max(feasibility_gap))
}
/// Largest amount by which `beta` violates any row of `Aβ ≥ b`.
///
/// Rows whose bound is `-∞` are skipped as unconstrained. The result is
/// `0.0` when every remaining row is satisfied.
///
/// Returns `None` when the constraint shapes disagree with `beta` or with
/// each other, when a bound is NaN or `+∞`, or when a row slack is not finite.
fn linear_constraint_primal_violation(
    beta: &Array1<f64>,
    constraints: &LinearInequalityConstraints,
) -> Option<f64> {
    let shapes_agree =
        constraints.a.ncols() == beta.len() && constraints.a.nrows() == constraints.b.len();
    if !shapes_agree {
        return None;
    }
    let mut worst = 0.0_f64;
    for (row, &bound) in constraints.b.iter().enumerate() {
        if bound == f64::NEG_INFINITY {
            continue;
        }
        if !bound.is_finite() {
            return None;
        }
        let slack = constraints.a.row(row).dot(beta) - bound;
        if !slack.is_finite() {
            return None;
        }
        // Only negative slack counts as a violation.
        worst = worst.max((-slack).max(0.0));
    }
    Some(worst)
}
/// Projects the stationarity residual of one block onto the complement of the
/// active-constraint cone and returns the projected vector.
///
/// The candidate active set is the union of the usable entries of
/// `known_active_rows` (in range, finite bound) and every row whose slack
/// `Aᵢβ − bᵢ` falls inside the inclusion band below. With no candidate rows
/// the residual is returned unchanged.
///
/// Returns `None` on a shape mismatch, on a NaN or `+∞` bound, on a
/// non-finite slack, or when the cone projection itself returns `None`.
fn projected_linear_constraint_stationarity_vector(
    residual: &Array1<f64>,
    beta: &Array1<f64>,
    constraints: &LinearInequalityConstraints,
    known_active_rows: Option<&[usize]>,
) -> Option<Array1<f64>> {
    let p = beta.len();
    let shapes_agree = residual.len() == p
        && constraints.a.ncols() == p
        && constraints.a.nrows() == constraints.b.len();
    if !shapes_agree {
        return None;
    }
    let n_rows = constraints.a.nrows();
    // A membership table indexed by constraint row keeps the active rows in
    // the constraint matrix's own order, so the rank reduction downstream is
    // deterministic across calls regardless of the hint's ordering.
    let mut in_active = vec![false; n_rows];
    for &row in known_active_rows.unwrap_or(&[]) {
        if row < n_rows && constraints.b[row].is_finite() {
            in_active[row] = true;
        }
    }
    for (row, flag) in in_active.iter_mut().enumerate() {
        let bound = constraints.b[row];
        if bound == f64::NEG_INFINITY {
            continue;
        }
        if !bound.is_finite() {
            return None;
        }
        let value = constraints.a.row(row).dot(beta);
        let slack = value - bound;
        if !slack.is_finite() {
            return None;
        }
        if *flag {
            // Already seeded by the hint; it only needed the validity checks.
            continue;
        }
        // Inclusion band for the cone projection. A row binding at the
        // constrained optimum carries a multiplier whose mass IS the
        // stationarity residual (`r = A_activeᵀ λ`, λ >= 0), so every
        // genuinely tight row has to be a candidate. The constrained QP only
        // reports rows it drove tight during a non-degenerate step; monotone
        // derivative-guard rows that are tight at the optimum but were never
        // explicitly stepped sat just above the former `1e-6·scale` band,
        // were excluded, and left the multiplier unresolved, tripping the
        // `active_set_incomplete` refusal on an exactly constrained-stationary
        // iterate (gam#797 survival time block). The band is therefore wide.
        // Over-inclusion is safe: the downstream NNLS
        // (`project_stationarity_residual_on_constraint_cone`) assigns λ = 0
        // to a candidate with no multiplier mass, so a non-binding row cannot
        // spuriously shrink the residual.
        let scale = value.abs().max(bound.abs()).max(1.0);
        *flag = slack <= 1e-3 * scale + 1e-8;
    }
    let active_rows: Vec<usize> = in_active
        .iter()
        .enumerate()
        .filter_map(|(row, &is_active)| is_active.then_some(row))
        .collect();
    if active_rows.is_empty() {
        return Some(residual.clone());
    }
    let mut a_active = Array2::<f64>::zeros((active_rows.len(), p));
    for (mut dst, &src_row) in a_active.outer_iter_mut().zip(active_rows.iter()) {
        dst.assign(&constraints.a.row(src_row));
    }
    let (projected, _) = project_stationarity_residual_on_constraint_cone(residual, &a_active)?;
    Some(projected)
}
/// Largest per-block projected stationarity inf-norm for an evaluation whose
/// working sets are all `ExactNewton`.
///
/// Returns `Ok(None)` as soon as a block's working set is not `ExactNewton`.
///
/// # Errors
/// Returns a `DimensionMismatch` message when the block counts of `eval`,
/// `states`, `specs`, `s_lambdas` or `block_active_sets` disagree, and
/// propagates any error from `collect_block_linear_constraints`.
fn exact_newton_joint_stationarity_inf_norm<F: CustomFamily + ?Sized>(
    family: &F,
    specs: &[ParameterBlockSpec],
    eval: &FamilyEvaluation,
    states: &[ParameterBlockState],
    s_lambdas: &[Array2<f64>],
    ridge: f64,
    ridge_policy: RidgePolicy,
    block_active_sets: Option<&[Option<Vec<usize>>]>,
) -> Result<Option<f64>, String> {
    let n_blocks = states.len();
    if eval.blockworking_sets.len() != n_blocks || n_blocks != s_lambdas.len() {
        return Err(CustomFamilyError::DimensionMismatch {
            reason: "exact-newton joint stationarity check: block dimension mismatch".to_string(),
        }
        .into());
    }
    if specs.len() != n_blocks {
        return Err(CustomFamilyError::DimensionMismatch {
            reason: "exact-newton joint stationarity check: spec/state count mismatch".to_string(),
        }
        .into());
    }
    if let Some(sets) = block_active_sets {
        if sets.len() != n_blocks {
            return Err(CustomFamilyError::DimensionMismatch {
                reason: format!(
                    "exact-newton joint stationarity check: active-set count mismatch, got {}, expected {}",
                    sets.len(),
                    n_blocks
                ),
            }
            .into());
        }
    }
    let block_constraints = collect_block_linear_constraints(family, states, specs)?;
    let ridge_in_objective = ridge_policy.include_quadratic_penalty && ridge > 0.0;
    let mut worst = 0.0_f64;
    for (b, (state, work)) in states
        .iter()
        .zip(eval.blockworking_sets.iter())
        .enumerate()
    {
        // For exact-Newton families the block score is ∇ log L with respect
        // to that block, and the penalized negative objective is
        //
        //   Q(beta, rho) = -log L(beta) + 0.5 beta^T P_mode(rho) beta,
        //
        // where `P_mode` contains the rho-independent stabilization ridge
        // exactly when that ridge participates in the quadratic objective.
        // The coupled first-order condition ∇Q = -∇ log L + P beta = 0 gives
        // the exact penalized stationarity residual of block b:
        //
        //   r_b = P_mode,b * beta_b - gradient_b.
        //
        // On constrained blocks (e.g. I-spline monotone time coefficients,
        // monotone wiggle coefficients) the residual mass at active
        // constraints is a KKT multiplier λ ≥ 0, not a convergence defect;
        // `projected_stationarity_inf_norm` projects it out so that only the
        // part that must be driven to zero is measured. Relying on coordinate
        // step size or on an unprojected norm can declare convergence too
        // early OR never declare it at a constrained optimum.
        let BlockWorkingSet::ExactNewton { gradient, .. } = work else {
            return Ok(None);
        };
        let mut residual = s_lambdas[b].dot(&state.beta) - gradient;
        if ridge_in_objective {
            residual += &state.beta.mapv(|coef| ridge * coef);
        }
        let active_hint = block_active_sets
            .and_then(|sets| sets.get(b))
            .and_then(|rows| rows.as_deref());
        let block_inf = projected_stationarity_inf_norm(
            &residual,
            &state.beta,
            block_constraints[b].as_ref(),
            active_hint,
        );
        worst = worst.max(block_inf);
    }
    Ok(Some(worst))
}
fn exact_newton_joint_gradient_from_eval(
eval: &FamilyEvaluation,
specs: &[ParameterBlockSpec],
states: &[ParameterBlockState],
) -> Result<Option<Array1<f64>>, String> {
if eval.blockworking_sets.len() != specs.len() {
return Err(format!(
"exact-newton joint gradient extraction: family returned {} block working sets, expected {}",
eval.blockworking_sets.len(),
specs.len()
));
}
if states.len() != specs.len() {
return Err(CustomFamilyError::DimensionMismatch { reason: format!(
"exact-newton joint gradient extraction: state count {} does not match spec count {}",
states.len(),
specs.len()
) }.into());
}
let total_p = specs.iter().map(|spec| spec.design.ncols()).sum::<usize>();
let mut gradient = Array1::<f64>::zeros(total_p);
let mut offset = 0usize;
for ((spec, work), state) in specs
.iter()
.zip(eval.blockworking_sets.iter())
.zip(states.iter())
{
let width = spec.design.ncols();
match work {
BlockWorkingSet::ExactNewton {
gradient: block_gradient,
..
} => {
if block_gradient.len() != width {
return Err(CustomFamilyError::DimensionMismatch { reason: format!(
"exact-newton joint gradient extraction: block gradient length mismatch, got {}, expected {}",
block_gradient.len(),
width
) }.into());
}
gradient
.slice_mut(ndarray::s![offset..offset + width])
.assign(block_gradient);
}
BlockWorkingSet::Diagonal {
working_response,
working_weights,
} => {
// Recover the per-block log-likelihood score from the IRLS
// working set. By construction of the IRLS pseudo-response
//
// z_i = η_i + (∂ℓ/∂η_i) / w_i,
//
// so the row score is `w_i (z_i − η_i)` and the
// coefficient-space score is
//
// ∇_β_b log L = X_b^T (w ⊙ (z − η)).
//
// Without this branch the joint-Newton path is unable to
// assemble its RHS for families that emit Diagonal working
// sets alongside an exact joint Hessian (e.g. Gaussian
// location-scale): the inner fit returns non-converged, and
// the outer evaluator falls into the nonconverged-result
// branch and reports a zero outer gradient.
let n = working_response.len();
if working_weights.len() != n || state.eta.len() != n || spec.design.nrows() != n {
return Err(CustomFamilyError::DimensionMismatch { reason: format!(
"exact-newton joint gradient extraction: diagonal working-set length mismatch (z={}, w={}, η={}, X_rows={})",
working_response.len(),
working_weights.len(),
state.eta.len(),
spec.design.nrows()
) }.into());
}
let mut weighted = Array1::<f64>::zeros(n);
for i in 0..n {
weighted[i] = working_weights[i] * (working_response[i] - state.eta[i]);
}
let block_gradient =
<DesignMatrix as LinearOperator>::apply_transpose(&spec.design, &weighted);
if block_gradient.len() != width {
return Err(CustomFamilyError::DimensionMismatch { reason: format!(
"exact-newton joint gradient extraction: diagonal block transpose length mismatch, got {}, expected {}",
block_gradient.len(),
width
) }.into());
}
gradient
.slice_mut(ndarray::s![offset..offset + width])
.assign(&block_gradient);
}
}
offset += width;
}
Ok(Some(gradient))
}
fn exact_newton_joint_stationarity_inf_norm_from_gradient(
gradient: &Array1<f64>,
states: &[ParameterBlockState],
specs: &[ParameterBlockSpec],
s_lambdas: &[Array2<f64>],
ridge: f64,
ridge_policy: RidgePolicy,
block_constraints: &[Option<LinearInequalityConstraints>],
block_active_sets: Option<&[Option<Vec<usize>>]>,
) -> Result<f64, String> {
if states.len() != specs.len() || states.len() != s_lambdas.len() {
return Err(
"exact-newton joint stationarity check from gradient: block dimension mismatch"
.to_string(),
);
}
if block_constraints.len() != states.len() {
return Err(CustomFamilyError::DimensionMismatch { reason: format!(
"exact-newton joint stationarity check from gradient: constraint count mismatch, got {}, expected {}",
block_constraints.len(),
states.len()
) }.into());
}
if let Some(sets) = block_active_sets
&& sets.len() != states.len()
{
return Err(CustomFamilyError::DimensionMismatch { reason: format!(
"exact-newton joint stationarity check from gradient: active-set count mismatch, got {}, expected {}",
sets.len(),
states.len()
) }.into());
}
let total_p = specs.iter().map(|spec| spec.design.ncols()).sum::<usize>();
if gradient.len() != total_p {
return Err(CustomFamilyError::DimensionMismatch { reason: format!(
"exact-newton joint stationarity check from gradient: joint gradient length mismatch, got {}, expected {}",
gradient.len(),
total_p
) }.into());
}
// Same KKT projection as `exact_newton_joint_stationarity_inf_norm`:
// multipliers at active lower bounds are not convergence defects, so we
// measure only the free-set residual. See `projected_stationarity_inf_norm`
// for the tolerance choice and its parallel with `projected_gradient_norm`
// in `pirls.rs`.
//
// The optional `block_active_sets` arrives from the joint-Newton inner
// loop's `cached_active_sets` and carries the QP solver's authoritative
// active rows per block. Threading it through is what makes the
// stationarity test correctly fire at the constrained optimum: a damped
// constrained step may commit β with row slacks slightly above the slack
// tolerance even though the QP identified the rows as binding, and
// slack-based detection alone then misses the rows and leaves the
// Lagrange-multiplier mass in the residual.
let mut inf_norm = 0.0_f64;
let mut offset = 0usize;
for b in 0..states.len() {
let width = specs[b].design.ncols();
let mut residual =
s_lambdas[b].dot(&states[b].beta) - gradient.slice(ndarray::s![offset..offset + width]);
if ridge_policy.include_quadratic_penalty && ridge > 0.0 {
residual += &states[b].beta.mapv(|v| ridge * v);
}
let block_active_hint = block_active_sets
.and_then(|sets| sets.get(b))
.and_then(|opt| opt.as_deref());
let block_inf = projected_stationarity_inf_norm(
&residual,
&states[b].beta,
block_constraints[b].as_ref(),
block_active_hint,
);
inf_norm = inf_norm.max(block_inf);
offset += width;
}
Ok(inf_norm)
}
fn exact_newton_joint_stationarity_vector_from_gradient(
gradient: &Array1<f64>,
states: &[ParameterBlockState],
specs: &[ParameterBlockSpec],
s_lambdas: &[Array2<f64>],
ridge: f64,
ridge_policy: RidgePolicy,
) -> Result<Array1<f64>, String> {
if states.len() != specs.len() || states.len() != s_lambdas.len() {
return Err(
"exact-newton joint stationarity vector from gradient: block dimension mismatch"
.to_string(),
);
}
let total_p = specs.iter().map(|spec| spec.design.ncols()).sum::<usize>();
if gradient.len() != total_p {
return Err(CustomFamilyError::DimensionMismatch { reason: format!(
"exact-newton joint stationarity vector from gradient: joint gradient length mismatch, got {}, expected {}",
gradient.len(),
total_p
) }.into());
}
let mut residual = Array1::<f64>::zeros(total_p);
let mut offset = 0usize;
for b in 0..states.len() {
let width = specs[b].design.ncols();
let start = offset;
let end = offset + width;
let mut block = s_lambdas[b].dot(&states[b].beta) - gradient.slice(ndarray::s![start..end]);
if ridge_policy.include_quadratic_penalty && ridge > 0.0 {
block += &states[b].beta.mapv(|v| ridge * v);
}
residual.slice_mut(ndarray::s![start..end]).assign(&block);
offset = end;
}
Ok(residual)
}
fn exact_newton_joint_projected_stationarity_vector_from_gradient(
gradient: &Array1<f64>,
states: &[ParameterBlockState],
specs: &[ParameterBlockSpec],
s_lambdas: &[Array2<f64>],
ridge: f64,
ridge_policy: RidgePolicy,
block_constraints: &[Option<LinearInequalityConstraints>],
block_active_sets: Option<&[Option<Vec<usize>>]>,
) -> Result<Array1<f64>, String> {
if states.len() != specs.len()
|| states.len() != s_lambdas.len()
|| states.len() != block_constraints.len()
{
return Err(
"exact-newton projected stationarity vector from gradient: block dimension mismatch"
.to_string(),
);
}
if let Some(sets) = block_active_sets
&& sets.len() != states.len()
{
return Err(CustomFamilyError::DimensionMismatch { reason: format!(
"exact-newton projected stationarity vector from gradient: active-set count mismatch, got {}, expected {}",
sets.len(),
states.len()
) }.into());
}
let total_p = specs.iter().map(|spec| spec.design.ncols()).sum::<usize>();
if gradient.len() != total_p {
return Err(CustomFamilyError::DimensionMismatch { reason: format!(
"exact-newton projected stationarity vector from gradient: joint gradient length mismatch, got {}, expected {}",
gradient.len(),
total_p
) }.into());
}
let mut residual = Array1::<f64>::zeros(total_p);
let mut offset = 0usize;
for b in 0..states.len() {
let width = specs[b].design.ncols();
let start = offset;
let end = offset + width;
let mut block = s_lambdas[b].dot(&states[b].beta) - gradient.slice(ndarray::s![start..end]);
if ridge_policy.include_quadratic_penalty && ridge > 0.0 {
block += &states[b].beta.mapv(|v| ridge * v);
}
if let Some(constraints) = block_constraints[b].as_ref() {
let block_active_hint = block_active_sets
.and_then(|sets| sets.get(b))
.and_then(|opt| opt.as_deref());
block = projected_linear_constraint_stationarity_vector(
&block,
&states[b].beta,
constraints,
block_active_hint,
)
.ok_or_else(|| {
format!("exact-newton projected stationarity vector: failed to project block {b}")
})?;
}
residual.slice_mut(ndarray::s![start..end]).assign(&block);
offset = end;
}
Ok(residual)
}
/// Builds the free-space-projected KKT residual for the IFT correction by
/// re-evaluating the family at `states`.
///
/// The active set in `block_active_sets` is consumed by the inner projection,
/// so the returned vector lies in `range(I − P_normal_cone)`. Returning a
/// [`crate::solver::estimate::reml::unified::ProjectedKktResidual`] makes that
/// invariant visible at every call site: callers cannot forget to project,
/// and `reml/unified.rs` cannot accidentally accept an unprojected vector.
///
/// Returns `Ok(None)` when no joint gradient can be extracted from the
/// evaluation, or when the projected residual is not finite.
fn exact_newton_joint_kkt_residual_for_ift<F: CustomFamily + ?Sized>(
    family: &F,
    specs: &[ParameterBlockSpec],
    states: &[ParameterBlockState],
    s_lambdas: &[Array2<f64>],
    ridge: f64,
    ridge_policy: RidgePolicy,
    block_active_sets: Option<&[Option<Vec<usize>>]>,
) -> Result<Option<ProjectedKktResidual>, String> {
    let eval = family.evaluate(states)?;
    let gradient = match exact_newton_joint_gradient_from_eval(&eval, specs, states)? {
        Some(gradient) => gradient,
        None => return Ok(None),
    };
    let block_constraints = collect_block_linear_constraints(family, states, specs)?;
    exact_newton_joint_projected_kkt_residual_for_ift_from_gradient(
        &gradient,
        specs,
        states,
        s_lambdas,
        ridge,
        ridge_policy,
        &block_constraints,
        block_active_sets,
    )
}
/// Same result as `exact_newton_joint_kkt_residual_for_ift`, but reuses
/// `cached_gradient` when one is supplied instead of re-evaluating the family.
///
/// With a cached gradient only the block constraints are collected before the
/// projection; without one the call is forwarded to the re-evaluating variant.
fn exact_newton_joint_kkt_residual_for_ift_from_cached_gradient<F: CustomFamily + ?Sized>(
    family: &F,
    specs: &[ParameterBlockSpec],
    states: &[ParameterBlockState],
    s_lambdas: &[Array2<f64>],
    ridge: f64,
    ridge_policy: RidgePolicy,
    block_active_sets: Option<&[Option<Vec<usize>>]>,
    cached_gradient: Option<&Array1<f64>>,
) -> Result<Option<ProjectedKktResidual>, String> {
    match cached_gradient {
        Some(gradient) => {
            let block_constraints = collect_block_linear_constraints(family, states, specs)?;
            exact_newton_joint_projected_kkt_residual_for_ift_from_gradient(
                gradient,
                specs,
                states,
                s_lambdas,
                ridge,
                ridge_policy,
                &block_constraints,
                block_active_sets,
            )
        }
        None => exact_newton_joint_kkt_residual_for_ift(
            family,
            specs,
            states,
            s_lambdas,
            ridge,
            ridge_policy,
            block_active_sets,
        ),
    }
}
/// Projects the joint stationarity residual and wraps it as a
/// `ProjectedKktResidual`, provided every entry is finite.
///
/// A residual containing NaN or infinite entries is logged at `warn` level
/// and reported as `Ok(None)` rather than as an error.
fn exact_newton_joint_projected_kkt_residual_for_ift_from_gradient(
    gradient: &Array1<f64>,
    specs: &[ParameterBlockSpec],
    states: &[ParameterBlockState],
    s_lambdas: &[Array2<f64>],
    ridge: f64,
    ridge_policy: RidgePolicy,
    block_constraints: &[Option<LinearInequalityConstraints>],
    block_active_sets: Option<&[Option<Vec<usize>>]>,
) -> Result<Option<ProjectedKktResidual>, String> {
    let residual = exact_newton_joint_projected_stationarity_vector_from_gradient(
        gradient,
        states,
        specs,
        s_lambdas,
        ridge,
        ridge_policy,
        block_constraints,
        block_active_sets,
    )?;
    let has_non_finite = residual.iter().any(|v| !v.is_finite());
    if !has_non_finite {
        return Ok(Some(ProjectedKktResidual::from_active_projected(residual)));
    }
    // Surface this clearly: a non-finite projected residual reaches the
    // unified evaluator as `kkt_residual = None`, which makes the
    // envelope-consistency tripwire fire with "no projected residual" as the
    // suspected cause. Emitting counts and magnitude keeps the failure
    // diagnosable from a single log line.
    let mut nan_count = 0usize;
    let mut inf_count = 0usize;
    let mut finite_max = 0.0_f64;
    for &value in residual.iter() {
        if value.is_nan() {
            nan_count += 1;
        } else if value.is_infinite() {
            inf_count += 1;
        } else {
            finite_max = finite_max.max(value.abs());
        }
    }
    log::warn!(
        "[exact-newton kkt-residual projection] dropping projected KKT residual to None: \
         len={} nan_count={} inf_count={} finite_max={:.3e}. The unified evaluator will \
         treat this convergent path as if no residual were available, which silently \
         disables the IFT correction and can trip the envelope-gradient consistency check \
         on near-singular H. Investigate which block produced the non-finite entry.",
        residual.len(),
        nan_count,
        inf_count,
        finite_max,
    );
    Ok(None)
}
fn compute_joint_covariance<F: CustomFamily + Clone + Send + Sync + 'static>(
family: &F,
specs: &[ParameterBlockSpec],
states: &[ParameterBlockState],
per_block_log_lambdas: &[Array1<f64>],
options: &BlockwiseFitOptions,
) -> Result<Array2<f64>, String> {
let ranges = block_param_ranges(specs);
let total = ranges.last().map(|(_, e)| *e).unwrap_or(0);
let Some(mut h) = exact_newton_joint_hessian_symmetrized(
family,
states,
specs,
total,
"joint exact-newton Hessian shape mismatch in covariance",
)?
else {
return Err(
"joint covariance requires an exact analytic Hessian; objective perturbation is forbidden"
.to_string(),
);
};
for (b, spec) in specs.iter().enumerate() {
let (start, end) = ranges[b];
let lambdas = per_block_log_lambdas[b].mapv(f64::exp);
let mut s_lambda = Array2::<f64>::zeros((end - start, end - start));
for (k, s) in spec.penalties.iter().enumerate() {
s.add_scaled_to(lambdas[k], &mut s_lambda);
}
h.slice_mut(ndarray::s![start..end, start..end])
.scaled_add(1.0, &s_lambda);
}
symmetrize_dense_in_place(&mut h);
if use_exact_newton_strict_spd(family) {
// #748: the strict posterior precision is `H + S_λ` AT THE CONVERGED
// OPTIMUM. A δ-ridge inverse `(H + S_λ + δI)⁻¹` would mask a genuinely
// non-PD curvature and report it as if it were the posterior
// covariance, biasing every standard error. Instead: eigendecompose and
// **reject** when the precision is genuinely indefinite (a real
// fit-quality failure — the mode is not a strict maximum), and on the
// PSD case return the honest positive-eigenspace pseudo-inverse (the
// structural null space of a penalised model is a flat posterior
// direction, not something to ridge away).
let p = h.nrows();
let (evals, _) = FaerEigh::eigh(&h, Side::Lower).map_err(|e| {
format!("strict pseudo-laplace covariance eigendecomposition failed: {e}")
})?;
let max_abs_eval = evals.iter().fold(0.0_f64, |acc, &ev| acc.max(ev.abs()));
let eps_np = f64::EPSILON * (p as f64) * (p as f64);
let tol = (10.0 * eps_np * max_abs_eval).max(100.0 * f64::EPSILON);
if let Some(&min_eval) = evals
.iter()
.filter(|&&ev| ev < -tol)
.min_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal))
{
let below = evals.iter().filter(|&&ev| ev < -tol).count();
return Err(format!(
"strict pseudo-laplace covariance: joint coefficient Hessian is non-PD at the \
converged optimum ({below} eigenvalue(s) below -tol, min(λ)={min_eval:.6e}, \
max|λ|={max_abs_eval:.6e}, tol={tol:.6e}); the mode is not a strict posterior \
maximum, so the reported covariance would be meaningless — fit-quality failure \
surfaced instead of δ-ridge masking (gam#748)"
));
}
pinv_positive_part(&h, effective_solverridge(options.ridge_floor))
} else {
match inverse_spdwith_retry(&h, effective_solverridge(options.ridge_floor), 8) {
Ok(cov) => Ok(cov),
Err(_) => pinv_positive_part(&h, effective_solverridge(options.ridge_floor)),
}
}
}
/// Computes the joint covariance only when `options.compute_covariance` is
/// set; otherwise returns `Ok(None)` without doing any work.
///
/// A failure of `compute_joint_covariance` is re-wrapped as
/// `CustomFamilyError::InvalidInput` carrying the original message.
fn compute_joint_covariance_required<F: CustomFamily + Clone + Send + Sync + 'static>(
    family: &F,
    specs: &[ParameterBlockSpec],
    states: &[ParameterBlockState],
    per_block_log_lambdas: &[Array1<f64>],
    options: &BlockwiseFitOptions,
) -> Result<Option<Array2<f64>>, CustomFamilyError> {
    if options.compute_covariance {
        match compute_joint_covariance(family, specs, states, per_block_log_lambdas, options) {
            Ok(covariance) => Ok(Some(covariance)),
            Err(e) => Err(CustomFamilyError::InvalidInput {
                context: "compute_joint_covariance_required",
                reason: format!("joint covariance computation failed: {e}"),
            }),
        }
    } else {
        Ok(None)
    }
}
/// Compute joint working-set geometry at convergence for ALO diagnostics.
///
/// The exported geometry is the penalized Hessian `H_lik + Σ_k λ_k S_k`
/// together with working weights / responses (zero-filled for exact-Newton
/// paths, which carry no IRLS pseudo-data).
///
/// Returns `Ok(None)` whenever the inputs are inconsistent or no usable
/// Hessian is available: a spec / log-lambda count mismatch, a failed family
/// evaluation in the single-block path, an unsupported working-set shape, a
/// Hessian or penalty of unexpected dimensions, or a log-lambda vector whose
/// length does not cover the block's penalties.
///
/// # Errors
/// Propagates errors from `exact_newton_joint_hessian_symmetrized`, and
/// fails when a link-wiggle fit has no exact explicit joint Hessian.
fn compute_joint_geometry<F: CustomFamily + Clone + Send + Sync + 'static>(
    family: &F,
    specs: &[ParameterBlockSpec],
    states: &[ParameterBlockState],
    per_block_log_lambdas: &[Array1<f64>],
) -> Result<Option<FitGeometry>, String> {
    if specs.len() != per_block_log_lambdas.len() {
        return Ok(None);
    }
    if specs.len() == 1 {
        // Best effort: a failed evaluation means "no geometry", not an error.
        let Ok(eval) = family.evaluate(states) else {
            return Ok(None);
        };
        let spec = &specs[0];
        let lambdas = per_block_log_lambdas[0].mapv(f64::exp);
        // `lambdas[k]` is indexed once per penalty below. The multi-block
        // path answers a length mismatch with `Ok(None)`; do the same here
        // instead of panicking on an out-of-bounds index.
        if lambdas.len() < spec.penalties.len() {
            return Ok(None);
        }
        // The penalized joint Hessian `H_pen = H_lik + Σ_k λ_k S_k` is the exact
        // mgcv quantity the trace edf `p − Σ_k λ_k·tr(H_pen⁻¹ S_k)` consumes. Two
        // single-block working-set shapes reach here:
        //
        //   * `Diagonal` — IRLS/GLM families expose only the diagonal working
        //     weights, so the likelihood curvature is reconstructed as the
        //     Gauss–Newton gram `XᵀWX`.
        //   * `ExactNewton` — coefficient-space exact-curvature families (CTN
        //     transformation-normal, …) already carry the dense negative
        //     log-likelihood Hessian `−∇²log L = H_lik` directly. Materialize it
        //     and add the penalties, so these families report inference / total
        //     edf instead of dropping geometry (and therefore inference) for the
        //     whole fit (#720).
        let (mut h, working_weights, working_response) = match eval.blockworking_sets.as_slice() {
            [
                BlockWorkingSet::Diagonal {
                    working_response,
                    working_weights,
                },
            ] => {
                let Some(h) = spec
                    .design
                    .xt_diag_x_signed_op(SignedWeightsView::from_array(working_weights))
                    .ok()
                else {
                    return Ok(None);
                };
                (h, working_weights.clone(), working_response.clone())
            }
            [BlockWorkingSet::ExactNewton { hessian, .. }] => {
                let h = hessian.to_dense();
                if h.nrows() != spec.design.ncols() || h.ncols() != spec.design.ncols() {
                    return Ok(None);
                }
                // The exact-Newton block carries no IRLS pseudo-data; the
                // trace edf reads only the penalized Hessian, and the
                // downstream IRLS covariance path is unused for these
                // families (they report dispersion = 1). Match the joint
                // multi-block branch's zero-length convention.
                let working_len = states.first().map(|state| state.eta.len()).unwrap_or(0);
                (h, Array1::zeros(working_len), Array1::zeros(working_len))
            }
            _ => return Ok(None),
        };
        for (k, s) in spec.penalties.iter().enumerate() {
            let s_dense = s.as_dense_cow();
            h.scaled_add(lambdas[k], &*s_dense);
        }
        // Exact-Newton families may return a Hessian assembled from directional
        // callbacks whose off-diagonal entries differ by floating-point order
        // or, for pseudo-Laplace tests, by a deliberately non-symmetric input
        // that is accepted only after symmetrization. Export the same symmetric
        // penalized Hessian used by the determinant/covariance path instead of
        // letting result assembly reject an otherwise valid fit geometry.
        symmetrize_dense_in_place(&mut h);
        return Ok(Some(FitGeometry {
            penalized_hessian: h.into(),
            working_weights,
            working_response,
        }));
    }
    // Multi-block path: a link-wiggle block makes the explicit joint Hessian
    // mandatory, so its absence is an error rather than "no geometry".
    let requires_explicit_joint_hessian = specs.iter().enumerate().any(|(idx, spec)| {
        custom_family_block_role(&spec.name, idx, specs.len())
            == crate::solver::estimate::BlockRole::LinkWiggle
    });
    let total_p: usize = specs.iter().map(|spec| spec.design.ncols()).sum();
    let Some(mut h) = exact_newton_joint_hessian_symmetrized(
        family,
        states,
        specs,
        total_p,
        "compute_joint_geometry",
    )?
    else {
        if requires_explicit_joint_hessian {
            return Err(
                "link-wiggle fits require an exact explicit joint Hessian for posterior sampling"
                    .to_string(),
            );
        }
        return Ok(None);
    };
    let ranges = block_param_ranges(specs);
    for (block_idx, spec) in specs.iter().enumerate() {
        let Some(block_log_lambdas) = per_block_log_lambdas.get(block_idx) else {
            return Ok(None);
        };
        let lambdas = block_log_lambdas.mapv(f64::exp);
        if lambdas.len() != spec.penalties.len() {
            return Ok(None);
        }
        let (start, end) = ranges[block_idx];
        let block_dim = end - start;
        for (penalty_idx, penalty) in spec.penalties.iter().enumerate() {
            let scale = lambdas[penalty_idx];
            if scale == 0.0 {
                continue;
            }
            let dense = penalty.as_dense_cow();
            // A penalty is either block-local (added onto the block's
            // diagonal slice) or already full joint width.
            if dense.nrows() == block_dim && dense.ncols() == block_dim {
                h.slice_mut(ndarray::s![start..end, start..end])
                    .scaled_add(scale, &*dense);
            } else if dense.nrows() == total_p && dense.ncols() == total_p {
                h.scaled_add(scale, &*dense);
            } else {
                return Ok(None);
            }
        }
    }
    let working_len = states.first().map(|state| state.eta.len()).unwrap_or(0);
    Ok(Some(FitGeometry {
        penalized_hessian: h.into(),
        working_weights: Array1::zeros(working_len),
        working_response: Array1::zeros(working_len),
    }))
}
/// Fits a custom family with a flat prior on the log smoothing parameters.
///
/// Convenience entry point: forwards to `fit_custom_family_with_rho_prior`
/// with `RhoPrior::Flat` and returns its result unchanged.
pub fn fit_custom_family<F: CustomFamily + Clone + Send + Sync + 'static>(
    family: &F,
    specs: &[ParameterBlockSpec],
    options: &BlockwiseFitOptions,
) -> Result<crate::solver::estimate::UnifiedFitResult, CustomFamilyError> {
    let flat_prior = crate::types::RhoPrior::Flat;
    fit_custom_family_with_rho_prior(family, specs, options, flat_prior)
}
/// Lifts reduced-space `ParameterBlockState`s back to the raw block
/// dimensions described by `canonical.gauge`.
///
/// Each block's `beta` becomes `T_i · θ_i` (a selection-T zeros the dropped
/// raw entries). `eta = design · beta` is invariant under the transform, so
/// the reduced-space `eta` is carried over unchanged.
fn lift_block_states_to_raw(
    canonical: &crate::solver::identifiability_canonical::CanonicalSpecs,
    reduced: Vec<ParameterBlockState>,
) -> Vec<ParameterBlockState> {
    let theta_blocks: Vec<Array1<f64>> = reduced
        .iter()
        .map(|state| state.beta.clone())
        .collect();
    let raw_betas = canonical.gauge.lift_block_betas(&theta_blocks);
    let mut lifted = Vec::with_capacity(reduced.len());
    for (state, beta_raw) in reduced.into_iter().zip(raw_betas) {
        lifted.push(ParameterBlockState {
            beta: beta_raw,
            eta: state.eta,
        });
    }
    lifted
}
/// Lifts a reduced-space conditional covariance / joint geometry pair back to
/// raw coordinates by sandwiching with the joint block-diagonal transform
/// `T_full = blockdiag(T_i)`.
///
/// A selection-T zero-pads the dropped raw rows/cols; the lifted Hessian is
/// exactly the post-canonicalisation Hessian as seen in raw coordinates and
/// is rank-deficient by construction along the dropped directions (matching
/// the inner-solve geometry the canonical step produced). Working weights
/// and responses pass through untouched, and `None` inputs stay `None`.
fn lift_fit_geometry_to_raw(
    canonical: &crate::solver::identifiability_canonical::CanonicalSpecs,
    covariance_conditional: Option<Array2<f64>>,
    geometry: Option<FitGeometry>,
) -> (Option<Array2<f64>>, Option<FitGeometry>) {
    let gauge = &canonical.gauge;
    let lifted_cov = covariance_conditional.map(|reduced_cov| gauge.lift_covariance(&reduced_cov));
    let lifted_geom = geometry.map(
        |FitGeometry {
             penalized_hessian,
             working_weights,
             working_response,
         }| {
            let reduced_hessian = penalized_hessian.into_array();
            let raw_hessian = gauge.lift_covariance(&reduced_hessian);
            FitGeometry {
                penalized_hessian: raw_hessian.into(),
                working_weights,
                working_response,
            }
        },
    );
    (lifted_cov, lifted_geom)
}
/// Everything `assemble_custom_family_fit_result` needs, besides the inner
/// solve result itself, to build the final `UnifiedFitResult`.
///
/// The struct is destructured in one go by the assembler; grouping the values
/// here keeps that function's signature at two arguments.
struct BlockwiseFitAssembly<'a> {
/// Log smoothing parameters (ρ). The assembler exponentiates them
/// elementwise to obtain `lambdas`.
rho_physical: Array1<f64>,
/// Conditional covariance, if one was computed. Lifted to raw coordinates
/// when `canonical` is `Some`, passed through otherwise.
covariance_conditional: Option<Array2<f64>>,
/// Joint fit geometry, if one was computed. Read by `reduced_blockwise_edf`
/// and then lifted to raw coordinates when `canonical` is `Some`.
geometry: Option<FitGeometry>,
/// Canonicalisation result. When `Some`, block states, covariance and
/// geometry are lifted through it; when `None` they are used as-is and no
/// precomputed edf is produced.
canonical: Option<&'a crate::solver::identifiability_canonical::CanonicalSpecs>,
/// Block specs handed to `blockwise_fit_from_parts` together with the
/// assembled parts.
result_specs: &'a [ParameterBlockSpec],
/// Forwarded verbatim into `BlockwiseFitResultParts`.
penalized_objective: f64,
/// Forwarded verbatim into `BlockwiseFitResultParts`.
outer_iterations: usize,
/// Forwarded verbatim into `BlockwiseFitResultParts`.
outer_gradient_norm: Option<f64>,
/// Forwarded verbatim into `BlockwiseFitResultParts`.
criterion_certificate: Option<crate::solver::outer_strategy::CriterionCertificate>,
/// Forwarded verbatim into `BlockwiseFitResultParts`.
outer_converged: bool,
/// Static label attached to `CustomFamilyError::Optimization` when
/// `blockwise_fit_from_parts` fails.
context: &'static str,
}
/// Turn an inner blockwise solve plus its outer-loop bookkeeping into the
/// final `UnifiedFitResult`.
///
/// Steps, in order:
/// 1. `lambdas = exp(rho_physical)` and `log_lambdas = ln(max(lambdas, 1e-300))`.
/// 2. If a canonicalisation is supplied, compute the reduced-space blockwise
///    edf from the (still reduced) geometry, then lift block states,
///    covariance and geometry to raw coordinates. Without one, everything is
///    used as-is and no precomputed edf is passed on.
/// 3. Hand the parts to `blockwise_fit_from_parts` with `result_specs`.
///
/// # Errors
/// A failure of `blockwise_fit_from_parts` is wrapped in
/// `CustomFamilyError::Optimization` tagged with `assembly.context`.
fn assemble_custom_family_fit_result(
    inner: BlockwiseInnerResult,
    assembly: BlockwiseFitAssembly<'_>,
) -> Result<crate::solver::estimate::UnifiedFitResult, CustomFamilyError> {
    let BlockwiseFitAssembly {
        rho_physical,
        covariance_conditional: reduced_covariance,
        geometry: reduced_geometry,
        canonical,
        result_specs,
        penalized_objective,
        outer_iterations,
        outer_gradient_norm,
        criterion_certificate,
        outer_converged,
        context,
    } = assembly;

    let lambdas = rho_physical.mapv(f64::exp);
    let log_lambdas = lambdas.mapv(|lambda| lambda.max(1e-300).ln());

    let (block_states, covariance_conditional, geometry, precomputed_edf) = match canonical {
        Some(gauge_specs) => {
            // The edf must be read off the reduced geometry before the lift
            // below consumes it.
            let edf = reduced_blockwise_edf(reduced_geometry.as_ref(), gauge_specs, &lambdas);
            let raw_states = lift_block_states_to_raw(gauge_specs, inner.block_states);
            let (raw_covariance, raw_geometry) =
                lift_fit_geometry_to_raw(gauge_specs, reduced_covariance, reduced_geometry);
            (raw_states, raw_covariance, raw_geometry, edf)
        }
        None => (
            inner.block_states,
            reduced_covariance,
            reduced_geometry,
            None,
        ),
    };

    let parts = BlockwiseFitResultParts {
        block_states,
        log_likelihood: inner.log_likelihood,
        log_lambdas,
        lambdas,
        covariance_conditional,
        stable_penalty_term: 2.0 * inner.penalty_value,
        penalized_objective,
        outer_iterations,
        outer_gradient_norm,
        criterion_certificate,
        inner_cycles: inner.cycles,
        outer_converged,
        geometry,
        precomputed_edf,
    };

    blockwise_fit_from_parts(parts, result_specs)
        .map_err(|reason| CustomFamilyError::Optimization { context, reason })
}
/// Attach channel-aware `AdditiveBlockJacobian` callbacks to the blocks of a
/// family that declares a [`CustomFamily::output_channel_assignment`].
///
/// Multi-output families whose specs are built by hand (or via the low-level
/// `fit_custom_family` API) declare each block's output channel through that
/// hook, so the pre-fit identifiability audit can route channel-aware instead
/// of reading a shared covariate basis as a cross-block alias (#558). A block
/// that already has a `jacobian_callback` is never overwritten: the family
/// wired its own, possibly β-dependent, multi-output Jacobian.
///
/// # Returns
/// * `Ok(None)` when nothing needs wiring, so the caller keeps borrowing the
///   original specs without an allocation. This covers three cases: the family
///   declares no assignment, the assignment spans a single output channel, or
///   every block already carries a callback.
/// * `Ok(Some(wired))` with a cloned spec list in which every callback-less
///   block received an `AdditiveBlockJacobian`.
///
/// # Errors
/// * Whatever `validate_blockspecs` reports for `specs`.
/// * `CustomFamilyError::DimensionMismatch` when the assignment length differs
///   from the block count, or when a block's effective design cannot be built.
fn wire_output_channels<F: CustomFamily + ?Sized>(
    family: &F,
    specs: &[ParameterBlockSpec],
) -> Result<Option<Vec<ParameterBlockSpec>>, CustomFamilyError> {
    validate_blockspecs(specs)?;

    let channels = match family.output_channel_assignment(specs) {
        Some(assignment) => assignment,
        None => return Ok(None),
    };

    let (n_channels, n_blocks) = (channels.len(), specs.len());
    if n_channels != n_blocks {
        return Err(CustomFamilyError::DimensionMismatch {
            reason: format!(
                "output_channel_assignment returned {} channels for {} blocks",
                n_channels, n_blocks,
            ),
        });
    }

    // Channel ids are zero-based, so the output count is `max + 1`; an empty
    // assignment counts as the single-output flat route.
    let n_family_outputs = match channels.iter().copied().max() {
        Some(highest) => highest + 1,
        None => 1,
    };

    // Nothing to wire on the flat route, and no reason to clone the specs when
    // every block already brings its own (family-wired) callback.
    if n_family_outputs <= 1 || specs.iter().all(|s| s.jacobian_callback.is_some()) {
        return Ok(None);
    }

    let mut wired = specs.to_vec();
    let blocks_with_channel = wired.iter_mut().zip(channels.iter().copied());
    for (idx, (spec, own_output)) in blocks_with_channel.enumerate() {
        // Family-supplied callbacks (e.g. multinomial / location-scale) win.
        if spec.jacobian_callback.is_some() {
            continue;
        }
        // With no callback, the block's effective design at β=0 is exactly its
        // linear design: the additive-block Jacobian of an `η_r = X_r β_r`
        // channel.
        let design = match spec.effective_design("wire_output_channels") {
            Ok(dense) => dense,
            Err(e) => {
                return Err(CustomFamilyError::DimensionMismatch {
                    reason: format!("block {idx} effective design for channel wiring: {e}"),
                });
            }
        };
        spec.jacobian_callback = Some(Arc::new(AdditiveBlockJacobian {
            design,
            own_output,
            n_family_outputs,
        }));
    }
    Ok(Some(wired))
}
/// Decide whether an outer-smoothing `Err` is a POST-AUDIT NUMERICAL pathology
/// that the never-fail posterior-sampling rung can recover from (gam#860), as
/// opposed to an ill-posed input that must keep raising.
///
/// Only `EstimationError::RemlOptimizationFailed` can qualify; every other
/// variant returns `false`. The decision is made on the message text:
///
/// * **Startup signatures** always escalate: "no candidate seeds passed outer
///   startup validation" and "objective returned a non-finite cost". All
///   structural guards (the #531-class identifiability audit, the #789B
///   zero-events guard, the #859 cross-fit alignment check) raise BEFORE the
///   outer solver runs, so when every seed is rejected during exact-eval
///   validation the design is structurally well-posed and a posterior mode
///   exists to sample about.
/// * **Coupled exact-joint non-convergence** ("exited the joint Newton path" /
///   "exhausted the joint Newton budget") escalates only when the message
///   carries no structural diagnosis. Both exits are the same #787-class
///   weak-identification floor reached two ways: a data-driven inner
///   non-convergence on an audited design, surfaced as a hard `Err` rather
///   than the `Ok(!inner_converged)` retreat sentinel.
/// * The same coupled-joint prefixes are also emitted for GENUINELY STRUCTURAL
///   refusals, identified by a trailing `; diagnosis: <label>` slot. A
///   rank-deficient joint design, an unresolved active set, or a cross-block
///   alias found at fit time leaves the mode itself degenerate, so sampling
///   about it cannot help and those keep the hard raise.
fn outer_startup_failure_is_escalatable(err: &EstimationError) -> bool {
    /// Seed-screening failures: escalatable unconditionally.
    const STARTUP_SIGNATURES: [&str; 2] = [
        "no candidate seeds passed outer startup validation",
        "objective returned a non-finite cost",
    ];
    /// Coupled exact-joint inner non-convergence exits.
    const COUPLED_JOINT_SIGNATURES: [&str; 2] = [
        "coupled exact-joint inner solve exited the joint Newton path",
        "coupled exact-joint inner solve exhausted the joint Newton budget",
    ];
    /// Diagnosis labels that mark a coupled-joint failure as structural.
    const STRUCTURAL_DIAGNOSES: [&str; 3] = [
        "diagnosis: rank_deficient_H_pen",
        "diagnosis: active_set_incomplete",
        "diagnosis: aliasing_detected_at_fit",
    ];

    let EstimationError::RemlOptimizationFailed(message) = err else {
        return false;
    };

    if STARTUP_SIGNATURES
        .iter()
        .any(|needle| message.contains(*needle))
    {
        return true;
    }

    let coupled_joint_failure = COUPLED_JOINT_SIGNATURES
        .iter()
        .any(|needle| message.contains(*needle));
    let structural_diagnosis = STRUCTURAL_DIAGNOSES
        .iter()
        .any(|needle| message.contains(*needle));
    coupled_joint_failure && !structural_diagnosis
}
/// Minimum effective degrees of freedom a penalized term must retain in the
/// outer λ-selection. One effective dimension is the smallest non-arbitrary
/// floor: it asserts the penalized component must explain at least ONE effective
/// direction of its own range space, i.e. it has not collapsed entirely onto its
/// unpenalized polynomial null space. It is NOT a tuning constant — `1.0` is the
/// boundary between "the smooth contributes" and "the smooth is statistically
/// indistinguishable from its null-space limit".
///
/// Consumed by `effective_df_floor_rho_upper_bounds`, which compares the
/// structural edf from `unit_weight_term_edf` against this value to derive the
/// per-coordinate ρ upper bounds.
const EFFECTIVE_DF_FLOOR: f64 = 1.0;
/// Structural (unit-weight) effective degrees of freedom of one penalized term
/// at `ρ = log λ`, given the design/penalty generalized eigenvalues `γ_j` on
/// the penalty range space:
///
/// ```text
/// edf(ρ) = Σ_j γ_j / (γ_j + e^ρ),   γ_j = (design range curvature)_j / (penalty)_j.
/// ```
///
/// Entries of `gammas` that are not strictly positive (zero, negative, NaN)
/// contribute nothing. `ρ = −∞` therefore counts the positive entries, and
/// `ρ = +∞` yields zero.
///
/// This edf is data-FREE: it is built from the design column Gram `XᵀX` (unit
/// weights), NOT the family's Fisher weight, so it does not depend on where
/// the inner solve sits on a near-flat Fisher surface. It is the quantity
/// whose collapse the #715/#684 over-shrinkage describes — when the Fisher
/// curvature vanishes the REML objective flattens in ρ and the optimizer lets
/// λ drift past the point where this structural edf falls below the floor.
fn unit_weight_term_edf(gammas: &[f64], rho: f64) -> f64 {
    let lambda = f64::exp(rho);
    // Per-direction retained fraction; unsupported directions retain nothing.
    let retained = |gamma: f64| -> f64 {
        if gamma > 0.0 {
            gamma / (gamma + lambda)
        } else {
            0.0
        }
    };
    gammas.iter().copied().map(retained).sum()
}
/// Structural (unit-weight) generalized eigenvalues `γ_j` of the design column
/// Gram `G = XᵀX` against the penalty `S`, restricted to `range(S)`.
///
/// Writing `S = U D Uᵀ` and keeping only the positive eigenvalues `d_j` (the
/// range of `S`), the `γ_j` are the eigenvalues of the pencil `(UᵀG U, D)`,
/// i.e. of the symmetric matrix
///
/// ```text
/// B = D^{-1/2} (Uᵀ G U) D^{-1/2}        restricted to range(S).
/// ```
///
/// With these `γ_j` the structural effective df satisfies the EXACT trace
/// identity
///
/// ```text
/// Σ_j γ_j/(γ_j + λ) = tr{ G (G + λ S)⁻¹ }     for all λ > 0.
/// ```
///
/// These are NOT the per-direction Rayleigh quotients `(u_jᵀ G u_j)/d_j`.
/// Those keep only the diagonal of `B` and are right only when `G` and `S`
/// commute (are simultaneously diagonalizable). Smooth Gram/penalty pairs
/// generally do not, so the off-diagonal coupling of `B` has to be kept — it
/// is what makes the eigenvalue sum agree with the trace identity.
///
/// # Returns
/// `Some(gammas)` with one non-negative entry per range-space direction
/// (non-finite or non-positive eigenvalues of `B` are reported as `0.0`).
///
/// `None` — the caller then keeps the uniform ρ bound — whenever the geometry
/// cannot be materialized safely as a `p×p` block-local pair: an empty design,
/// a dense penalty or design whose shape does not match `p` (Kronecker
/// penalties are expanded, but `Blockwise`/total-dim layouts are skipped
/// rather than risk a mis-projected curvature that could bias the REML
/// selection), a failed eigendecomposition, a zero penalty, or an empty range
/// space.
fn design_penalty_range_gammas(design: &DesignMatrix, penalty: &PenaltyMatrix) -> Option<Vec<f64>> {
    let p = design.ncols();
    if p == 0 {
        return None;
    }

    // Blockwise/total-dim layout or shape mismatch: not safely projectable.
    let s_dense = penalty.to_dense();
    if (s_dense.nrows(), s_dense.ncols()) != (p, p) {
        return None;
    }
    let x = design.to_dense();
    if x.ncols() != p {
        return None;
    }
    let gram = x.t().dot(&x);

    // Spectral form of the penalty, S = U D Uᵀ, to locate its range space.
    let (s_evals, s_evecs) = s_dense.eigh(Side::Lower).ok()?;
    let s_max = s_evals.iter().fold(0.0_f64, |acc, &d| acc.max(d.abs()));
    if !(s_max > 0.0) {
        return None;
    }
    let s_thresh = positive_eigenvalue_threshold(s_evals.as_slice()?);

    // Range-space eigenvector indices paired with d_j^{-1/2}. Directions with
    // d_j at or below the numerical-zero threshold lie in ker(S): they are
    // unpenalized and do not enter this term's structural edf.
    let range_dirs: Vec<(usize, f64)> = s_evals
        .iter()
        .enumerate()
        .filter(|&(_, &d)| !(d <= s_thresh))
        .map(|(j, &d)| (j, 1.0 / d.sqrt()))
        .collect();
    let r = range_dirs.len();
    if r == 0 {
        return None;
    }

    // Y = U_r D_r^{-1/2} (p×r): scaling the eigenvector columns up front makes
    // B = Yᵀ G Y (r×r) the symmetric pencil matrix directly.
    let mut y = Array2::<f64>::zeros((p, r));
    for (col, &(src, weight)) in range_dirs.iter().enumerate() {
        let u = s_evecs.column(src);
        let mut y_col = y.column_mut(col);
        for (dst, &u_row) in y_col.iter_mut().zip(u.iter()) {
            *dst = u_row * weight;
        }
    }

    // B is symmetric by construction; average the mirrored entries anyway so
    // round-off cannot leak into the symmetric eigensolver.
    let mut b_sym = y.t().dot(&gram).dot(&y);
    for i in 0..r {
        for j in (i + 1)..r {
            let mean = 0.5 * (b_sym[(i, j)] + b_sym[(j, i)]);
            b_sym[(i, j)] = mean;
            b_sym[(j, i)] = mean;
        }
    }

    // Eigenvalues of B are the γ_j. A penalized direction with no design
    // support has γ→0 (edf→0 for any λ>0, so bounding ρ cannot floor it);
    // tiny negative round-off and non-finite values are reported as 0 and
    // never contribute to the retained df sum.
    let (b_evals, _) = b_sym.eigh(Side::Lower).ok()?;
    let gammas: Vec<f64> = b_evals
        .iter()
        .map(|&g| if g.is_finite() && g > 0.0 { g } else { 0.0 })
        .collect();
    if gammas.is_empty() {
        None
    } else {
        Some(gammas)
    }
}
/// Per-outer-coordinate ρ UPPER bound enforcing the effective-df floor.
///
/// For each penalized term, the structural unit-weight edf `Σ_j γ_j/(γ_j+e^ρ)`
/// is monotone decreasing in ρ. The bound is the ρ at which it equals
/// `EFFECTIVE_DF_FLOOR`, found by bisection on the closed-form edf over
/// `[−ceiling, ceiling]`. Tied coordinates (shared precision label) take the
/// TIGHTEST (smallest) per-term bound: the shared λ must retain the floor for
/// EVERY contributing term, so the binding constraint is the most restrictive
/// one — relaxing to a looser term's bound would let some other term fall
/// below its floor. Every coordinate is additionally capped at the caller's
/// uniform `ceiling`, so this can only TIGHTEN, never loosen, the existing
/// bound.
///
/// A term keeps the uniform `ceiling` (no tightening) when any of these holds:
/// * its penalty is fixed (no outer coordinate in `layout`);
/// * `design_penalty_range_gammas` cannot project its geometry;
/// * its maximum attainable edf (ρ → −∞) does not exceed the floor;
/// * its edf at `ceiling` is still at or above the floor;
/// * its edf at `−ceiling` is already at or below the floor, i.e. the floor
///   cannot be met anywhere inside the box. This case is checked explicitly
///   before bisecting: without a valid bracket the bisection would collapse
///   onto `−ceiling`, and whether it lands exactly there or one ulp above
///   depends on floating-point tie rounding. One ulp above would shrink the
///   coordinate's box to a degenerate interval.
///
/// This enters ONLY the λ-selection domain. The inner β solve is exact
/// CONDITIONAL on the selected λ, so there is no per-λ approximation (same
/// discipline as the #747 solver-only ridge). It is NOT, however, a bias-free
/// no-op: whenever the unconstrained REML optimum lies beyond this upper bound,
/// the bound changes the SELECTED λ, and the selected λ changes the fitted
/// β̂ = argmin{−ℓ + ½λ βᵀSβ} (∂β̂/∂λ = −(H + λS)⁻¹ S β̂ ≠ 0). The floor is an
/// explicit smoothing-regularization constraint on the λ-selection — it
/// deliberately moves the estimate away from the (flat-Fisher) null-space
/// collapse, not a transparent reparameterization. It is the λ-upper-side dual
/// of the #752 full-subspace logdet work — there the value/gradient subspace
/// was fixed on the λ→∞ side of a near-collinear block; here the selection
/// domain is bounded so a flat Fisher surface cannot push a term past
/// null-space collapse (#715/#684).
///
/// # Parameters
/// * `specs` — blocks whose `penalties` are walked in order; the running
///   penalty index is the "physical" index looked up in `layout`.
/// * `layout` — `physical_to_outer[k]` maps physical penalty `k` to its outer
///   coordinate, or `None` for a fixed penalty.
/// * `n_rho` — number of outer coordinates (length of the returned array).
/// * `ceiling` — uniform bound; the search box is `[−ceiling, ceiling]`.
///
/// # Returns
/// An array of length `n_rho` whose entries are at most `ceiling`.
fn effective_df_floor_rho_upper_bounds(
    specs: &[ParameterBlockSpec],
    layout: &PenaltyLabelLayout,
    n_rho: usize,
    ceiling: f64,
) -> Array1<f64> {
    let mut upper = Array1::<f64>::from_elem(n_rho, ceiling);
    let mut physical = 0usize;
    for spec in specs {
        for penalty in &spec.penalties {
            let outer = layout.physical_to_outer.get(physical).copied().flatten();
            physical += 1;
            let Some(outer) = outer else {
                continue; // fixed penalty: not an outer coordinate.
            };
            let Some(gammas) = design_penalty_range_gammas(&spec.design, penalty) else {
                continue; // un-projectable geometry: keep the uniform ceiling.
            };
            let edf_at = |rho: f64| unit_weight_term_edf(&gammas, rho);

            // Maximum attainable structural edf (ρ → −∞) is the number of
            // design-supported penalized directions. If it cannot exceed the
            // floor even unpenalized, the floor is not enforceable for this
            // term (e.g. a single-dimension range space with the floor at its
            // own cap), so keep the uniform ceiling.
            if !(edf_at(f64::NEG_INFINITY) > EFFECTIVE_DF_FLOOR) {
                continue;
            }
            // If edf at the ceiling still meets the floor, the uniform ceiling
            // already retains enough df — keep it.
            if edf_at(ceiling) >= EFFECTIVE_DF_FLOOR {
                continue;
            }
            // The floor must be attainable INSIDE the box for the bisection to
            // have a valid bracket (edf(lo) above the floor, edf(hi) below).
            // If even the lower edge is at or below the floor, no ρ in the box
            // satisfies it; keep the uniform ceiling rather than rely on the
            // bisection collapsing exactly onto −ceiling.
            if !(edf_at(-ceiling) > EFFECTIVE_DF_FLOOR) {
                continue;
            }

            // Bisect for ρ* with edf(ρ*) = floor on [−ceiling, ceiling]; edf is
            // monotone decreasing in ρ.
            let mut lo = -ceiling;
            let mut hi = ceiling;
            for _ in 0..64 {
                let mid = 0.5 * (lo + hi);
                if edf_at(mid) >= EFFECTIVE_DF_FLOOR {
                    lo = mid;
                } else {
                    hi = mid;
                }
            }
            let rho_star = 0.5 * (lo + hi);

            // Tied coordinates: take the tightest (smallest) bound across
            // terms, so every term sharing this λ retains at least the floor.
            // The `> −ceiling` check still rejects a root that rounds onto the
            // lower edge of the box.
            let slot = &mut upper[outer];
            if rho_star > -ceiling && rho_star < *slot {
                *slot = rho_star;
            }
        }
    }
    upper
}
pub fn fit_custom_family_with_rho_prior<F: CustomFamily + Clone + Send + Sync + 'static>(
family: &F,
specs: &[ParameterBlockSpec],
options: &BlockwiseFitOptions,
rho_prior: crate::types::RhoPrior,
) -> Result<crate::solver::estimate::UnifiedFitResult, CustomFamilyError> {
// Multi-output families that omitted the per-block channel callback get it
// installed here from their declared `output_channel_assignment`, so the
// identifiability audit routes channel-aware (single source of truth for
// the channel-wiring; no per-test/per-builder duplication — #558).
let wired = wire_output_channels(family, specs)?;
let raw_specs: &[ParameterBlockSpec] = wired.as_deref().unwrap_or(specs);
validate_blockspecs(raw_specs)?;
// Pre-fit cross-block identifiability canonicalisation. Every
// blockwise fit path in the tree (standard, gaussian/binomial
// location-scale, survival, BMS, transformation-normal, custom
// families) reaches this entry point with a finalised
// `ParameterBlockSpec` list, so wiring the canonicalisation here
// covers all four `solver::workflow.rs` entry points plus every
// direct caller of `fit_custom_family` without each family needing
// its own canonicalisation hook.
//
// Contract: specs arrive *after* `nullspace-lead`'s
// `joint_null_rotation` absorption. The canonical step inspects
// post-rotation columns only, runs the joint RRQR identifiability
// audit, and converts attributed cross-block drops into a per-block
// selection transform `T_i`. The inner solve runs in the reduced
// coordinate space; coefficients and joint geometry are lifted back
// to the raw space at result assembly via `T_i` and the joint
// block-diagonal `T_full = blockdiag(T_i)`.
//
// An audit that is fatal *without* attributed drops (the >2-way
// structural alias case where RRQR couldn't pin redundancy onto a
// single block/column) still aborts: silently absorbing it would
// change model semantics beyond what canonicalisation can repair.
// Per the panic-vs-Err contract: never panic mid-construction.
let canonical_started = std::time::Instant::now();
let canonical_n_rows = raw_specs.first().map(|s| s.design.nrows()).unwrap_or(0);
let canonical_n_cols_raw: usize = raw_specs.iter().map(|s| s.design.ncols()).sum();
log::info!(
"[STAGE] identifiability canonicalise: start blocks={} n={} p_total_raw={}",
raw_specs.len(),
canonical_n_rows,
canonical_n_cols_raw,
);
let canonical =
crate::solver::identifiability_canonical::canonicalize_for_identifiability(raw_specs)?;
let canonical_n_cols_red: usize = canonical
.reduced_specs
.iter()
.map(|s| s.design.ncols())
.sum();
log::info!(
"[STAGE] identifiability canonicalise: end elapsed={:.3}s alias_pairs={} dropped_cols={} \
p_total_raw={} p_total_reduced={} fatal_attributed={}",
canonical_started.elapsed().as_secs_f64(),
canonical.audit.aliased_pairs.len(),
canonical.audit.dropped_columns.len(),
canonical_n_cols_raw,
canonical_n_cols_red,
canonical.audit.fatal,
);
if !canonical.audit.aliased_pairs.is_empty() {
log::info!("[identifiability audit] {}", canonical.audit.summary);
// Aggregate by (block_a, block_b) so the log stays bounded by the
// block-pair count rather than the quadratic direction-pair count
// — a few wide blocks alone produce 100+ pair-lines and bury the
// useful structural signal. INFO carries the cluster shape (count,
// overlap range, perfect-collinearity count); DEBUG prints the
// worst three sample pairs per cluster for forensic users.
let mut by_pair: BTreeMap<(&str, &str), Vec<&_>> = BTreeMap::new();
for pair in &canonical.audit.aliased_pairs {
by_pair
.entry((pair.block_a.as_str(), pair.block_b.as_str()))
.or_default()
.push(pair);
}
for ((a, b), pairs) in &by_pair {
let count = pairs.len();
let max = pairs
.iter()
.map(|p| p.overlap)
.fold(f64::NEG_INFINITY, f64::max);
let min = pairs
.iter()
.map(|p| p.overlap)
.fold(f64::INFINITY, f64::min);
let near_one = pairs.iter().filter(|p| p.overlap >= 0.9999).count();
log::info!(
"[identifiability audit] alias-cluster {a} ~ {b}: {count} direction-pair{plural} \
(overlap {min:.4}..{max:.4}; {near_one} ≥0.9999)",
plural = if count == 1 { "" } else { "s" },
);
}
if log::log_enabled!(log::Level::Debug) {
for ((a, b), pairs) in &by_pair {
let mut sorted = pairs.clone();
sorted.sort_by(|p, q| {
q.overlap
.partial_cmp(&p.overlap)
.unwrap_or(std::cmp::Ordering::Equal)
});
for pair in sorted.iter().take(3) {
log::debug!(
"[identifiability audit] sample {a}[{ai}] ~ {b}[{bi}] overlap={ov:.4}",
ai = pair.direction_a,
bi = pair.direction_b,
ov = pair.overlap,
);
}
}
}
}
for drop in &canonical.audit.dropped_columns {
log::info!(
"[identifiability audit] dropped: block='{}' local_col={} ({})",
drop.block,
drop.column,
drop.reason,
);
}
let specs: &[ParameterBlockSpec] = &canonical.reduced_specs;
let penalty_counts = validate_blockspecs(specs)?;
let label_layout = penalty_label_layout(specs, penalty_counts.clone())?;
let rho0 = label_layout.initial_rho.clone();
let (persistent_warm_start_key, persistent_warm_start) =
load_persistent_custom_family_warm_start::<F>(family, specs, options, rho0.len());
if rho0.is_empty() {
let physical_rho0 = expand_labeled_log_lambdas(&rho0, &label_layout)?;
let per_block = split_labeled_log_lambdas(&rho0, &label_layout)?;
let mut inner = inner_blockwise_fit(
family,
specs,
&per_block,
options,
persistent_warm_start.as_ref(),
)?;
refresh_all_block_etas(family, specs, &mut inner.block_states)?;
let covariance_conditional = compute_joint_covariance_required(
family,
specs,
&inner.block_states,
&per_block,
options,
)?;
let reml_term = if options.use_remlobjective {
0.5 * (inner.block_logdet_h - inner.block_logdet_s)
} else {
0.0
};
let geometry = compute_joint_geometry(family, specs, &inner.block_states, &per_block)
.map_err(|reason| CustomFamilyError::Optimization {
context: "fit_custom_family no-smoothing joint geometry",
reason,
})?;
let penalized_objective = checked_penalizedobjective(
inner.log_likelihood,
inner.penalty_value,
reml_term,
"custom-family fit without smoothing parameters",
)
.map_err(|reason| CustomFamilyError::Optimization {
context: "fit_custom_family no-smoothing penalized objective",
reason,
})?;
let warm_start = constrained_warm_start_from_inner(&rho0, &inner);
store_persistent_custom_family_warm_start(
persistent_warm_start_key.as_deref(),
specs,
&warm_start,
);
let inner_converged = inner.converged;
return assemble_custom_family_fit_result(
inner,
BlockwiseFitAssembly {
rho_physical: physical_rho0,
covariance_conditional,
geometry,
canonical: Some(&canonical),
result_specs: raw_specs,
penalized_objective,
outer_iterations: 0,
outer_gradient_norm: None,
criterion_certificate: None,
outer_converged: inner_converged,
context: "fit_custom_family no-smoothing result assembly",
},
);
}
// Exact Hessians are primary whenever the assembled family can supply them.
// If a particular outer step is ill-conditioned, strategy fallback handles
// the downgrade; we do not suppress second-order capability preemptively
// based on the presence of a wiggle block.
if options.inner_max_cycles <= 1 && options.outer_max_iter <= 1 {
log::info!(
"[OUTER] custom family: skipping smoothing outer solve for explicit one-cycle inner probe"
);
let per_block = split_labeled_log_lambdas(&rho0, &label_layout)?;
let mut inner = inner_blockwise_fit(family, specs, &per_block, options, None)?;
refresh_all_block_etas(family, specs, &mut inner.block_states).map_err(|reason| {
CustomFamilyError::Optimization {
context: "fit_custom_family one-cycle eta refresh",
reason,
}
})?;
let penalized_objective = inner_penalized_objective(
&inner,
include_exact_newton_logdet_h(family, options),
include_exact_newton_logdet_s(family, options),
"custom-family explicit one-cycle inner probe",
)
.map_err(|reason| CustomFamilyError::Optimization {
context: "fit_custom_family one-cycle penalized objective",
reason,
})?;
let physical_rho0 = expand_labeled_log_lambdas(&rho0, &label_layout)?;
let inner_converged = inner.converged;
return assemble_custom_family_fit_result(
inner,
BlockwiseFitAssembly {
rho_physical: physical_rho0,
covariance_conditional: None,
geometry: None,
canonical: Some(&canonical),
result_specs: raw_specs,
penalized_objective,
outer_iterations: 0,
outer_gradient_norm: Some(0.0),
criterion_certificate: None,
outer_converged: inner_converged,
context: "fit_custom_family one-cycle result assembly",
},
);
}
use crate::estimate::EstimationError;
use crate::solver::outer_strategy::{FallbackPolicy, OuterEval, OuterEvalOrder, OuterProblem};
let screening_cap = Arc::new(AtomicUsize::new(0));
let outer_inner_cap = options
.outer_inner_max_iterations
.clone()
.unwrap_or_else(|| Arc::new(AtomicUsize::new(options.inner_max_cycles.max(1))));
outer_inner_cap.store(options.inner_max_cycles.max(1), Ordering::Relaxed);
let mut outer_options = options.clone();
outer_options.screening_max_inner_iterations = Some(Arc::clone(&screening_cap));
outer_options.outer_inner_max_iterations = Some(Arc::clone(&outer_inner_cap));
let n_rho = rho0.len();
let (cap_gradient, cap_hessian) =
custom_family_outer_derivatives(family, specs, &outer_options);
let derivative_policy = family.outer_derivative_policy(specs, 0, &outer_options);
let hessian = cap_hessian;
let need_outer_hessian = hessian.is_analytic();
log::info!(
"[OUTER] custom family derivative-policy: n_params={} gradient={:?} hessian={:?} capability={:?} requested_outer_hessian={} predicted_gradient_work={} predicted_hessian_work={} inner_hvp_available={} outer_hvp_available={} outer_dense_available={}",
n_rho,
cap_gradient,
hessian,
derivative_policy.capability,
need_outer_hessian,
derivative_policy.predicted_gradient_work,
derivative_policy.predicted_hessian_work,
family.inner_coefficient_hessian_hvp_available(specs),
family.outer_hyper_hessian_hvp_available(specs),
family.outer_hyper_hessian_dense_available(specs),
);
let outer_max_iter = cost_gated_first_order_max_iter(
options.outer_max_iter,
family.coefficient_gradient_cost(specs),
need_outer_hessian,
);
let bfgs_step_cap = first_order_bfgs_loglambda_step_cap(need_outer_hessian);
if outer_max_iter < options.outer_max_iter {
log::info!(
"[OUTER] custom family: first-order work gate reduced outer_max_iter {} -> {}",
options.outer_max_iter,
outer_max_iter,
);
}
// EFS / HybridEfs structural property (`H^{-1/2} B_k H^{-1/2} ≽ 0` plus a
// parameter-independent nullspace, Wood-Fasiolo) fails for multi-block
// families whose joint likelihood Hessian depends on β. Disable
// fixed-point only for genuinely first-order capabilities; exact-Hessian
// capabilities route to ARC before EFS is considered.
let multi_block_beta_dependent =
specs.len() > 1 && family.exact_newton_joint_hessian_beta_dependent();
// Exact-Hessian plans must fail on their own terms rather than silently
// retrying on a quasi-Newton surface. First-order-only families keep the
// automatic cascade because there is no second-order geometry to discard.
let fallback_policy = if need_outer_hessian {
FallbackPolicy::Disabled
} else {
FallbackPolicy::Automatic
};
let problem = OuterProblem::new(n_rho)
.with_gradient(cap_gradient)
.with_hessian(hessian)
.with_disable_fixed_point(multi_block_beta_dependent)
.with_fallback_policy(fallback_policy)
.with_tolerance(options.outer_tol)
.with_max_iter(outer_max_iter)
.with_bfgs_step_cap(bfgs_step_cap)
.with_seed_config(family.outer_seed_config(n_rho))
.with_initial_rho(rho0.clone())
.with_screen_initial_rho(options.screen_initial_rho)
// Per-coordinate ρ box bounds. The uniform ceiling of 10 is the
// belt-and-suspenders cap: λ = exp(10) ≈ 22k is already extremely strong
// shrinkage, and the bound keeps the optimizer out of the dead-flat
// λ ≈ 10⁹ region where ARC's quadratic model breaks down, the retry-stall
// detector fires, and downstream empty-block_states crashes surface.
//
// ON TOP of that uniform ceiling, each penalized term's UPPER bound is
// tightened to the ρ at which its structural (unit-weight) effective df
// would fall to one — the EFFECTIVE_DF_FLOOR. Near a flat Fisher surface
// (multinomial simplex boundary diag(p)−ppᵀ→0, #715; Gaussian log-σ on a
// gently-varying scale, #684) the REML criterion loses ρ-curvature and
// the optimizer would otherwise let some λ_{class,term} drift past the
// point where the term collapses onto its unpenalized polynomial null
// space, over-smoothing the cubic/sigmoid/log-σ signal below the mature
// reference. The floor is derived from the penalty RANGE-SPACE
// eigenstructure (design/penalty generalized eigenvalues), not from the
// vanishing Fisher weight, and enters ONLY the λ-selection domain — the
// inner β solve is unchanged and exact CONDITIONAL on the selected ρ
// (cf. the #747 solver-only ridge). It is not a bias-free no-op: when the
// unconstrained REML optimum lies beyond the bound, the selected λ — and
// hence the fitted β̂ — changes. This is the λ-upper-side dual of the
// #752 full-subspace logdet work.
.with_bounds(
Array1::<f64>::from_elem(n_rho, -10.0),
effective_df_floor_rho_upper_bounds(specs, &label_layout, n_rho, 10.0),
);
// Install the seed-screening cap only when initial-rho screening is
// wanted. A caller that pins an already-identified `initial_rho` and
// opts out (`screen_initial_rho == false`) leaves the OuterConfig
// screening cap `None`, so `should_screen_seeds` short-circuits and the
// screening cascade never runs. This is the lever the survival
// constant-scale (parametric-AFT) regime uses: its time-warp ρ seed is
// pinned AT the inner ρ box bound (the affine-baseline limit) on a
// dead-flat, statistically-unidentified time ridge where every capped
// proxy fit collapses to non-finite cost and the cascade escalates to a
// full uncapped inner solve per seed on the near-singular Hessian — the
// multi-minute no-iteration-log stall (#736, #735, #721). With the cap
// unset, the pinned seed flows straight to the outer solver, which
// certifies box-constraint stationarity at iteration 0. Every other
// custom-family caller defaults `screen_initial_rho = true` and keeps
// full screening; genuinely flexible scale/spatial survival fits carry
// log-sigma penalties, never set the flag false, and screen normally.
let problem = if options.screen_initial_rho {
problem.with_screening_cap(Arc::clone(&screening_cap))
} else {
problem
};
// Attach the workflow-level warm-start session if one was threaded
// through. This makes the custom-family outer optimizer (BFGS / ARC
// depending on derivative capabilities) use the same persistent
// cache infrastructure as standard REML — every accepted outer step
// is checkpointed to disk, every fit starts by consulting the disk
// for a prior best iterate. Without this, every survival-marginal-
// slope / GAMLSS / latent fit starts cold even when a converged ρ
// from a near-identical prior fit is sitting in `~/.cache/gam/warm`.
let problem = if let Some(session) = options.cache_session.clone() {
let key_hex = session.key().to_hex();
log::info!(
"[CACHE] attach key={}.. family-tag={} backend=outer-strategy mirrors={}",
&key_hex[..8.min(key_hex.len())],
std::any::type_name::<F>()
.rsplit("::")
.next()
.unwrap_or("?"),
options.cache_mirror_sessions.len(),
);
let mut p = problem.with_cache_session(session);
if !options.cache_mirror_sessions.is_empty() {
p = p.with_cache_mirror_sessions(options.cache_mirror_sessions.clone());
}
p
} else {
problem
};
// Robustness is unconditional, so escalation is always armed: the inner-non-
// convergence branch inside `eval_outer` marks a trial rho *infeasible*
// (recoverable) rather than hard-erroring, letting the outer optimizer retreat
// and the run reach the terminal HMC sampling rung instead of dead-ending
// before it (the gap `verify` located at this site).
let eval_outer = |outer: &mut CustomOuterState,
rho: &Array1<f64>,
order: OuterEvalOrder|
-> Result<OuterEval, EstimationError> {
let warm_ref = screened_outer_warm_start(outer.warm_cache.as_ref(), rho);
let request_hessian =
matches!(order, OuterEvalOrder::ValueGradientHessian) && need_outer_hessian;
let eval_result = match outerobjectivegradienthessian_labeled(
family,
specs,
&outer_options,
&label_layout,
rho,
warm_ref,
&rho_prior,
if request_hessian {
EvalMode::ValueGradientHessian
} else {
EvalMode::ValueAndGradient
},
) {
Ok(eval) if !eval.inner_converged => {
outer.warm_cache = Some(eval.warm_start.clone());
outer.last_error = Some("custom-family inner solve did not converge".to_string());
// Recoverable: this trial rho is infeasible (inner solve did not
// converge), so the outer optimizer retreats rather than the whole
// run hard-erroring. When the search ultimately reports
// `converged == false`, the post-run rung samples the proper
// posterior (never-fail).
return Ok(OuterEval::infeasible(rho.len()));
}
Ok(eval)
if eval.objective.is_finite()
&& eval.gradient.iter().all(|v| v.is_finite())
&& match &eval.outer_hessian {
crate::solver::outer_strategy::HessianResult::Analytic(hessian) => {
hessian.iter().all(|v| v.is_finite())
}
crate::solver::outer_strategy::HessianResult::Operator(op) => {
!request_hessian || op.dim() == rho.len()
}
crate::solver::outer_strategy::HessianResult::Unavailable => {
!request_hessian
}
} =>
{
let warm_start = eval.warm_start.clone();
let gradient_norm = eval
.gradient
.iter()
.map(|value| value * value)
.sum::<f64>()
.sqrt();
update_custom_outer_inner_cap_from_warm_start(
&outer_options,
&warm_start,
Some(gradient_norm),
&mut outer.initial_gradient_norm,
);
outer.warm_cache = Some(warm_start.clone());
store_persistent_custom_family_warm_start(
persistent_warm_start_key.as_deref(),
specs,
&warm_start,
);
outer.last_error = None;
eval
}
Ok(_) => {
outer.last_error =
Some("custom-family outer objective/derivatives became non-finite".to_string());
// Recoverable (data-driven): the objective/derivatives became
// non-finite at this trial rho (e.g. separation / near-singular
// information), so the outer optimizer retreats from this infeasible
// point rather than the whole run hard-erroring. When the search
// ultimately reports `converged == false`, the post-run rung samples
// the proper posterior (never-fail).
return Ok(OuterEval::infeasible(rho.len()));
}
Err(e) => {
// Genuine eval-error (internal computation failure: linalg error,
// etc.) — NOT data-driven. Leave as a hard Err even when escalation
// is armed: a real bug must surface, not be silently sampled over.
// Only the "did not converge" / "non-finite objective" data-driven
// paths above convert to infeasible-when-armed.
outer.last_error = Some(e.clone());
return Err(EstimationError::RemlOptimizationFailed(e));
}
};
let inner_beta_hint = Some(Array1::from_iter(
eval_result
.warm_start
.block_beta
.iter()
.flat_map(|beta| beta.iter().copied()),
));
Ok(OuterEval {
cost: eval_result.objective,
gradient: eval_result.gradient,
hessian: eval_result.outer_hessian,
inner_beta_hint,
})
};
let mut obj = problem.build_objective_with_screening_proxy(
CustomOuterState::new(persistent_warm_start.clone()),
|outer: &mut CustomOuterState, rho: &Array1<f64>| {
// Always use warm cache when available — the previous inner solution
// gives a much better starting point. This was previously disabled for
// exact-Hessian families, forcing every inner solve to start from
// scratch (5-10 Newton steps instead of 1-2 with warm start).
let warm_ref = screened_outer_warm_start(outer.warm_cache.as_ref(), rho);
match outerobjectivegradienthessian_labeled(
family,
specs,
&outer_options,
&label_layout,
rho,
warm_ref,
&rho_prior,
EvalMode::ValueOnly,
) {
Ok(eval) if eval.inner_converged && eval.objective.is_finite() => {
outer.warm_cache = Some(eval.warm_start);
outer.last_error = None;
Ok(eval.objective)
}
Ok(eval) => {
outer.warm_cache = Some(eval.warm_start);
outer.last_error = Some(
"custom-family value-only inner solve did not converge or objective was non-finite"
.to_string(),
);
// Recoverable (data-driven): this value-only probe is the
// line-search cost the outer optimizer calls most often. A
// non-converged inner solve / non-finite objective at this trial
// rho means the point is infeasible — return an infinite cost so
// the line search retreats, rather than hard-erroring out of
// `problem.run` and bypassing the post-run escalation (sampling)
// rung. When the search reports `converged == false` the never-fail
// rung samples the proper posterior.
Ok(f64::INFINITY)
}
Err(e) => {
// Genuine eval-error (internal computation failure) — NOT
// data-driven. Leave as a hard Err even when escalation is armed
// so a real bug surfaces instead of being silently sampled over.
outer.last_error = Some(e.clone());
Err(EstimationError::RemlOptimizationFailed(e))
}
}
},
|outer: &mut CustomOuterState, rho: &Array1<f64>| {
eval_outer(
outer,
rho,
if need_outer_hessian {
OuterEvalOrder::ValueGradientHessian
} else {
OuterEvalOrder::ValueAndGradient
},
)
},
|outer: &mut CustomOuterState, rho: &Array1<f64>, order: OuterEvalOrder| {
eval_outer(outer, rho, order)
},
Some(|outer: &mut CustomOuterState| {
outer.reset();
}),
Some(|outer: &mut CustomOuterState, rho: &Array1<f64>| {
if label_layout.has_tied_coordinates() {
return Err(EstimationError::RemlOptimizationFailed(
"custom-family EFS is not available for tied coefficient-group precision labels"
.to_string(),
));
}
let warm_ref = screened_outer_warm_start(outer.warm_cache.as_ref(), rho);
match outerobjectiveefs(
family,
specs,
&outer_options,
&label_layout.penalty_counts,
rho,
warm_ref,
rho_prior.clone(),
) {
Ok((eval, warm, true)) => {
outer.warm_cache = Some(warm);
outer.last_error = None;
Ok(eval)
}
Ok((_eval, warm, false)) => {
outer.warm_cache = Some(warm);
outer.last_error =
Some("custom-family EFS inner solve did not converge".to_string());
// Intentionally LEFT as a hard Err even when escalation is armed.
// Unlike the BFGS/value-only paths above, an EFS error does NOT
// dead-end the run: it surfaces as a recoverable objective-eval
// error at the fixed-point bridge (outer_strategy.rs:2409-2410
// `into_objective_error` -> `ObjectiveEvalError::recoverable`),
// so the EFS seed is rejected / the FixedPoint run returns Err,
// and `run_outer`'s fallback cascade (outer_strategy.rs:5297) routes
// to the fixed-point-disabled analytic-gradient BFGS attempt. That
// attempt is always present here because custom-family declares an
// analytic outer gradient (custom_family.rs:11826), so
// `automatic_fallback_attempts` (outer_strategy.rs:1502) adds it.
// BFGS then evaluates via `eval_outer` / the value-only cost
// closure, both of which now retreat-when-armed, so the run reaches
// `Ok(converged == false)` and the post-run sampling rung. No
// analogous infeasible sentinel is needed at this site.
Err(EstimationError::RemlOptimizationFailed(
"custom-family EFS inner solve did not converge".to_string(),
))
}
Err(e) => {
// Genuine eval-error (internal computation failure) — NOT
// data-driven. Hard Err so a real bug surfaces.
outer.last_error = Some(e.clone());
Err(EstimationError::RemlOptimizationFailed(e))
}
}
}),
|outer: &mut CustomOuterState, rho: &Array1<f64>| {
let warm_ref = screened_outer_warm_start(outer.warm_cache.as_ref(), rho);
match custom_family_seed_screening_proxy_labeled(
family,
specs,
&outer_options,
&label_layout,
rho,
warm_ref,
&rho_prior,
) {
Ok((score, warm_start, _inner_converged)) if score.is_finite() => {
outer.warm_cache = Some(warm_start);
outer.last_error = None;
Ok(score)
}
Ok((score, warm_start, _inner_converged)) => {
outer.warm_cache = Some(warm_start);
outer.last_error = Some(format!(
"custom-family seed-screening proxy produced non-finite score {score}"
));
Err(EstimationError::RemlOptimizationFailed(
"custom-family seed-screening proxy produced non-finite score".to_string(),
))
}
Err(e) => {
outer.last_error = Some(e.clone());
Err(EstimationError::RemlOptimizationFailed(e))
}
}
},
)
.with_seed_inner_state(|outer: &mut CustomOuterState, beta: &Array1<f64>| {
outer.seed_cached_beta(n_rho, specs, beta)
});
let outer_result = problem.run(&mut obj, "custom family");
let last_error_detail = obj
.state
.last_error
.as_ref()
.map(|e| {
format!(
" last objective error: {}",
normalize_outer_eval_error_detail(e)
)
})
.unwrap_or_default();
// Startup-validation escalation net (gam#860). When the outer optimizer
// returns `Err` because no candidate seed passed startup validation, the
// raise is a POST-AUDIT NUMERICAL pathology, not an ill-posed input: by the
// time we reach the outer solve the structural audits have already passed
// (the #531-class identifiability audit, the #789B zero-events guard, and
// the #859 cross-fit alignment all raise BEFORE the solver). So an
// all-seeds-rejected / non-finite-cost failure HERE is a solver numerical
// defect (e.g. the #787 kappa-driven penalty-topology dim-mismatch) on a
// structurally-well-posed design — exactly the regime the never-fail
// posterior-sampling rung exists for. Route it into the SAME AUTO-ESCALATE
// the non-convergence path below uses, seeding the sampler at the initial ρ
// (`rho0`, the bootstrap seed), instead of hard-raising. The carve-out is
// strict: this only catches the post-audit startup-validation failure, never
// the structural guards above (they keep raising with their own messages),
// and the degraded refit below STILL raises if even `rho0` produces a
// non-finite mode (sampling about NaN would manufacture meaningless
// infinite-width intervals that masquerade as a fit — see the finite-mode
// check after the refit). The result carries the existing escalation's
// degraded / sampled-not-certified flagging so confidence is honest.
let (rho_star, outer_grad_norm, outer_iters, nonconvergence_escalation, outer_certificate) =
match outer_result {
Ok(outer_result) => {
// Geometry-driven terminal escalation. When the outer smoothing
// optimizer cannot certify convergence, the objective is always
// *proper* (Jeffreys/PC term unconditionally armed), so a
// non-convergence here is a geometry signal (indefinite / non-smooth
// LAML landscape that stalled Strong-Wolfe) — not a reason to fail.
// Instead we AUTO-ESCALATE to sampling the proper posterior about the
// best mode the inner solve reached (the never-fail bottom rung; see
// `hmc::sample_gaussian_mode_posterior`). The fast Arc/EFS path is
// untouched: this branch is only reached after the optimizer reports
// non-convergence, so nice landscapes never pay any sampling cost.
let nonconvergence_escalation = !outer_result.converged;
if nonconvergence_escalation {
log::info!(
"[robust] outer smoothing did not certify convergence (plan={} iters={} |g|={}); \
AUTO-ESCALATE to never-fail posterior sampling about the best mode",
outer_result.plan_used,
outer_result.iterations,
outer_result.final_grad_norm_report(),
);
}
(
outer_result.rho,
outer_result.final_grad_norm,
outer_result.iterations,
nonconvergence_escalation,
outer_result.criterion_certificate,
)
}
Err(e) if outer_startup_failure_is_escalatable(&e) => {
log::warn!(
"[robust] outer smoothing raised at startup validation on a structurally-audited \
design (post-audit numerical pathology, gam#860): {e}.{last_error_detail} \
AUTO-ESCALATE to never-fail posterior sampling about the initial ρ seed; the \
degraded refit below still raises if even the seed produces a non-finite mode.",
);
(rho0.clone(), None, 0, true, None)
}
Err(e) => {
return Err(format!(
"outer smoothing optimization failed after exhausting strategy fallbacks: {e}.{last_error_detail}"
)
.into());
}
};
screening_cap.store(0, Ordering::Relaxed);
let per_block = split_labeled_log_lambdas(&rho_star, &label_layout)?;
let final_seed = obj.state.warm_cache.clone();
let mut final_options = options.clone();
final_options.outer_inner_max_iterations = None;
let mut inner = inner_blockwise_fit(
family,
specs,
&per_block,
&final_options,
final_seed.as_ref(),
)
.map_err(|e| {
format!(
"outer smoothing optimization failed during final inner refit: \
{e}.{last_error_detail}"
)
})?;
if !inner.converged && !nonconvergence_escalation {
return Err(CustomFamilyError::Optimization {
context: "fit_custom_family final inner refit",
reason: format!(
"outer smoothing optimization final inner refit did not converge after {} cycles.{}",
inner.cycles, last_error_detail
),
});
}
if !inner.converged && nonconvergence_escalation {
// The mode the inner solve reached is still the seed for the proper
// posterior; a marginal inner non-convergence only widens the sampled
// intervals (honest, not wrong). Proceed to assemble + sample.
log::info!(
"[robust] final inner refit did not fully converge ({} cycles) under escalation; \
sampling the proper posterior about the reached mode",
inner.cycles,
);
}
// Finite-mode carve-out for the escalation net (gam#860). The never-fail
// rung samples a Gaussian posterior ABOUT the reached mode; that is honest
// only when the mode is finite (a non-converged-but-finite mode just widens
// the sampled intervals). If the refit produced a NON-FINITE β — e.g. the
// degraded startup-validation fallback (`rho0`) still lands on garbage —
// sampling about NaN would manufacture meaningless infinite-width intervals
// that masquerade as a fit, so KEEP the hard raise with a clear message
// rather than escalate. (On the certified path β is finite by construction,
// so this guard only ever fires on a genuinely broken escalation seed.)
if nonconvergence_escalation
&& inner
.block_states
.iter()
.any(|state| state.beta.iter().any(|value| !value.is_finite()))
{
return Err(CustomFamilyError::Optimization {
context: "fit_custom_family escalation finite-mode check",
reason: format!(
"outer smoothing escalation cannot sample a posterior: the refit mode is \
non-finite (β contains NaN/inf), so there is no valid mode to sample about; \
this is an ill-posed problem, not a recoverable numerical non-convergence.{}",
last_error_detail
),
});
}
let final_warm_start = constrained_warm_start_from_inner(&rho_star, &inner);
store_persistent_custom_family_warm_start(
persistent_warm_start_key.as_deref(),
specs,
&final_warm_start,
);
refresh_all_block_etas(family, specs, &mut inner.block_states).map_err(|e| {
format!(
"outer smoothing optimization failed during final eta refresh: \
{e}.{last_error_detail}"
)
})?;
let mut covariance_conditional =
compute_joint_covariance_required(family, specs, &inner.block_states, &per_block, options)?;
let geometry = compute_joint_geometry(family, specs, &inner.block_states, &per_block).map_err(
|reason| CustomFamilyError::Optimization {
context: "fit_custom_family joint geometry",
reason,
},
)?;
let penalized_objective = inner_penalized_objective(
&inner,
include_exact_newton_logdet_h(family, options),
include_exact_newton_logdet_s(family, options),
"custom-family fit final outer refit",
)
.map_err(|reason| CustomFamilyError::Optimization {
context: "fit_custom_family penalized objective",
reason,
})?;
// Never-fail terminal rung. Under escalation, sample the proper posterior
// `N(β̂, H⁻¹)` whose precision `H` is the SAME penalized (Jeffreys-augmented)
// joint Hessian the inner solve produced at the reached mode `β̂`, and report
// its honest covariance in place of the optimizer-conditional one. Both `H`
// and `β̂` are in the reduced (canonical) coordinate space here; the joint
// lift below (`lift_fit_geometry_to_raw`) carries the sampled covariance back
// to raw space exactly like the conditional covariance it replaces.
//
// Sampling a multivariate normal cannot dead-end: `sample_gaussian_mode_posterior`
// jitters and Cholesky-factors `H`, so a marginally indefinite boundary
// Hessian only widens the intervals. If that structural factorization is
// genuinely impossible (e.g. a non-PSD precision after symmetrization) the
// sampler returns `Err`; rather than re-introducing the dead-end we then keep
// the optimizer-conditional covariance (a finite point with its existing SEs)
// and still return a fit — never an `Err` for non-convergence.
if nonconvergence_escalation {
if let Some(geom) = geometry.as_ref() {
let joint_mode: Array1<f64> = {
let mut mode = Vec::new();
for state in &inner.block_states {
mode.extend(state.beta.iter().copied());
}
Array1::from(mode)
};
let precision = geom.penalized_hessian.as_array();
if joint_mode.len() == precision.nrows()
&& precision.nrows() == precision.ncols()
&& joint_mode.iter().all(|v| v.is_finite())
{
let sampling_config =
crate::inference::hmc::NutsConfig::for_dimension(joint_mode.len());
match crate::inference::hmc::sample_gaussian_mode_posterior(
joint_mode.view(),
precision.view(),
&sampling_config,
) {
Ok(posterior) => {
let dim = joint_mode.len();
let n = posterior.samples.nrows();
if n > 1 {
// Sample posterior covariance about the posterior mean
// (honest intervals; not the Laplace inverse-Hessian).
let mean = &posterior.posterior_mean;
let mut cov = Array2::<f64>::zeros((dim, dim));
for row in posterior.samples.rows() {
let centered = &row.to_owned() - mean;
for a in 0..dim {
for b in 0..dim {
cov[[a, b]] += centered[a] * centered[b];
}
}
}
cov.mapv_inplace(|v| v / (n as f64 - 1.0));
// DIAGNOSTIC GUARD (no false-confident intervals).
// The sampler NEVER fails, so without checking its
// mixing diagnostics a divergent (R̂ ≫ 1) / near-zero-
// ESS draw would be reported as an "honest" covariance.
// That is especially dangerous here: the seed `H` is
// the Jeffreys-AUGMENTED precision evaluated at β̂, which
// may be NON-converged on a flat (unidentified) joint
// direction — so a poorly-mixed chain can report a
// FINITE, NARROW interval around an arbitrary point on
// that flat direction (the prior's interval), masquer-
// ading as data-driven. We therefore only accept the
// sampled covariance as honest when the chain actually
// mixed; otherwise we INFLATE it to reflect the non-
// convergence and flag it low-confidence rather than
// silently reporting a Jeffreys-narrowed interval.
//
// R̂ ≤ 1.05 is the standard "mixed" gate (stricter than
// the 1.1 used for a coarse converged/not flag, because
// this covariance is reported as honest uncertainty).
// The ESS floor scales with dimension (≥ 10 effective
// draws per parameter, absolute floor 50) so a chain
// that produced essentially no independent information
// about the posterior is caught independent of model
// size.
const RHAT_MIXED_MAX: f64 = 1.05;
let ess_floor = (10.0 * dim as f64).max(50.0);
let rhat = posterior.rhat;
let ess = posterior.ess;
let diagnostics_ok = rhat.is_finite()
&& ess.is_finite()
&& rhat <= RHAT_MIXED_MAX
&& ess >= ess_floor;
if diagnostics_ok {
log::info!(
"[robust] never-fail posterior sampling mixed: dim={dim} \
draws={n} rhat={rhat:.3} ess={ess:.0}; reporting sampled \
covariance as honest intervals",
);
covariance_conditional = Some(cov);
} else {
// Non-converged: do NOT report the narrow sampled
// covariance as data-driven. Inflate it so the
// reported uncertainty reflects the failure to
// resolve the posterior — widen by the R̂ excess (a
// divergent chain widens hard) and an ESS-deficit
// factor (too few independent draws ⇒ the sample
// covariance is itself unreliable / too narrow). The
// result is a clearly-flagged LOW-CONFIDENCE summary,
// never an artificially tight interval, and we still
// return a fit (the never-fail guarantee stands).
let rhat_factor = if rhat.is_finite() {
rhat.max(1.0)
} else {
// R̂ unestimable (too few chains/samples) ⇒
// treat as maximally unresolved.
RHAT_MIXED_MAX
};
let ess_factor = if ess.is_finite() && ess > 0.0 {
(ess_floor / ess).sqrt().max(1.0)
} else {
ess_floor.sqrt()
};
let inflation = (rhat_factor * rhat_factor) * ess_factor;
cov.mapv_inplace(|v| v * inflation);
log::warn!(
"[robust] never-fail posterior sampling DID NOT MIX: dim={dim} \
draws={n} rhat={rhat:.3} (>{RHAT_MIXED_MAX}) ess={ess:.0} \
(<{ess_floor:.0}); reporting LOW-CONFIDENCE inflated covariance \
(x{inflation:.2}) instead of a possibly false-confident \
Jeffreys-narrowed interval (intervals are prior-dominated on \
any unidentified joint direction, NOT data-driven)",
);
covariance_conditional = Some(cov);
}
}
}
Err(reason) => {
log::warn!(
"[robust] never-fail posterior sampling could not factor the precision \
({reason}); retaining optimizer-conditional covariance (still no dead-end)",
);
}
}
}
}
}
let rho_star_physical = expand_labeled_log_lambdas(&rho_star, &label_layout)?;
let outer_converged = !nonconvergence_escalation;
assemble_custom_family_fit_result(
inner,
BlockwiseFitAssembly {
rho_physical: rho_star_physical,
covariance_conditional,
geometry,
canonical: Some(&canonical),
result_specs: raw_specs,
penalized_objective,
outer_iterations: outer_iters,
outer_gradient_norm: outer_grad_norm,
criterion_certificate: outer_certificate,
outer_converged,
context: "fit_custom_family result assembly",
},
)
}
/// Fits a custom family with the smoothing parameters held at the values
/// already recorded on `specs`; no outer optimization is run here.
///
/// The log-lambdas are flattened out of `specs`, split back per block, and
/// handed to a single blockwise inner solve, passing the `inner` part of
/// `warm_start` when one is supplied. `outer_iterations`,
/// `outer_gradient_norm`, and `outer_converged` are not computed by this
/// function; they are forwarded unchanged into the assembled result.
///
/// # Errors
///
/// Returns an error when the block specs fail validation, when the inner
/// solve fails or reports non-convergence, or when the eta refresh, joint
/// covariance, joint geometry, penalized objective, or result assembly fails.
pub(crate) fn fit_custom_family_fixed_log_lambdas<
    F: CustomFamily + Clone + Send + Sync + 'static,
>(
    family: &F,
    specs: &[ParameterBlockSpec],
    options: &BlockwiseFitOptions,
    warm_start: Option<&CustomFamilyWarmStart>,
    outer_iterations: usize,
    outer_gradient_norm: Option<f64>,
    outer_converged: bool,
) -> Result<crate::solver::estimate::UnifiedFitResult, CustomFamilyError> {
    // Recover the fixed smoothing parameters from the specs themselves.
    let counts = validate_blockspecs(specs)?;
    let fixed_rho = flatten_log_lambdas(specs);
    let block_log_lambdas = split_log_lambdas(&fixed_rho, &counts)?;

    let mut fit = inner_blockwise_fit(
        family,
        specs,
        &block_log_lambdas,
        options,
        warm_start.map(|warm| &warm.inner),
    )?;

    // A non-converged inner solve is a hard error on this fixed-lambda path.
    if !fit.converged {
        let reason = format!(
            "fixed-log-lambda inner solve did not converge after {} cycles",
            fit.cycles
        );
        return Err(CustomFamilyError::Optimization {
            context: "fit_custom_family_fixed_log_lambdas inner solve",
            reason,
        });
    }

    refresh_all_block_etas(family, specs, &mut fit.block_states)?;

    let covariance_conditional = compute_joint_covariance_required(
        family,
        specs,
        &fit.block_states,
        &block_log_lambdas,
        options,
    )?;

    let geometry =
        match compute_joint_geometry(family, specs, &fit.block_states, &block_log_lambdas) {
            Ok(geometry) => geometry,
            Err(reason) => {
                return Err(CustomFamilyError::Optimization {
                    context: "fit_custom_family_fixed_log_lambdas joint geometry",
                    reason,
                });
            }
        };

    let penalized_objective = match inner_penalized_objective(
        &fit,
        include_exact_newton_logdet_h(family, options),
        include_exact_newton_logdet_s(family, options),
        "custom-family fixed-log-lambda fit",
    ) {
        Ok(value) => value,
        Err(reason) => {
            return Err(CustomFamilyError::Optimization {
                context: "fit_custom_family_fixed_log_lambdas penalized objective",
                reason,
            });
        }
    };

    // No canonical reduction is attached on this path, and the result is
    // assembled against the caller's specs directly.
    let assembly = BlockwiseFitAssembly {
        rho_physical: fixed_rho,
        covariance_conditional,
        geometry,
        canonical: None,
        result_specs: specs,
        penalized_objective,
        outer_iterations,
        outer_gradient_norm,
        criterion_certificate: None,
        outer_converged,
        context: "fit_custom_family_fixed_log_lambdas result assembly",
    };
    assemble_custom_family_fit_result(fit, assembly)
}
/// Runs one blockwise inner solve at the log-lambdas stored on `specs`, with
/// no inner warm start, and returns the per-block coefficients it reached.
///
/// The return value is `(block_beta, converged, cycles)`. Inner
/// non-convergence is reported through the `converged` flag, not as an error.
///
/// # Errors
///
/// Returns an error when the identifiability audit cannot run or reports a
/// fatal finding, when the block specs fail validation, when the inner solve
/// fails, or when any returned coefficient is non-finite.
pub(crate) fn fit_custom_family_fixed_log_lambda_warm_start<
    F: CustomFamily + Clone + Send + Sync + 'static,
>(
    family: &F,
    specs: &[ParameterBlockSpec],
    options: &BlockwiseFitOptions,
) -> Result<(Vec<Array1<f64>>, bool, usize), CustomFamilyError> {
    // Pre-fit identifiability gate, mirroring the one on the outer fit:
    // warm-start callers (e.g. the survival marginal-slope rigid pilot at
    // survival_marginal_slope.rs ~18078) should fail in milliseconds on a
    // rank-deficient joint design rather than spend minutes inside a singular
    // penalised Newton inner system.
    //
    // `canonicalize_for_identifiability` is deliberately NOT called here.
    // Blockwise families capture their per-block designs at construction
    // time (e.g. SurvivalMarginalSlopeFamily holds `self.marginal_design` and
    // `self.logslope_design` at raw width), and their `evaluate*` paths
    // assert on those raw widths while assembling per-row Hessian
    // contributions. Handing such a family a column-reduced spec would cause
    // a runtime shape mismatch in its syr_row_into / row_outer_into calls,
    // so the audit's diagnostic would be masked by a panic later in the
    // pipeline.
    //
    // The principled construction-time orthogonalisation lives in
    // `crate::families::identifiability_compiler` (and the per-family
    // `*_identifiability.rs` modules). Once Phase 4b threads those compiled
    // operators through the family construction sites, the raw joint design
    // will already be rank-clean on entry and this gate becomes a defensive
    // check.
    let audit = match crate::solver::identifiability_audit::audit_identifiability(specs) {
        Ok(report) => report,
        Err(reason) => {
            return Err(CustomFamilyError::DimensionMismatch {
                reason: format!(
                    "fit_custom_family_fixed_log_lambda_warm_start identifiability audit failed: {reason}"
                ),
            });
        }
    };
    if audit.fatal {
        return Err(CustomFamilyError::Optimization {
            context: "fit_custom_family_fixed_log_lambda_warm_start identifiability audit",
            reason: format!("fatal pre-fit identifiability audit: {}", audit.summary),
        });
    }

    let counts = validate_blockspecs(specs)?;
    let fixed_rho = flatten_log_lambdas(specs);
    let block_log_lambdas = split_log_lambdas(&fixed_rho, &counts)?;
    let fit = inner_blockwise_fit(family, specs, &block_log_lambdas, options, None)?;

    let block_beta: Vec<Array1<f64>> = fit
        .block_states
        .iter()
        .map(|block| block.beta.clone())
        .collect();

    // Reject the result outright if any coefficient in any block is NaN/inf.
    let has_non_finite = block_beta
        .iter()
        .any(|beta| beta.iter().any(|value| !value.is_finite()));
    if has_non_finite {
        return Err(CustomFamilyError::Optimization {
            context: "fit_custom_family_fixed_log_lambda_warm_start",
            reason: "fixed-log-lambda warm start produced non-finite coefficients".to_string(),
        });
    }

    Ok((block_beta, fit.converged, fit.cycles))
}
#[cfg(test)]
mod test_support {
    use super::*;
    use ndarray::{Array1, Array2};

    /// Test-only wrapper around `outerobjectivegradienthessian_internal`.
    ///
    /// Evaluates with a flat rho prior and returns
    /// `(objective, gradient, dense outer Hessian, warm start)`, where the
    /// Hessian entry is whatever `materialize_dense` yields for the
    /// evaluation's outer Hessian.
    pub(crate) fn outerobjectivegradienthessian<F: CustomFamily + Clone + Send + Sync + 'static>(
        family: &F,
        specs: &[ParameterBlockSpec],
        options: &BlockwiseFitOptions,
        penalty_counts: &[usize],
        rho: &Array1<f64>,
        warm_start: Option<&ConstrainedWarmStart>,
        eval_mode: EvalMode,
    ) -> Result<(f64, Array1<f64>, Option<Array2<f64>>, ConstrainedWarmStart), String> {
        let flat_prior = crate::types::RhoPrior::Flat;
        let eval = super::outerobjectivegradienthessian_internal(
            family,
            specs,
            options,
            penalty_counts,
            rho,
            warm_start,
            flat_prior,
            eval_mode,
        )?;
        let dense_hessian = eval.outer_hessian.materialize_dense()?;
        Ok((eval.objective, eval.gradient, dense_hessian, eval.warm_start))
    }
}
#[cfg(test)]
mod tests {
use super::*;
/// Test fixture family for the batched outer-Hessian tests in this module.
///
/// Its `outer_hyper_hessian_operator` hook wraps a clone of `matrix` in a
/// `TestOuterHessianOperator`.
#[derive(Clone)]
struct BatchedOuterHessianTestFamily {
// Matrix handed to the operator this family exposes.
matrix: Array2<f64>,
}
/// Outer-Hessian operator backed by an explicit dense matrix, for tests.
struct TestOuterHessianOperator {
    matrix: Array2<f64>,
}

impl crate::solver::outer_strategy::OuterHessianOperator for TestOuterHessianOperator {
    /// Operator dimension: the row count of the backing matrix.
    fn dim(&self) -> usize {
        let (rows, _cols) = self.matrix.dim();
        rows
    }

    /// Applies the operator as a plain matrix-vector product; never fails.
    fn matvec(&self, v: &Array1<f64>) -> Result<Array1<f64>, String> {
        let product = self.matrix.dot(v);
        Ok(product)
    }

    /// Always reports `true`.
    fn is_cheap_to_materialize(&self) -> bool {
        true
    }
}
impl CustomFamily for BatchedOuterHessianTestFamily {
    /// Returns a constant evaluation: zero log-likelihood and no working sets.
    fn evaluate(
        &self,
        block_states: &[ParameterBlockState],
    ) -> Result<FamilyEvaluation, String> {
        // NOTE(review): this length bound is the only use of `block_states`;
        // it presumably exists just to touch the parameter — confirm.
        assert!(block_states.len() <= isize::MAX as usize);
        let evaluation = FamilyEvaluation {
            log_likelihood: 0.0,
            blockworking_sets: Vec::new(),
        };
        Ok(evaluation)
    }

    /// Always advertises that a Hessian-vector-product path is available.
    fn outer_hyper_hessian_hvp_available(&self, block_specs: &[ParameterBlockSpec]) -> bool {
        // NOTE(review): same parameter-touching bound as in `evaluate`.
        assert!(block_specs.len() <= isize::MAX as usize);
        true
    }

    /// Exposes a clone of the fixture matrix as an outer-Hessian operator.
    fn outer_hyper_hessian_operator(
        &self,
        block_specs: &[ParameterBlockSpec],
    ) -> Option<Arc<dyn crate::solver::outer_strategy::OuterHessianOperator>> {
        // NOTE(review): same parameter-touching bound as in `evaluate`.
        assert!(block_specs.len() <= isize::MAX as usize);
        let operator: Arc<dyn crate::solver::outer_strategy::OuterHessianOperator> =
            Arc::new(TestOuterHessianOperator {
                matrix: self.matrix.clone(),
            });
        Some(operator)
    }
}
// A block whose stacked design/offset have more rows than its canonical
// design must still assemble: eta keeps the stacked length while the geometry
// keeps the canonical row count.
#[test]
fn blockwise_fit_from_parts_accepts_stacked_solver_eta_with_canonical_geometry_rows() {
    const CANONICAL_ROWS: usize = 2;
    const STACKED_ROWS: usize = 6;

    let spec = ParameterBlockSpec {
        name: "stacked".to_string(),
        design: DesignMatrix::from(Array2::ones((CANONICAL_ROWS, 1))),
        offset: Array1::zeros(CANONICAL_ROWS),
        penalties: Vec::new(),
        nullspace_dims: Vec::new(),
        initial_log_lambdas: Array1::zeros(0),
        initial_beta: None,
        gauge_priority: 100,
        jacobian_callback: None,
        stacked_design: Some(DesignMatrix::from(Array2::ones((STACKED_ROWS, 1)))),
        stacked_offset: Some(Array1::zeros(STACKED_ROWS)),
    };

    // Solver state carries eta at the stacked length.
    let solver_state = ParameterBlockState {
        beta: array![0.25],
        eta: Array1::zeros(STACKED_ROWS),
    };

    // Geometry rows follow the canonical design.
    let canonical_geometry = FitGeometry {
        penalized_hessian: Array2::eye(1).into(),
        working_weights: Array1::ones(CANONICAL_ROWS),
        working_response: Array1::zeros(CANONICAL_ROWS),
    };

    let parts = BlockwiseFitResultParts {
        block_states: vec![solver_state],
        log_likelihood: -1.0,
        log_lambdas: Array1::zeros(0),
        lambdas: Array1::zeros(0),
        covariance_conditional: Some(Array2::eye(1)),
        stable_penalty_term: 0.0,
        penalized_objective: 1.0,
        outer_iterations: 0,
        outer_gradient_norm: Some(0.0),
        criterion_certificate: None,
        inner_cycles: 0,
        outer_converged: true,
        geometry: Some(canonical_geometry),
        precomputed_edf: Some((1.0, Vec::new(), vec![1.0])),
    };

    let fit = blockwise_fit_from_parts(parts, &[spec])
        .expect("stacked solver eta should assemble against canonical geometry rows");

    assert_eq!(fit.block_states[0].eta.len(), STACKED_ROWS);
    assert_eq!(
        fit.geometry.as_ref().unwrap().working_weights.len(),
        CANONICAL_ROWS
    );
}
// The operator returned by the batched hook, applied to the identity, must
// reproduce the fixture matrix exactly.
#[test]
fn batched_outer_hessian_terms_materialize_to_exact_small_matrix() {
    let exact = array![[4.0, -1.0], [-1.0, 3.0]];
    let family = BatchedOuterHessianTestFamily {
        matrix: exact.clone(),
    };

    // rho.len() must equal sum(spec.penalties.len()); empty specs ⇒ empty rho.
    let empty_rho = Array1::<f64>::zeros(0);
    let terms = family
        .batched_outer_hessian_terms(&[], &[], &[], &empty_rho, None)
        .expect("batched Hessian hook succeeds")
        .expect("test family exposes batched HVP terms");

    let crate::solver::outer_strategy::HessianResult::Operator(operator) = terms.outer_hessian
    else {
        panic!("batched hook should expose an operator");
    };

    let identity = Array2::<f64>::eye(2);
    let dense = operator
        .mul_mat(identity.view())
        .expect("operator materializes on small exact case");
    assert_eq!(dense, exact);
}
// The batched operator path must be selected when a Hessian is requested and
// skipped for value-and-gradient evaluations.
#[test]
fn batched_outer_hessian_operator_selected_only_for_hessian_eval() {
    let family = BatchedOuterHessianTestFamily {
        matrix: array![[2.0, 0.5], [0.5, 5.0]],
    };
    let empty_rho = Array1::<f64>::zeros(0);

    let with_hessian = custom_family_batched_outer_hessian_operator(
        &family,
        &[],
        &[],
        &[],
        &empty_rho,
        None,
        EvalMode::ValueGradientHessian,
    )
    .expect("selection check succeeds");
    assert!(
        with_hessian.is_some(),
        "supported Hessian/HVP families should select the batched operator path"
    );

    let gradient_only = custom_family_batched_outer_hessian_operator(
        &family,
        &[],
        &[],
        &[],
        &empty_rho,
        None,
        EvalMode::ValueAndGradient,
    )
    .expect("non-Hessian selection check succeeds");
    assert!(
        gradient_only.is_none(),
        "batched Hessian terms must not run for gradient-only evaluations"
    );
}
// The batched-gradient override is allowed with no H_phi or an all-zero
// H_phi, and refused as soon as H_phi has any nonzero entry.
#[test]
fn batched_outer_gradient_override_rejected_when_jeffreys_curvature_is_active() {
    // No Jeffreys curvature at all.
    let allowed_without_hphi = batched_outer_gradient_contract_allows_override(None);
    assert!(
        allowed_without_hphi,
        "released objective without robust Jeffreys curvature may use a family-owned batched gradient"
    );

    // Curvature present but identically zero.
    let gated_hphi = Array2::<f64>::zeros((2, 2));
    let allowed_with_zero_hphi = batched_outer_gradient_contract_allows_override(Some(&gated_hphi));
    assert!(
        allowed_with_zero_hphi,
        "a gated zero Jeffreys curvature leaves the batched gradient contract unchanged"
    );

    // A single tiny nonzero entry is enough to make the curvature active.
    let mut live_hphi = Array2::<f64>::zeros((2, 2));
    live_hphi[[1, 1]] = 1.0e-6;
    let allowed_with_live_hphi = batched_outer_gradient_contract_allows_override(Some(&live_hphi));
    assert!(
        !allowed_with_live_hphi,
        "nonzero H_phi changes the logdet operator and needs the unified H_phi-aware gradient"
    );
}
use crate::families::gamlss::{BinomialLocationScaleFamily, BinomialLocationScaleWiggleFamily};
use crate::matrix::DesignMatrix;
use crate::test_support::binomial_location_scale_base_fixture;
use approx::assert_relative_eq;
use faer::sparse::{SparseColMat, Triplet};
use ndarray::{Array1, Array2, array};
/// Pins the structural probe behind the marker-free coupled-joint-Hessian
/// gate (#727, #729), which trusts a family that returns a genuinely coupled
/// joint Hessian — nonzero off-diagonal blocks — without a hand-set
/// `has_explicit_joint_hessian()`. This probe drives every `_with_specs`
/// dispatch, so three facts are fixed here: a block-diagonal matrix (the
/// trait default) is NOT coupling, a single nonzero off-block entry IS, and a
/// shape disagreement must never be claimed as coupling.
#[test]
fn joint_hessian_coupling_probe_detects_off_diagonal_blocks() {
    // Two blocks of width 2 each → a 4×4 joint Hessian. Only `beta.len()`
    // is read, so the `eta` lengths are immaterial.
    let states: Vec<ParameterBlockState> = (0..2)
        .map(|_| ParameterBlockState {
            beta: Array1::zeros(2),
            eta: Array1::zeros(3),
        })
        .collect();

    // Per-block curvature with zero off-blocks: the trait default shape,
    // which is not coupling.
    let uncoupled = array![
        [1.0_f64, 0.5, 0.0, 0.0],
        [0.5, 1.0, 0.0, 0.0],
        [0.0, 0.0, 2.0, 0.3],
        [0.0, 0.0, 0.3, 2.0],
    ];
    let uncoupled_flagged = joint_hessian_has_cross_block_coupling(&uncoupled, &states);
    assert!(
        !uncoupled_flagged,
        "block-diagonal joint Hessian must not be treated as coupled"
    );

    // One nonzero off-diagonal-block entry (and its transpose) is genuine
    // cross-block curvature the block-diagonal default can never produce,
    // so it must be trusted as coupled.
    let mut coupled = uncoupled.clone();
    for (row, col) in [(0, 2), (2, 0)] {
        coupled[[row, col]] = 1.0e-9;
    }
    let coupled_flagged = joint_hessian_has_cross_block_coupling(&coupled, &states);
    assert!(
        coupled_flagged,
        "a nonzero off-diagonal block must be detected as coupling"
    );

    // A matrix whose dimension disagrees with the total β width is
    // malformed; the probe must answer `false` rather than claim coupling
    // for a mis-shaped Hessian.
    let mis_shaped = Array2::<f64>::zeros((3, 3));
    let mis_shaped_flagged = joint_hessian_has_cross_block_coupling(&mis_shaped, &states);
    assert!(
        !mis_shaped_flagged,
        "shape disagreement must not be claimed as coupling"
    );
}
/// Solves one weighted, penalized block system for the test fixtures.
///
/// Checks that `y_star` and `w` both have `x.nrows()` entries, forms the
/// right-hand side with `x.compute_xtwy(w, y_star)`, and hands the solve to
/// `x.solve_systemwith_policy` together with the penalty `s_lambda` and the
/// ridge settings.
///
/// # Errors
/// Returns the `CustomFamilyError::DimensionMismatch` message when the lengths
/// disagree, propagates a `compute_xtwy` failure, and reports
/// `"block solve failed after ridge retries"` when the solve itself fails.
fn solve_blockweighted_system(
    x: &DesignMatrix,
    y_star: &Array1<f64>,
    w: &Array1<f64>,
    s_lambda: &Array2<f64>,
    ridge_floor: f64,
    ridge_policy: RidgePolicy,
) -> Result<Array1<f64>, String> {
    let row_count = x.nrows();
    let lengths_agree = y_star.len() == row_count && w.len() == row_count;
    if !lengths_agree {
        let mismatch = CustomFamilyError::DimensionMismatch {
            reason: "weighted-system dimension mismatch".to_string(),
        };
        return Err(mismatch.into());
    }

    let rhs = x.compute_xtwy(w, y_star)?;
    match x.solve_systemwith_policy(w, &rhs, Some(s_lambda), ridge_floor, ridge_policy) {
        Ok(solution) => Ok(solution),
        // The underlying cause is intentionally replaced by a fixed message.
        Err(_) => Err("block solve failed after ridge retries".to_string()),
    }
}
/// The default inner cycle budget must equal the shared constant and must
/// exceed the former cap of 300 cycles.
#[test]
fn default_inner_cycle_budget_covers_large_scale_joint_newton_tail() {
    let defaults = BlockwiseFitOptions::default();
    let cycle_budget = defaults.inner_max_cycles;
    assert_eq!(cycle_budget, DEFAULT_CUSTOM_FAMILY_INNER_MAX_CYCLES);
    assert!(
        cycle_budget > 300,
        "startup validation must not reject still-descending exact joint solves at the old cap"
    );
}
/// `outer_startup_failure_is_escalatable` must accept the two REML
/// optimization failures built here (all seeds rejected, non-finite startup
/// evaluation) and must refuse an `InvalidInput` error.
#[test]
fn startup_validation_failure_routes_to_never_fail_escalation() {
    use crate::estimate::EstimationError;

    // Shorthand for the REML failure variant used by the first two cases.
    let reml_failure =
        |message: &str| EstimationError::RemlOptimizationFailed(message.to_string());

    let seeds_exhausted = reml_failure(
        "no candidate seeds passed outer startup validation (custom family):\n generated=4",
    );
    assert!(
        outer_startup_failure_is_escalatable(&seeds_exhausted),
        "post-audit all-seeds startup rejection must reach the never-fail escalation net"
    );

    let non_finite_cost =
        reml_failure("outer eval failed: objective returned a non-finite cost");
    assert!(
        outer_startup_failure_is_escalatable(&non_finite_cost),
        "non-finite startup evals are the same post-audit numerical pathology"
    );

    let bad_input = EstimationError::InvalidInput(
        "zero-event survival marginal-slope input remains structurally invalid".to_string(),
    );
    assert!(
        !outer_startup_failure_is_escalatable(&bad_input),
        "structural input errors must not be converted into sampled fits"
    );
}
/// Checks `joint_penalty_subspace_trace_parts` two ways on a 3×3 fixture: the
/// returned log-determinant must equal the eigenvalue log-sum of `H + Sλ`,
/// and the kernel's `trace_projected_logdet(drift)` must match a central
/// finite difference of that log-determinant along `drift`.
#[test]
fn joint_penalty_subspace_trace_matches_projected_logdet_derivative() {
    let ranges = vec![(0, 3)];
    let penalties = vec![array![[1.0, 0.0, 0.0], [0.0, 2.0, 0.0], [0.0, 0.0, 0.0]]];
    let hessian = array![[4.0, 0.2, 7.0], [0.2, 9.0, -3.0], [7.0, -3.0, 30.0]];
    // `∂Sλ/∂ρ` lives on range(Sλ), here the leading 2×2 block (the
    // positive-eigenvalue subspace of `S`). Since #901 the kernel is the full
    // spectral `M⁺`, whose trace differentiates `log|H+Sλ|₊` exactly for
    // EVERY drift. A drift supported on range(Sλ) exercises the same contract
    // as the production `∂Sλ/∂ρ`, and it is also where the old range(Sλ)-block
    // kernel and `M⁺` agree, so this pin survives the kernel generalization.
    let drift = array![[0.7, -0.4, 0.0], [-0.4, 1.3, 0.0], [0.0, 0.0, 0.0]];

    let (logdet, kernel) = joint_penalty_subspace_trace_parts(
        &JointHessianSource::Dense(hessian.clone()),
        &ranges,
        &penalties,
        3,
        0.0,
        None,
    )
    .expect("projection parts build");
    let kernel = kernel.expect("rank-deficient penalty still has an identified subspace");
    // The kernel basis is the kept eigenvectors of M = H + Sλ (full rank 3
    // here), NOT the rank-2 range(Sλ) basis of the pre-#901 reduced kernel.
    assert_eq!(kernel.u_s.ncols(), 3);

    // The log-determinant is the FULL identifiable-subspace `log|H + Sλ|₊`.
    // H + Sλ has full rank 3, so this is the ordinary log-det of
    // M = [[5, 0.2, 7], [0.2, 11, -3], [7, -3, 30]], det(M) = 1056.4.
    let joint = array![[5.0, 0.2, 7.0], [0.2, 11.0, -3.0], [7.0, -3.0, 30.0]];
    let (joint_eigenvalues, _) = joint
        .eigh(faer::Side::Lower)
        .expect("M eigendecomposition");
    let expected_logdet: f64 = joint_eigenvalues.iter().map(|&v| v.ln()).sum();
    assert_relative_eq!(logdet, expected_logdet, epsilon = 1e-10);

    let analytic = kernel.trace_projected_logdet(&drift);

    // Central finite difference of the log-determinant along `drift`.
    let eps = 1.0e-6;
    let step = drift.mapv(|v| eps * v);
    let hessian_plus = &hessian + &step;
    let hessian_minus = &hessian - &step;
    let (logdet_plus, _) = joint_penalty_subspace_trace_parts(
        &JointHessianSource::Dense(hessian_plus),
        &ranges,
        &penalties,
        3,
        0.0,
        None,
    )
    .expect("plus projection parts build");
    let (logdet_minus, _) = joint_penalty_subspace_trace_parts(
        &JointHessianSource::Dense(hessian_minus),
        &ranges,
        &penalties,
        3,
        0.0,
        None,
    )
    .expect("minus projection parts build");
    let finite_difference = (logdet_plus - logdet_minus) / (2.0 * eps);

    assert_relative_eq!(
        analytic,
        finite_difference,
        epsilon = 1e-8,
        max_relative = 1e-8
    );
}
/// Pins the analytic outer gradient of `joint_outer_evaluate` on a fixture
/// whose penalty is rank-deficient while `H + Sλ` stays nonsingular.
///
/// Two things are asserted: the gradient of the projected evaluation equals
/// a closed form built from the projected trace kernel, and the projected and
/// unprojected evaluations return the same gradient on this fixture.
#[test]
fn joint_outer_gradient_uses_projected_trace_for_rank_deficient_penalty() {
// One coefficient block of width 3 with a single smoothing parameter at 0.
let ranges = vec![(0, 3)];
let rho = array![0.0];
let beta = array![1.0, -1.0, 3.0];
// diag(1, 2, 0): rank 2, so the third coordinate is unpenalized.
let s_lambda = array![[1.0, 0.0, 0.0], [0.0, 2.0, 0.0], [0.0, 0.0, 0.0]];
let h = array![[4.0, 0.2, 7.0], [0.2, 9.0, -3.0], [7.0, -3.0, 30.0]];
// The design is an all-zero 1×3 placeholder; the Hessian is supplied
// directly to the evaluator below.
let spec = ParameterBlockSpec {
name: "surface".to_string(),
design: DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(Array2::zeros((
1, 3,
)))),
offset: Array1::zeros(1),
penalties: vec![PenaltyMatrix::Dense(s_lambda.clone())],
nullspace_dims: vec![1],
initial_log_lambdas: rho.clone(),
initial_beta: Some(beta.clone()),
gauge_priority: 100,
jacobian_callback: None,
stacked_design: None,
stacked_offset: None,
};
let specs = vec![spec];
// Hand-built inner result: marked converged, zero log-likelihood, and a
// penalty value of ½ β'Sβ.
let inner = BlockwiseInnerResult {
block_states: vec![ParameterBlockState {
beta: beta.clone(),
eta: Array1::zeros(1),
}],
active_sets: vec![None],
log_likelihood: 0.0,
penalty_value: 0.5 * beta.dot(&fast_av(&s_lambda, &beta)),
cycles: 1,
converged: true,
block_logdet_h: 0.0,
block_logdet_s: 0.0,
s_lambdas: vec![s_lambda.clone()],
joint_workspace: None,
kkt_residual: None,
active_constraints: None,
};
let per_block = vec![rho.clone()];
let options = BlockwiseFitOptions {
use_remlobjective: true,
use_outer_hessian: false,
..BlockwiseFitOptions::default()
};
// Both Hessian-derivative callbacks report "no contribution".
let no_dh =
|_direction: &Array1<f64>| -> Result<Option<DriftDerivResult>, String> { Ok(None) };
let no_d2h = |_u: &Array1<f64>,
_v: &Array1<f64>|
-> Result<Option<DriftDerivResult>, String> { Ok(None) };
// First evaluation. The fourth boolean below (`true`) is the only argument
// that differs from the second evaluation — presumably the
// `project_hessian_logdet` flag; confirm against the evaluator signature.
let projected = joint_outer_evaluate(
&inner,
&specs,
&per_block,
&rho,
&beta,
JointHessianSource::Dense(h.clone()),
&ranges,
3,
0.0,
0.0,
0.0,
1.0,
0.0,
true,
true,
false,
true,
EvalMode::ValueAndGradient,
&options,
crate::types::RhoPrior::Flat,
PseudoLogdetMode::Smooth,
&no_dh,
None,
&no_d2h,
None,
None,
None,
None,
None,
None,
None,
None,
None,
None,
)
.expect("projected outer evaluation succeeds");
// Second evaluation: identical inputs with that boolean set to `false`.
let unprojected = joint_outer_evaluate(
&inner,
&specs,
&per_block,
&rho,
&beta,
JointHessianSource::Dense(h.clone()),
&ranges,
3,
0.0,
0.0,
0.0,
1.0,
0.0,
true,
true,
false,
false,
EvalMode::ValueAndGradient,
&options,
crate::types::RhoPrior::Flat,
PseudoLogdetMode::Smooth,
&no_dh,
None,
&no_d2h,
None,
None,
None,
None,
None,
None,
None,
None,
None,
None,
)
.expect("unprojected outer evaluation succeeds");
// Rebuild the trace kernel independently to form the reference gradient.
let (_, kernel) = joint_penalty_subspace_trace_parts(
&JointHessianSource::Dense(h.clone()),
&ranges,
std::slice::from_ref(&s_lambda),
3,
0.0,
None,
)
.expect("projection kernel builds");
let projected_trace = kernel
.expect("rank-deficient penalty has positive subspace")
.trace_projected_logdet(&s_lambda);
// Reference: ½ β'Sβ + ½ tr(kernel · S) − ½ · 2. The trailing 2.0 is
// presumably rank(S) for this diag(1, 2, 0) penalty — TODO confirm.
let expected_gradient =
0.5 * beta.dot(&fast_av(&s_lambda, &beta)) + 0.5 * projected_trace - 0.5 * 2.0;
assert_relative_eq!(
projected.gradient[0],
expected_gradient,
epsilon = 1e-12,
max_relative = 1e-12
);
// Post gh#752/#901 contract: the trace kernel is the FULL spectral
// pseudo-inverse `M⁺ = (H+Sλ)⁺` over range(H+Sλ). On a NONSINGULAR `M`
// (this fixture) that is exactly `M⁻¹`, so the projected route and the
// full-space operator route compute the same generalized determinant
// and the same ρ-trace — the projection must be INVARIANT here. (The
// historical assertion that they differ encoded the pre-#752 range(Sλ)
// reduction, which dropped the penalty-null likelihood curvature and
// was itself the bug. The case where the routes genuinely diverge — a
// singular `M` whose ker(H+Sλ) the pseudo-logdet must drop — is
// asserted in `joint_outer_gradient_projected_trace_drops_joint_null`.)
assert_relative_eq!(
projected.gradient[0],
unprojected.gradient[0],
epsilon = 1e-8,
max_relative = 1e-8
);
}
/// The discriminating case for `project_hessian_logdet`: a joint Hessian
/// whose ker(H) overlaps ker(Sλ), so `M = H + Sλ` is genuinely singular.
/// The projected route must drop the unidentified direction (pseudo-logdet
/// + `M⁺` trace kernel over range(M)) and produce the exact closed-form
/// gradient; a full-space `M⁻¹` route has no finite answer here. This is
/// the routing guard the nonsingular fixture above cannot provide (there
/// the two routes coincide by design).
///
/// Only the projected evaluation is run; the test checks that its objective
/// is finite and that its gradient equals the closed form built from an
/// independently constructed trace kernel.
#[test]
fn joint_outer_gradient_projected_trace_drops_joint_null() {
// One coefficient block of width 3 with a single smoothing parameter at 0.
let ranges = vec![(0, 3)];
let rho = array![0.0];
let beta = array![1.0, -1.0, 3.0];
let s_lambda = array![[1.0, 0.0, 0.0], [0.0, 2.0, 0.0], [0.0, 0.0, 0.0]];
// ker(h) = span(e3) = ker(s_lambda) ⇒ M = H + Sλ is singular with the
// unidentified direction e3.
let h = array![[4.0, 0.2, 0.0], [0.2, 9.0, 0.0], [0.0, 0.0, 0.0]];
// The design is an all-zero 1×3 placeholder; the Hessian is supplied
// directly to the evaluator below.
let spec = ParameterBlockSpec {
name: "surface".to_string(),
design: DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(Array2::zeros((
1, 3,
)))),
offset: Array1::zeros(1),
penalties: vec![PenaltyMatrix::Dense(s_lambda.clone())],
nullspace_dims: vec![1],
initial_log_lambdas: rho.clone(),
initial_beta: Some(beta.clone()),
gauge_priority: 100,
jacobian_callback: None,
stacked_design: None,
stacked_offset: None,
};
let specs = vec![spec];
// Hand-built inner result: marked converged, zero log-likelihood, and a
// penalty value of ½ β'Sβ.
let inner = BlockwiseInnerResult {
block_states: vec![ParameterBlockState {
beta: beta.clone(),
eta: Array1::zeros(1),
}],
active_sets: vec![None],
log_likelihood: 0.0,
penalty_value: 0.5 * beta.dot(&fast_av(&s_lambda, &beta)),
cycles: 1,
converged: true,
block_logdet_h: 0.0,
block_logdet_s: 0.0,
s_lambdas: vec![s_lambda.clone()],
joint_workspace: None,
kkt_residual: None,
active_constraints: None,
};
let per_block = vec![rho.clone()];
let options = BlockwiseFitOptions {
use_remlobjective: true,
use_outer_hessian: false,
..BlockwiseFitOptions::default()
};
// Both Hessian-derivative callbacks report "no contribution".
let no_dh =
|_direction: &Array1<f64>| -> Result<Option<DriftDerivResult>, String> { Ok(None) };
let no_d2h = |_u: &Array1<f64>,
_v: &Array1<f64>|
-> Result<Option<DriftDerivResult>, String> { Ok(None) };
// The fourth boolean below (`true`) is presumably the
// `project_hessian_logdet` flag named in the doc comment — confirm against
// the evaluator signature.
let projected = joint_outer_evaluate(
&inner,
&specs,
&per_block,
&rho,
&beta,
JointHessianSource::Dense(h.clone()),
&ranges,
3,
0.0,
0.0,
0.0,
1.0,
0.0,
true,
true,
false,
true,
EvalMode::ValueAndGradient,
&options,
crate::types::RhoPrior::Flat,
PseudoLogdetMode::Smooth,
&no_dh,
None,
&no_d2h,
None,
None,
None,
None,
None,
None,
None,
None,
None,
None,
)
.expect("projected outer evaluation succeeds on a singular joint Hessian");
// Rebuild the trace kernel independently to form the reference gradient.
let (_, kernel) = joint_penalty_subspace_trace_parts(
&JointHessianSource::Dense(h.clone()),
&ranges,
std::slice::from_ref(&s_lambda),
3,
0.0,
None,
)
.expect("projection kernel builds");
let projected_trace = kernel
.expect("rank-deficient joint Hessian has a positive subspace")
.trace_projected_logdet(&s_lambda);
// Reference: ½ β'Sβ + ½ tr(kernel · S) − ½ · 2. The trailing 2.0 is
// presumably rank(S) for this diag(1, 2, 0) penalty — TODO confirm.
let expected_gradient =
0.5 * beta.dot(&fast_av(&s_lambda, &beta)) + 0.5 * projected_trace - 0.5 * 2.0;
assert!(
projected.objective.is_finite(),
"pseudo-logdet objective must stay finite when ker(H+Sλ) is dropped"
);
assert_relative_eq!(
projected.gradient[0],
expected_gradient,
epsilon = 1e-10,
max_relative = 1e-10
);
}
// Experimental scan documenting that on THIS fixture's geometry the
// joint_outer_evaluate path does not show divergence between
// project_hessian_logdet=true and =false at large-scale ρ: the dominant
// term ½ λ β'Sβ grows linearly in λ regardless of projection, and the trace
// pair cancels in both routes here. The clustered-PC marginal-slope failure
// (#808/#787) is a DIFFERENT geometry — a near-collinear penalty-null trend
// whose likelihood determinant the range(Sλ)-only route drops. That route is
// now disabled for all marginal-slope families: the project_hessian_logdet
// flag at every joint_outer_evaluate/_efs call site reads
// `use_projected_penalty_logdet()` (default true), so value and analytic
// gradient share the range(H+Sλ) generalized determinant.
//
// Mechanics: for each ρ in {0, 2, 4, 6, 8, 10} the test evaluates the outer
// gradient twice (flag true, flag false), prints both, and finally asserts
// that the two magnitudes recorded at ρ = 10 agree to a relative 1e-4.
#[test]
fn large_scale_rho_scan_joint_outer_evaluate_is_projection_invariant() {
// Same fixture shape as the rank-deficient projected-trace test,
// but with H_unpen scaled to data-Hessian magnitude (n ~ 2e5).
let ranges = vec![(0, 3)];
let beta = array![1.0, -1.0, 3.0];
let s_unit: Array2<f64> = array![[1.0, 0.0, 0.0], [0.0, 2.0, 0.0], [0.0, 0.0, 0.0]];
let n_scale = 2.0e5_f64;
let h: Array2<f64> =
array![[4.0, 0.2, 7.0], [0.2, 9.0, -3.0], [7.0, -3.0, 30.0]].mapv(|v| v * n_scale);
// Both Hessian-derivative callbacks report "no contribution".
let no_dh = |_d: &Array1<f64>| -> Result<Option<DriftDerivResult>, String> { Ok(None) };
let no_d2h = |_u: &Array1<f64>,
_v: &Array1<f64>|
-> Result<Option<DriftDerivResult>, String> { Ok(None) };
eprintln!("\n=== large-scale rho-scan: unprojected vs projected outer gradient ===");
eprintln!(
"{:>5} {:>10} {:>16} {:>16} {:>10}",
"rho", "lambda", "g_unprojected", "g_projected", "ratio"
);
// Gradient magnitudes captured at the last scan point (ρ = 10).
let mut g_un_at_10 = 0.0_f64;
let mut g_pr_at_10 = 0.0_f64;
for &rho_val in &[0.0_f64, 2.0, 4.0, 6.0, 8.0, 10.0] {
// λ = exp(ρ); the spec carries the unit penalty while the inner result
// carries the λ-scaled one.
let lam = rho_val.exp();
let rho = array![rho_val];
let s_lambda = s_unit.mapv(|v| v * lam);
let spec = ParameterBlockSpec {
name: "surface".to_string(),
design: DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(Array2::zeros(
(1, 3),
))),
offset: Array1::zeros(1),
penalties: vec![PenaltyMatrix::Dense(s_unit.clone())],
nullspace_dims: vec![1],
initial_log_lambdas: rho.clone(),
initial_beta: Some(beta.clone()),
gauge_priority: 100,
jacobian_callback: None,
stacked_design: None,
stacked_offset: None,
};
let specs = vec![spec];
// Hand-built inner result with penalty value ½ λ β'Sβ.
let inner = BlockwiseInnerResult {
block_states: vec![ParameterBlockState {
beta: beta.clone(),
eta: Array1::zeros(1),
}],
active_sets: vec![None],
log_likelihood: 0.0,
penalty_value: 0.5 * lam * beta.dot(&fast_av(&s_unit, &beta)),
cycles: 1,
converged: true,
block_logdet_h: 0.0,
block_logdet_s: 0.0,
s_lambdas: vec![s_lambda.clone()],
joint_workspace: None,
kkt_residual: None,
active_constraints: None,
};
let per_block = vec![rho.clone()];
let options = BlockwiseFitOptions {
use_remlobjective: true,
use_outer_hessian: false,
..BlockwiseFitOptions::default()
};
// project_hessian_logdet = true (current main behavior)
let projected = joint_outer_evaluate(
&inner,
&specs,
&per_block,
&rho,
&beta,
JointHessianSource::Dense(h.clone()),
&ranges,
3,
0.0,
0.0,
0.0,
1.0,
0.0,
true,
true,
false,
true,
EvalMode::ValueAndGradient,
&options,
crate::types::RhoPrior::Flat,
PseudoLogdetMode::Smooth,
&no_dh,
None,
&no_d2h,
None,
None,
None,
None,
None,
None,
None,
None,
None,
None,
)
.expect("projected eval ok");
// project_hessian_logdet = false (the 0.1.92 / pre-fix behavior)
let unprojected = joint_outer_evaluate(
&inner,
&specs,
&per_block,
&rho,
&beta,
JointHessianSource::Dense(h.clone()),
&ranges,
3,
0.0,
0.0,
0.0,
1.0,
0.0,
true,
true,
false,
false,
EvalMode::ValueAndGradient,
&options,
crate::types::RhoPrior::Flat,
PseudoLogdetMode::Smooth,
&no_dh,
None,
&no_d2h,
None,
None,
None,
None,
None,
None,
None,
None,
None,
None,
)
.expect("unprojected eval ok");
let g_un = unprojected.gradient[0];
let g_pr = projected.gradient[0];
// The 1e-30 in the denominator only guards the printed ratio against a
// zero projected gradient.
eprintln!(
"{:>5.1} {:>10.3e} {:>16.6e} {:>16.6e} {:>10.3e}",
rho_val,
lam,
g_un,
g_pr,
g_un.abs() / (g_pr.abs() + 1e-30)
);
// Exact comparison is intentional: 10.0 is one of the literal scan points.
if rho_val == 10.0 {
g_un_at_10 = g_un.abs();
g_pr_at_10 = g_pr.abs();
}
}
// Finding: at this fixture geometry the two routes agree to
// ~1e-6 relative precision at every ρ in [0, 10]. Both grow
// linearly in λ (≈ ½ λ β'Sβ + bounded trace contribution).
// The optimizer-visible blow-up in large-scale therefore cannot be
// a missing projection in joint_outer_evaluate — it must live
// in the survival-marginal-slope custom gradient path.
// Only the ρ = 10 point is asserted; the other scan points are printed for
// inspection.
let rel_diff = (g_un_at_10 - g_pr_at_10).abs() / g_pr_at_10.max(1e-30);
assert!(
rel_diff < 1e-4,
"projection should be near-invariant on this fixture at rho=10; \
got g_un={:.6e}, g_pr={:.6e}, rel_diff={:.3e}",
g_un_at_10,
g_pr_at_10,
rel_diff
);
}
// ── Large-scale reproducer for the marginal-slope ρ-saturation
// failure ────────────────────────────────────────────────────────────
//
// Failure being investigated:
// outer iter=60, |g|=4.18e13, three of four ρ-coords pinned at the
// box bound ±10 (`with_rho_bound(10.0)`). The dominant explicit term
// ½λβ'Sβ at large scale (n≈2e5, p≈60, β'Sβ~10⁴, λ=exp(10)≈22k) is
// only ~10⁸ — observed gradient is ~10¹³, FIVE orders of magnitude
// beyond what the projected-trace kernel cancellation predicts.
//
// The existing `large_scale_rho_scan_joint_outer_evaluate_is_projection_invariant`
// test uses single-block, p=3, nullspace_dims=1, and supplies
// `compute_dh = Ok(None)` — that path SKIPS the trace pair entirely and
// therefore cannot reproduce the failure. The large-scale fit has:
// - 3 blocks (time_surface, marginal_surface, logslope_surface)
// - 4 penalty coords (time:1, marginal:2 [anisotropic], logslope:1)
// - Duchon-shape penalties: large nullspace_dims (d+1=4 for d=3 PCs)
// producing rank-deficient S with many zero eigenvalues
// - n ~ 2e5 → H_unpen scale ~ n × diag-of-design-Gram
// - Realistic `compute_dh(d)` returning the per-coord penalty drift
// ∂H/∂ρ_k = λ_k S_k (chained through the direction d)
//
// This test reproduces the SHAPE: builds large-scale-dimensioned blocks
// with rank-deficient Duchon-shape penalties, scales H to large-scale
// magnitude, evaluates `joint_outer_evaluate` at the actual failure ρ point
// [time=10, marg=10, marg=10, logslope=4.5], and asserts every gradient
// entry is BOUNDED by a physically reasonable multiple of the dominant
// ½λβ'Sβ term.
//
// NOTE(review): despite the test name, the β-chain closure passed here is
// the idealized `Ok(None)` (see the contract note in the body); what
// distinguishes this test from the single-block scan is the multi-block,
// rank-deficient geometry, not a non-trivial `compute_dh`.
//
// If this test passes with reasonable bounds: the bug is NOT in
// joint_outer_evaluate itself — it must live in the marginal-slope-
// specific drift derivatives (`evaluate_exact_newton_joint_gradient_*`
// in survival_marginal_slope.rs) that feed the closure.
// If this test fails: joint_outer_evaluate has a numerical defect that
// surfaces at large scale + realistic Ḣ. We then bisect inside the
// evaluator.
//
#[test]
fn large_scale_multiblock_outer_gradient_with_realistic_drift_is_bounded() {
    // LargeScale-realistic dimensions for binary-outcome marginal-slope.
    // Duchon(PC1,PC2,PC3, centers=10, order=1) → p_basis = centers +
    // null_basis(d+1=4) = 14 columns per spatial block, nullspace dim=4.
    // The actual fit has time_surface with a different basis (B-spline
    // along entry/exit age) — we approximate with p_time=10, null=2.
    let p_time = 10usize;
    let p_marg = 14usize;
    let p_logs = 14usize;
    let p_total = p_time + p_marg + p_logs;
    // Block ranges in the joint coefficient vector.
    let ranges = vec![
        (0, p_time),
        (p_time, p_time + p_marg),
        (p_time + p_marg, p_total),
    ];
    // ── Build rank-deficient Duchon-shape penalty matrices.
    // S = U diag(σ) Uᵀ where σ has `nullspace` trailing zeros and U is a
    // deterministic discrete-cosine basis, so the eigenstructure is mimicked
    // without any randomness.
    fn build_duchon_shape(p: usize, nullspace: usize, signal_scale: f64) -> Array2<f64> {
        // Diagonal eigenvalue spectrum, geometric decay across the
        // signal subspace then zeros on the nullspace.
        let rank = p - nullspace;
        let mut eigvals = vec![0.0_f64; p];
        for i in 0..rank {
            // 1.0, 0.5, 0.25, ... — physical Duchon penalty spectrum
            // has spectrum decaying like 1/k for high-frequency modes;
            // geometric decay is a faithful caricature.
            eigvals[i] = signal_scale * 0.5_f64.powi(i as i32);
        }
        // Use a deterministic orthogonal basis: discrete cosine basis.
        // U[i,j] = sqrt(2/p) cos(π (i+0.5) j / p) for j>0; U[i,0]=1/√p.
        let mut u = Array2::<f64>::zeros((p, p));
        for i in 0..p {
            u[[i, 0]] = 1.0 / (p as f64).sqrt();
            for j in 1..p {
                u[[i, j]] = (2.0 / p as f64).sqrt()
                    * (std::f64::consts::PI * (i as f64 + 0.5) * j as f64 / p as f64).cos();
            }
        }
        // S = U diag(eigvals) Uᵀ, accumulated one rank-one term at a time;
        // zero eigenvalues contribute nothing and are skipped.
        let mut s = Array2::<f64>::zeros((p, p));
        for k in 0..p {
            if eigvals[k] == 0.0 {
                continue;
            }
            for i in 0..p {
                for j in 0..p {
                    s[[i, j]] += eigvals[k] * u[[i, k]] * u[[j, k]];
                }
            }
        }
        s
    }
    // time_surface: 1 penalty (nullspace=2: constant + linear in age).
    let s_time = build_duchon_shape(p_time, 2, 1.0);
    // marginal_surface: 2 penalties (nullspace=4 each, anisotropic).
    let s_marg_0 = build_duchon_shape(p_marg, 4, 1.0);
    let s_marg_1 = build_duchon_shape(p_marg, 4, 0.7);
    // logslope_surface: 1 penalty (nullspace=4).
    let s_logs = build_duchon_shape(p_logs, 4, 1.0);
    // ── Failure-point ρ = [10, 10, 10, 4.5]. λ = exp(ρ).
    let rho = array![10.0_f64, 10.0, 10.0, 4.5];
    let lams: Array1<f64> = rho.mapv(f64::exp);
    // λ-scaled S matrices (per-block, in block-local indexing — this
    // is what BlockwiseInnerResult.s_lambdas stores).
    let s_lambdas_local: Vec<Array2<f64>> = vec![
        s_time.mapv(|v| v * lams[0]),
        // marginal block has TWO penalties — they are summed into one
        // local s_lambda (this matches how BlockwiseInnerResult stores
        // a per-block sum of all penalties in that block):
        (&s_marg_0 * lams[1]) + &(&s_marg_1 * lams[2]),
        s_logs.mapv(|v| v * lams[3]),
    ];
    // β at large scale: |β|∞ ~ 1, β'Sβ ~ trace(S) ~ O(p) ~ 10.
    let beta_flat = Array1::<f64>::from_iter((0..p_total).map(|i| ((i as f64) * 0.13).sin()));
    // ── Large-scale joint unpenalized Hessian.
    // Real survival Hessian = Xᵀ W X with W diagonal and n=2e5. We
    // mimic the SCALE by H = n * (I + small dense perturbation).
    let n_scale = 2.0e5_f64;
    let mut h = Array2::<f64>::eye(p_total) * n_scale;
    // Add a small off-diagonal coupling to make it non-trivial but SPD:
    // each off-diagonal entry is 0.05·n·exp(-|i-j|/p_total).
    for i in 0..p_total {
        for j in 0..p_total {
            if i != j {
                let v = 0.05_f64
                    * n_scale
                    * ((i as f64 - j as f64).abs() / p_total as f64).exp().recip();
                h[[i, j]] = v;
            }
        }
    }
    // ── Hessian β-chain closure.
    // CONTRACT: `compute_dh(v_k)` takes a β-space direction `v_k`
    // (length p_total = `∂β/∂ρ_k` under the envelope) and returns
    // `D_beta H[v_k]` — the third-order tensor of H contracted with
    // `v_k`. The penalty-drift component `λ_k S_k` is added by
    // `joint_outer_evaluate` automatically from `inner.s_lambdas` —
    // this closure adds ONLY the β-chained piece.
    //
    // For an idealized H_unpen that is independent of β (linear model
    // limit, no nonlinear inner geometry), `D_beta H = 0` and the
    // closure returns `Ok(None)`. This is exactly the regime the
    // existing single-block `large_scale_rho_scan_*` test exercises
    // and finds projection-invariant. The marginal-slope family's
    // Hessian DOES depend on β (through the joint geometry), so the
    // closure is non-trivial in production — and that is the
    // candidate source of the gradient blowup.
    //
    // This test takes the idealized path (`Ok(None)`) so any blowup
    // observed here is attributable to `joint_outer_evaluate`'s
    // multi-block / rank-deficient-S handling alone. If this test
    // PASSES (gradient bounded), the bug must live in the family's
    // `hessian_derivative_correction_result` β-chain — not in the
    // evaluator. If it FAILS, the evaluator itself has the defect at
    // large scale + Duchon-shape S.
    let no_dh = |_v_k: &Array1<f64>| -> Result<Option<DriftDerivResult>, String> { Ok(None) };
    let compute_dh = no_dh;
    let no_d2h = |_u: &Array1<f64>,
                  _v: &Array1<f64>|
     -> Result<Option<DriftDerivResult>, String> { Ok(None) };
    // ── ParameterBlockSpec for each block.
    // `start` is the block's offset in the joint coefficient vector, so each
    // spec is seeded with its OWN slice of `beta_flat` — the same slice that
    // `inner.block_states` uses below — rather than the leading `p` entries.
    let mk_spec = |name: &str,
                   start: usize,
                   p: usize,
                   penalties: Vec<Array2<f64>>,
                   null: usize,
                   rho_block: Array1<f64>|
     -> ParameterBlockSpec {
        ParameterBlockSpec {
            name: name.to_string(),
            design: DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(
                Array2::<f64>::zeros((1, p)),
            )),
            offset: Array1::zeros(1),
            penalties: penalties.into_iter().map(PenaltyMatrix::Dense).collect(),
            nullspace_dims: vec![null],
            initial_log_lambdas: rho_block,
            initial_beta: Some(beta_flat.slice(s![start..start + p]).to_owned()),
            gauge_priority: 100,
            jacobian_callback: None,
            stacked_design: None,
            stacked_offset: None,
        }
    };
    let specs = vec![
        mk_spec(
            "time_surface",
            0,
            p_time,
            vec![s_time.clone()],
            2,
            array![rho[0]],
        ),
        mk_spec(
            "marginal_surface",
            p_time,
            p_marg,
            vec![s_marg_0.clone(), s_marg_1.clone()],
            4,
            array![rho[1], rho[2]],
        ),
        mk_spec(
            "logslope_surface",
            p_time + p_marg,
            p_logs,
            vec![s_logs.clone()],
            4,
            array![rho[3]],
        ),
    ];
    let per_block = vec![array![rho[0]], array![rho[1], rho[2]], array![rho[3]]];
    // Hand-built inner result: marked converged, zero log-likelihood, and a
    // penalty value of ½ Σ_k λ_k β_k' S_k β_k over the four penalty coords.
    let inner = BlockwiseInnerResult {
        block_states: vec![
            ParameterBlockState {
                beta: beta_flat.slice(s![0..p_time]).to_owned(),
                eta: Array1::zeros(1),
            },
            ParameterBlockState {
                beta: beta_flat.slice(s![p_time..p_time + p_marg]).to_owned(),
                eta: Array1::zeros(1),
            },
            ParameterBlockState {
                beta: beta_flat.slice(s![p_time + p_marg..p_total]).to_owned(),
                eta: Array1::zeros(1),
            },
        ],
        active_sets: vec![None, None, None],
        log_likelihood: 0.0,
        penalty_value: 0.5
            * (lams[0]
                * beta_flat.slice(s![0..p_time]).dot(&fast_av(
                    &s_time,
                    &beta_flat.slice(s![0..p_time]).to_owned(),
                ))
                + lams[1]
                    * beta_flat.slice(s![p_time..p_time + p_marg]).dot(&fast_av(
                        &s_marg_0,
                        &beta_flat.slice(s![p_time..p_time + p_marg]).to_owned(),
                    ))
                + lams[2]
                    * beta_flat.slice(s![p_time..p_time + p_marg]).dot(&fast_av(
                        &s_marg_1,
                        &beta_flat.slice(s![p_time..p_time + p_marg]).to_owned(),
                    ))
                + lams[3]
                    * beta_flat.slice(s![p_time + p_marg..p_total]).dot(&fast_av(
                        &s_logs,
                        &beta_flat.slice(s![p_time + p_marg..p_total]).to_owned(),
                    ))),
        cycles: 1,
        converged: true,
        block_logdet_h: 0.0,
        block_logdet_s: 0.0,
        s_lambdas: s_lambdas_local,
        joint_workspace: None,
        kkt_residual: None,
        active_constraints: None,
    };
    let options = BlockwiseFitOptions {
        use_remlobjective: true,
        use_outer_hessian: false,
        ..BlockwiseFitOptions::default()
    };
    let projected = joint_outer_evaluate(
        &inner,
        &specs,
        &per_block,
        &rho,
        &beta_flat,
        JointHessianSource::Dense(h.clone()),
        &ranges,
        p_total,
        0.0,
        0.0,
        0.0,
        1.0,
        0.0,
        true,
        true,
        false,
        true,
        EvalMode::ValueAndGradient,
        &options,
        crate::types::RhoPrior::Flat,
        PseudoLogdetMode::Smooth,
        &compute_dh,
        None,
        &no_d2h,
        None,
        None,
        None,
        None,
        None,
        None,
        None,
        None,
        None,
        None,
    )
    .expect("large-scale projected eval");
    eprintln!("\n=== large-scale multi-block reproducer with realistic Ḣ ===");
    eprintln!("ρ = {:?}", rho.as_slice().unwrap());
    eprintln!("λ = {:?}", lams.as_slice().unwrap());
    eprintln!(
        "|β|∞ = {:.3}",
        beta_flat.iter().fold(0.0_f64, |a, &b| a.max(b.abs()))
    );
    eprintln!("objective = {:.6e}", projected.objective);
    eprintln!("gradient = {:?}", projected.gradient.as_slice().unwrap());
    // Physical-bound check: ½λ_k β'_k S_k β_k is the dominant explicit
    // term per coord. For large-scale shape this is ~10⁸ at ρ=10 with
    // β-scale O(1). The full gradient including the projected trace
    // pair should be of THE SAME ORDER (or smaller after cancellation),
    // never 10⁵× larger.
    let dominant_terms = [
        0.5 * lams[0]
            * beta_flat.slice(s![0..p_time]).dot(&fast_av(
                &s_time,
                &beta_flat.slice(s![0..p_time]).to_owned(),
            )),
        0.5 * lams[1]
            * beta_flat.slice(s![p_time..p_time + p_marg]).dot(&fast_av(
                &s_marg_0,
                &beta_flat.slice(s![p_time..p_time + p_marg]).to_owned(),
            )),
        0.5 * lams[2]
            * beta_flat.slice(s![p_time..p_time + p_marg]).dot(&fast_av(
                &s_marg_1,
                &beta_flat.slice(s![p_time..p_time + p_marg]).to_owned(),
            )),
        0.5 * lams[3]
            * beta_flat.slice(s![p_time + p_marg..p_total]).dot(&fast_av(
                &s_logs,
                &beta_flat.slice(s![p_time + p_marg..p_total]).to_owned(),
            )),
    ];
    assert_eq!(
        projected.gradient.len(),
        dominant_terms.len(),
        "projected gradient dimension changed"
    );
    for (k, (&g, &dominant_term)) in projected
        .gradient
        .iter()
        .zip(dominant_terms.iter())
        .enumerate()
    {
        // Bound: trace pair adds ~p contributions, plus H⁻¹ Ḣ trace
        // bounded by Σ |λ_k| / |H_diag| × p ~ λ_k p / n ~ tiny at
        // large scale. Total gradient should be within 100× of the
        // dominant term (allowing for projection-correction sign); the
        // `.max(1.0)` keeps the bound meaningful if a dominant term is tiny.
        let bound = dominant_term.abs().max(1.0) * 100.0;
        assert!(g.is_finite(), "gradient[{k}] is non-finite: {g}");
        assert!(
            g.abs() <= bound,
            "gradient[{k}] = {:.6e} exceeds physical bound 100·|½λβ'Sβ| = {:.6e} \
             (dominant_term={:.6e}); this reproduces the large-scale blowup \
             inside joint_outer_evaluate.",
            g,
            bound,
            dominant_term
        );
    }
}
/// Pins how `derivative_quality_options_and_warm_start` chooses the inner
/// tolerance: with the final flag `true` the evaluation tolerance equals
/// `outer_tol`; with it `false` the caller's `inner_tol` and cycle budget are
/// left untouched.
#[test]
fn direct_joint_hyper_inner_tolerance_follows_outer_target() {
    // Case 1: caller's inner tolerance (1e-6) is already tighter than the
    // outer tolerance (1e-5).
    let base = BlockwiseFitOptions {
        inner_tol: 1e-6,
        outer_tol: 1e-5,
        inner_max_cycles: 100,
        ..BlockwiseFitOptions::default()
    };
    let (joint_eval, joint_warm_start) =
        derivative_quality_options_and_warm_start(&base, None, true);
    assert_eq!(
        joint_eval.inner_tol, base.outer_tol,
        "default exact joint-hyper eval should use the outer optimizer scale"
    );
    assert_eq!(joint_eval.inner_max_cycles, base.inner_max_cycles);
    assert!(
        joint_warm_start.is_none(),
        "loosening to the outer scale should not discard cached inner state"
    );

    // Posted numbers from a large-scale run, checked against the scaled
    // evaluation tolerance.
    let large_scale_objective = 3.689e5;
    let posted_residual = 6.788e-1;
    let posted_objective_change = 4.209e-2;
    let eval_tol = joint_eval.inner_tol * (1.0 + large_scale_objective);
    let residual_accepted = posted_residual <= 2.0 * eval_tol;
    let change_accepted = posted_objective_change <= eval_tol;
    assert!(
        residual_accepted && change_accepted,
        "the exact outer startup validation should accept numerically flat inner solves at outer scale"
    );

    let (rho_only_base, _) = derivative_quality_options_and_warm_start(&base, None, false);
    assert_eq!(
        rho_only_base.inner_tol, base.inner_tol,
        "rho-only exact joint-hyper eval must preserve the rho-only outer surface"
    );

    // Case 2: caller's inner tolerance (1e-3) is looser than the outer
    // tolerance, so the joint evaluation has to tighten it.
    let loose_inner = BlockwiseFitOptions {
        inner_tol: 1e-3,
        outer_tol: 1e-5,
        inner_max_cycles: 100,
        ..BlockwiseFitOptions::default()
    };
    let (joint_tightened, _) =
        derivative_quality_options_and_warm_start(&loose_inner, None, true);
    assert_eq!(joint_tightened.inner_tol, loose_inner.outer_tol);
    assert_eq!(joint_tightened.inner_max_cycles, 200);

    let (rho_only_loose, _) =
        derivative_quality_options_and_warm_start(&loose_inner, None, false);
    assert_eq!(rho_only_loose.inner_tol, loose_inner.inner_tol);
    assert_eq!(rho_only_loose.inner_max_cycles, loose_inner.inner_max_cycles);
}
/// After `joint_hyper_options_for_outer_tolerance` rewrites the options for a
/// spatial outer tolerance of 1e-4, the derivative-quality options must carry
/// that value as both `outer_tol` and `inner_tol`, and no strict warm start
/// may be requested.
#[test]
fn exact_spatial_joint_hyper_inner_tolerance_follows_spatial_outer_target() {
    let caller_options = BlockwiseFitOptions {
        inner_tol: 1e-6,
        outer_tol: 1e-10,
        inner_max_cycles: 200,
        ..BlockwiseFitOptions::default()
    };
    let spatial_outer_tol = 1e-4;

    let spatial_options =
        joint_hyper_options_for_outer_tolerance(&caller_options, spatial_outer_tol);
    let (spatial_eval, spatial_warm_start) =
        derivative_quality_options_and_warm_start(&spatial_options, None, true);

    assert_eq!(spatial_eval.outer_tol, spatial_outer_tol);
    assert_eq!(
        spatial_eval.inner_tol, spatial_outer_tol,
        "exact spatial [rho, psi] evaluations should certify beta only to the tolerance of the outer optimizer consuming the derivative"
    );
    assert!(
        spatial_warm_start.is_none(),
        "loosening an over-tight caller tolerance should preserve the cached inner state"
    );

    // Posted numbers from a large-scale run, checked against the scaled
    // evaluation tolerance.
    let large_scale_objective = 3.689e5;
    let posted_residual_plateau = 6.788e-1;
    let posted_objective_change = 4.209e-2;
    let eval_tol = spatial_eval.inner_tol * (1.0 + large_scale_objective);
    let residual_accepted = posted_residual_plateau <= eval_tol;
    let change_accepted = posted_objective_change <= eval_tol;
    assert!(
        residual_accepted && change_accepted,
        "the posted saturated Newton plateau is below the spatial outer derivative accuracy target"
    );
}
/// Test helper: runs `outerobjectivegradienthessian` in
/// `EvalMode::ValueAndGradient` and returns only the objective value, the
/// gradient and the warm start, discarding the third element of its result.
///
/// Errors from the underlying evaluation are propagated unchanged via `?`.
fn outerobjective_andgradient<F: CustomFamily + Clone + Send + Sync + 'static>(
    family: &F,
    specs: &[ParameterBlockSpec],
    options: &BlockwiseFitOptions,
    penalty_counts: &[usize],
    rho: &Array1<f64>,
    warm_start: Option<&ConstrainedWarmStart>,
) -> Result<(f64, Array1<f64>, ConstrainedWarmStart), String> {
    let evaluation = super::test_support::outerobjectivegradienthessian(
        family,
        specs,
        options,
        penalty_counts,
        rho,
        warm_start,
        EvalMode::ValueAndGradient,
    )?;
    // The third slot of the tuple is not part of this wrapper's contract.
    let (objective, gradient, _, warm_cache) = evaluation;
    Ok((objective, gradient, warm_cache))
}
/// Bundle of inputs for outer-objective tests on the binomial location-scale
/// wiggle family: the family instance plus the specs, penalty counts, `rho`
/// vector and fit options handed to the evaluator together.
struct BinomialLocationScaleWiggleOuterFixture {
// Family instance under test.
family: BinomialLocationScaleWiggleFamily,
// One spec per parameter block.
specs: Vec<ParameterBlockSpec>,
// Penalty count per block, parallel to `specs`.
penalty_counts: Vec<usize>,
// Log smoothing parameters at which the outer objective is evaluated.
rho: Array1<f64>,
// Fit options passed to the evaluation.
options: BlockwiseFitOptions,
}
fn binomial_location_scale_wiggle_outer_fixture() -> BinomialLocationScaleWiggleOuterFixture {
let base = binomial_location_scale_base_fixture();
let q_seed = Array1::linspace(-1.4, 1.4, base.n);
let knots = crate::families::wiggle::initializewiggle_knots_from_seed(q_seed.view(), 3, 4)
.expect("knots");
let wiggle_block = crate::families::wiggle::buildwiggle_block_input_from_knots(
q_seed.view(),
&knots,
3,
2,
false,
)
.expect("wiggle block");
let wigglespec = ParameterBlockSpec {
name: "wiggle".to_string(),
design: wiggle_block.design.clone(),
offset: wiggle_block.offset.clone(),
penalties: wiggle_block
.penalties
.iter()
.map(|ps| match ps {
crate::solver::estimate::PenaltySpec::Block {
local, col_range, ..
} => PenaltyMatrix::Blockwise {
local: local.clone(),
col_range: col_range.clone(),
total_dim: wiggle_block.design.ncols(),
},
crate::solver::estimate::PenaltySpec::Dense(m)
| crate::solver::estimate::PenaltySpec::DenseWithMean { matrix: m, .. } => {
PenaltyMatrix::Dense(m.clone())
}
})
.collect(),
nullspace_dims: wiggle_block.nullspace_dims.clone(),
initial_log_lambdas: array![0.1],
initial_beta: Some(Array1::from_elem(wiggle_block.design.ncols(), 0.03)),
gauge_priority: 100,
jacobian_callback: None,
stacked_design: None,
stacked_offset: None,
};
let family = BinomialLocationScaleWiggleFamily {
y: base.y,
weights: base.weights,
link_kind: crate::types::InverseLink::Standard(crate::types::StandardLink::Probit),
threshold_design: Some(base.threshold_design),
log_sigma_design: Some(base.log_sigma_design),
wiggle_knots: knots,
wiggle_degree: 3,
policy: crate::resource::ResourcePolicy::default_library(),
};
BinomialLocationScaleWiggleOuterFixture {
family,
specs: vec![base.threshold_spec, base.log_sigma_spec, wigglespec],
penalty_counts: vec![1usize, 1usize, 1usize],
rho: array![0.05, -0.15, 0.1],
options: BlockwiseFitOptions {
use_remlobjective: true,
ridge_floor: 1e-10,
outer_max_iter: 1,
..BlockwiseFitOptions::default()
},
}
}
/// Field-less test family marker. Its trait implementation is not part of
/// this section of the file; below it is used to query `outer_seed_config`.
#[derive(Clone)]
struct OneBlockIdentityFamily;
/// Joint-coupled Hessian cost is `n·(Σp_b)²`, while the block-diagonal default
/// is `n·Σp_b²`.
///
/// With `p_b = (12, 20, 8)` and `n = 200`: joint = 200·40² = 320_000 and
/// block-diagonal = 200·(144 + 400 + 64) = 121_600. The gap is the cross-block
/// fill 2·200·(240 + 96 + 160) = 198_400.
#[test]
fn joint_coupled_coefficient_hessian_cost_matches_n_times_p_total_squared() {
    // All-zero 200-row design with `width` columns and no penalties.
    let zero_design_spec = |width: usize| {
        let design = Array2::zeros((200, width));
        ParameterBlockSpec {
            name: "test".to_string(),
            design: DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(design)),
            offset: Array1::zeros(200),
            penalties: Vec::new(),
            nullspace_dims: Vec::new(),
            initial_log_lambdas: Array1::zeros(0),
            initial_beta: None,
            gauge_priority: 100,
            jacobian_callback: None,
            stacked_design: None,
            stacked_offset: None,
        }
    };
    let specs = vec![
        zero_design_spec(12),
        zero_design_spec(20),
        zero_design_spec(8),
    ];

    let joint_cost = joint_coupled_coefficient_hessian_cost(200, &specs);
    assert_eq!(joint_cost, 200 * 40 * 40);

    let block_diagonal_cost = default_coefficient_hessian_cost(&specs);
    assert_eq!(block_diagonal_cost, 200 * (144 + 400 + 64));

    assert!(
        joint_coupled_coefficient_hessian_cost(200, &specs)
            > default_coefficient_hessian_cost(&specs)
    );
}
/// At n = 320_000 rows and p = 101 coefficients with three retained penalties,
/// the capability-based order selection must still report second order.
#[test]
fn large_scale_exact_adaptive_hessian_order_stays_second_order() {
    let p = 101usize;
    let retained_rho_dim = 3usize;
    let n_train = 320_000u64;

    // Pin the cost arithmetic first so a change in the numbers is caught
    // independently of the order decision.
    let coefficient_hessian_cost = n_train * (p as u64) * (p as u64);
    assert_eq!(coefficient_hessian_cost, 3_264_320_000);
    assert_eq!(
        retained_rho_dim as u64 * coefficient_hessian_cost,
        9_792_960_000
    );

    // One-row placeholder design; the cost is passed in explicitly below.
    let placeholder_design = Array2::zeros((1, p));
    let spec = ParameterBlockSpec {
        name: "matern60".to_string(),
        design: DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(placeholder_design)),
        offset: Array1::zeros(1),
        penalties: std::iter::repeat_with(|| PenaltyMatrix::Dense(Array2::eye(p)))
            .take(retained_rho_dim)
            .collect(),
        nullspace_dims: vec![0; retained_rho_dim],
        initial_log_lambdas: Array1::zeros(retained_rho_dim),
        initial_beta: None,
        gauge_priority: 100,
        jacobian_callback: None,
        stacked_design: None,
        stacked_offset: None,
    };

    let selected_order = exact_outer_order_from_capability(&[spec], coefficient_hessian_cost);
    assert_eq!(selected_order, ExactOuterDerivativeOrder::Second);
}
/// Pins the decision boundaries of `use_joint_matrix_free_path(p, n)` on both
/// sides of each threshold exercised below.
#[test]
fn use_joint_matrix_free_path_triggers_at_each_documented_threshold() {
// p ≥ 512 is sufficient regardless of n.
assert!(use_joint_matrix_free_path(512, 1));
assert!(use_joint_matrix_free_path(2048, 4));
assert!(!use_joint_matrix_free_path(511, 1));
// n ≥ 50_000 AND p ≥ 128: both must hold. This keeps p≈51 FLEX
// marginal-slope large-scale fits on the bounded dense-materialized path.
assert!(use_joint_matrix_free_path(128, 50_000));
assert!(!use_joint_matrix_free_path(127, 50_000));
// 128 · 31_249 = 3_999_872, just under the n · p bound checked below.
assert!(!use_joint_matrix_free_path(128, 31_249));
assert!(!use_joint_matrix_free_path(51, 320_000));
// n · p ≥ 4_000_000 is the linear-work fallback, but only after the
// same moderate-p guard; below that, materializing `p` columns is a
// deterministic small-p bound on expensive row-kernel HVPs.
// 128 · 31_250 = 4_000_000 exactly; 127 · 31_497 = 4_000_119 exceeds the
// product bound but fails the p ≥ 128 guard.
assert!(use_joint_matrix_free_path(128, 31_250));
assert!(!use_joint_matrix_free_path(127, 31_497));
// Below every threshold: dense path.
assert!(!use_joint_matrix_free_path(8, 100));
assert!(!use_joint_matrix_free_path(64, 1000));
}
/// A p = 51, n = 320_000 problem must not take the matrix-free route. The PCG
/// iteration cap (`JOINT_PCG_MAX_ITER_MULTIPLIER * p`, asserted to be 204) is
/// computed only to quantify the alternative in the failure message.
#[test]
fn large_scale_shape_margslope_flex_cycle0_uses_bounded_dense_route() {
    let coefficient_count = 51;
    let row_count = 320_000;

    let max_pcg_hvps_before_fix = JOINT_PCG_MAX_ITER_MULTIPLIER * coefficient_count;
    assert_eq!(max_pcg_hvps_before_fix, 204);

    let takes_matrix_free_route = use_joint_matrix_free_path(coefficient_count, row_count);
    assert!(
        !takes_matrix_free_route,
        "p=51/n=320k should materialize exactly 51 columns instead of risking up to {max_pcg_hvps_before_fix} expensive PCG matvecs in cycle 0"
    );
}
/// Test double for `ExactNewtonJointHessianWorkspace` that counts how often
/// the dense build and the matvec are requested and reports a configurable
/// source preference.
struct CountingHessianWorkspace {
// Incremented on every `hessian_dense` call.
dense_calls: Arc<AtomicUsize>,
// Incremented on every `hessian_matvec` call.
matvec_calls: Arc<AtomicUsize>,
// Value returned verbatim from `hessian_source_preference`.
source_preference: JointHessianSourcePreference,
}
// Identity-Hessian stub: every representation (dense, matvec, diagonal)
// describes the 2x2 identity, and the dense/matvec entry points bump their
// shared counters so tests can observe which path was taken.
impl ExactNewtonJointHessianWorkspace for CountingHessianWorkspace {
// Records the call, then returns the 2x2 identity.
fn hessian_dense(&self) -> Result<Option<Array2<f64>>, String> {
self.dense_calls.fetch_add(1, Ordering::Relaxed);
Ok(Some(Array2::eye(2)))
}
// Returns the preference configured on the struct.
fn hessian_source_preference(&self) -> JointHessianSourcePreference {
self.source_preference
}
// A matvec is always advertised as available.
fn hessian_matvec_available(&self) -> bool {
true
}
// Records the call, then applies the identity (returns a copy of `v`).
fn hessian_matvec(&self, v: &Array1<f64>) -> Result<Option<Array1<f64>>, String> {
self.matvec_calls.fetch_add(1, Ordering::Relaxed);
Ok(Some(v.clone()))
}
// Diagonal of the identity: all ones, length 2.
fn hessian_diagonal(&self) -> Result<Option<Array1<f64>>, String> {
Ok(Some(Array1::ones(2)))
}
// No directional derivative is provided; the NaN check also consumes `arr`.
fn directional_derivative(&self, arr: &Array1<f64>) -> Result<Option<Array2<f64>>, String> {
assert!(arr.iter().all(|v| !v.is_nan()));
Ok(None)
}
}
/// A dense-preferring workspace must yield the dense Hessian after exactly one
/// `hessian_dense` call and without a single matvec being issued.
#[test]
fn workspace_hessian_source_prefers_dense_without_zero_matvec_probe() {
    let dense_calls = Arc::new(AtomicUsize::new(0));
    let matvec_calls = Arc::new(AtomicUsize::new(0));
    let counting = CountingHessianWorkspace {
        dense_calls: Arc::clone(&dense_calls),
        matvec_calls: Arc::clone(&matvec_calls),
        source_preference: JointHessianSourcePreference::Dense,
    };
    let workspace: Arc<dyn ExactNewtonJointHessianWorkspace> = Arc::new(counting);

    let source = exact_newton_joint_hessian_source_from_workspace(
        &workspace,
        2,
        MaterializationIntent::InnerSolve,
        "counting workspace",
    )
    .expect("hessian source should build")
    .expect("hessian source should be present");

    assert_eq!(dense_calls.load(Ordering::Relaxed), 1);
    assert_eq!(matvec_calls.load(Ordering::Relaxed), 0);

    let JointHessianSource::Dense(hessian) = source else {
        panic!("dense source was not preferred")
    };
    assert_eq!(hessian, Array2::<f64>::eye(2));

    // Re-check after inspecting the source: still no matvec.
    assert_eq!(matvec_calls.load(Ordering::Relaxed), 0);
}
/// An operator-preferring workspace must yield the operator source without
/// `hessian_dense` being called; applying it forwards to the workspace matvec
/// exactly once.
#[test]
fn workspace_hessian_source_honors_operator_preference_before_dense_probe() {
    let dense_calls = Arc::new(AtomicUsize::new(0));
    let matvec_calls = Arc::new(AtomicUsize::new(0));
    let counting = CountingHessianWorkspace {
        dense_calls: Arc::clone(&dense_calls),
        matvec_calls: Arc::clone(&matvec_calls),
        source_preference: JointHessianSourcePreference::Operator,
    };
    let workspace: Arc<dyn ExactNewtonJointHessianWorkspace> = Arc::new(counting);

    let source = exact_newton_joint_hessian_source_from_workspace(
        &workspace,
        2,
        MaterializationIntent::InnerSolve,
        "operator-preferred counting workspace",
    )
    .expect("hessian source should build")
    .expect("hessian source should be present");

    assert_eq!(
        dense_calls.load(Ordering::Relaxed),
        0,
        "operator-preferred source construction must not probe hessian_dense"
    );

    let JointHessianSource::Operator { apply, .. } = source else {
        panic!("operator source was not preferred")
    };
    // The stub's matvec is the identity, so the probe must come back unchanged.
    let probe = array![3.0, -2.0];
    assert_eq!(apply(&probe).expect("operator apply should succeed"), probe);
    assert_eq!(matvec_calls.load(Ordering::Relaxed), 1);
}
/// A workspace that exposes both a dense build and a matrix-free HVP and
/// refines its representation per intent (#738): matrix-free for the inner
/// solve, dense for logdet factorization. Mirrors CTN's contract.
struct IntentRefiningHessianWorkspace {
// Incremented on every `hessian_dense` call.
dense_calls: Arc<AtomicUsize>,
// Incremented on every `hessian_matvec` call.
matvec_calls: Arc<AtomicUsize>,
}
// Identity-Hessian stub whose preferred representation depends on the
// materialization intent; dense and matvec calls are counted.
impl ExactNewtonJointHessianWorkspace for IntentRefiningHessianWorkspace {
// Records the call, then returns the 2x2 identity.
fn hessian_dense(&self) -> Result<Option<Array2<f64>>, String> {
self.dense_calls.fetch_add(1, Ordering::Relaxed);
Ok(Some(Array2::eye(2)))
}
// Intent-agnostic preference: operator.
fn hessian_source_preference(&self) -> JointHessianSourcePreference {
JointHessianSourcePreference::Operator
}
// Intent-specific refinement: only logdet factorization asks for the dense
// build; every other intent listed here keeps the operator.
fn hessian_source_preference_for_intent(
&self,
intent: MaterializationIntent,
) -> JointHessianSourcePreference {
match intent {
MaterializationIntent::LogdetFactorization => JointHessianSourcePreference::Dense,
MaterializationIntent::InnerSolve
| MaterializationIntent::OuterEvaluation
| MaterializationIntent::OuterGradient => JointHessianSourcePreference::Operator,
}
}
// A matvec is always advertised as available.
fn hessian_matvec_available(&self) -> bool {
true
}
// Records the call, then applies the identity (returns a copy of `v`).
fn hessian_matvec(&self, v: &Array1<f64>) -> Result<Option<Array1<f64>>, String> {
self.matvec_calls.fetch_add(1, Ordering::Relaxed);
Ok(Some(v.clone()))
}
// Diagonal of the identity: all ones, length 2.
fn hessian_diagonal(&self) -> Result<Option<Array1<f64>>, String> {
Ok(Some(Array1::ones(2)))
}
// No directional derivative is provided; the NaN check also consumes `arr`.
fn directional_derivative(&self, arr: &Array1<f64>) -> Result<Option<Array2<f64>>, String> {
assert!(arr.iter().all(|v| !v.is_nan()));
Ok(None)
}
}
/// One workspace, two intents: logdet factorization must receive the dense
/// Hessian, and the inner solve must receive the matrix-free operator without
/// triggering a further dense build.
#[test]
fn logdet_intent_takes_dense_while_inner_solve_takes_operator() {
    let dense_calls = Arc::new(AtomicUsize::new(0));
    let matvec_calls = Arc::new(AtomicUsize::new(0));
    let workspace: Arc<dyn ExactNewtonJointHessianWorkspace> =
        Arc::new(IntentRefiningHessianWorkspace {
            dense_calls: Arc::clone(&dense_calls),
            matvec_calls: Arc::clone(&matvec_calls),
        });
    let dense_count = || dense_calls.load(Ordering::Relaxed);
    let matvec_count = || matvec_calls.load(Ordering::Relaxed);

    // Logdet factorization intent: the consumer factorizes H + S_lambda,
    // so the workspace hands back the structural dense build directly,
    // probing hessian_dense and skipping the operator wrapper.
    let logdet_source = exact_newton_joint_hessian_source_from_workspace(
        &workspace,
        2,
        MaterializationIntent::LogdetFactorization,
        "intent-refining logdet",
    )
    .expect("logdet source should build")
    .expect("logdet source should be present");
    assert_eq!(dense_count(), 1);
    assert_eq!(matvec_count(), 0);
    let JointHessianSource::Dense(hessian) = logdet_source else {
        panic!("logdet intent must take the dense representation")
    };
    assert_eq!(hessian, Array2::<f64>::eye(2));

    // Inner solve intent: only H · v is applied, so the same workspace
    // hands back the matrix-free operator without touching hessian_dense.
    let inner_source = exact_newton_joint_hessian_source_from_workspace(
        &workspace,
        2,
        MaterializationIntent::InnerSolve,
        "intent-refining inner solve",
    )
    .expect("inner source should build")
    .expect("inner source should be present");
    assert_eq!(
        dense_count(),
        1,
        "inner-solve intent must not probe hessian_dense"
    );
    let JointHessianSource::Operator { apply, .. } = inner_source else {
        panic!("inner-solve intent must take the operator representation")
    };
    let probe = array![1.5, -4.0];
    assert_eq!(apply(&probe).expect("operator apply should succeed"), probe);
    assert_eq!(matvec_count(), 1);
}
/// The default gradient cost must be exactly half the default Hessian cost.
///
/// Rationale carried by this test: the gradient-only sweep through the inner
/// Newton solve does roughly half the per-evaluation arithmetic of the full
/// Hessian assembly path (it skips the K-fold pairwise B_{j,k} blocks and the
/// K-fold inner derivative solves). Families that override
/// `coefficient_hessian_cost` (e.g. GAMLSS via
/// `joint_coupled_coefficient_hessian_cost`) then inherit a consistent
/// gradient-cost scaling without a per-family override.
#[test]
fn default_coefficient_gradient_cost_is_half_of_hessian_cost() {
    // All-zero `rows x cols` design with no penalties.
    let zero_design_spec = |rows: usize, cols: usize| {
        let design = Array2::zeros((rows, cols));
        ParameterBlockSpec {
            name: "test".to_string(),
            design: DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(design)),
            offset: Array1::zeros(rows),
            penalties: Vec::new(),
            nullspace_dims: Vec::new(),
            initial_log_lambdas: Array1::zeros(0),
            initial_beta: None,
            gauge_priority: 100,
            jacobian_callback: None,
            stacked_design: None,
            stacked_offset: None,
        }
    };
    let specs = vec![zero_design_spec(500, 10), zero_design_spec(500, 14)];

    let h_cost = default_coefficient_hessian_cost(&specs);
    let g_cost = default_coefficient_gradient_cost(&specs);

    // n·p² per block: 500·10² + 500·14².
    assert_eq!(h_cost, 500 * 100 + 500 * 196);
    assert_eq!(g_cost, h_cost / 2);
}
/// With a requested cap of 60, `cost_gated_first_order_max_iter` must return
/// 8 and 4 for the two cost levels when its boolean argument is `false`, and
/// the full 60 when it is `true`.
#[test]
fn first_order_outer_iter_gate_caps_expensive_gradient_paths() {
    let requested_max_iter = 60;
    // (cost, boolean argument, expected iteration cap)
    let cases = [
        (10_000_000_000, false, 8),
        (100_000_000_000, false, 4),
        (100_000_000_000, true, 60),
    ];
    for (cost, flag, expected_max_iter) in cases {
        assert_eq!(
            cost_gated_first_order_max_iter(requested_max_iter, cost, flag),
            expected_max_iter
        );
    }
}
/// `outer_seed_config` on `OneBlockIdentityFamily` must allow 6 seeds for
/// argument 4 and 4 seeds for argument 16, with a seed budget of 1 and two
/// screening inner iterations in both cases.
#[test]
fn custom_family_default_outer_seed_config_is_tightened_for_expensive_paths() {
    let family = OneBlockIdentityFamily;
    // (argument to outer_seed_config, expected max_seeds)
    for (dimension, expected_max_seeds) in [(4, 6), (16, 4)] {
        let config = family.outer_seed_config(dimension);
        assert_eq!(config.max_seeds, expected_max_seeds);
        assert_eq!(config.seed_budget, 1);
        assert_eq!(config.screen_max_inner_iterations, 2);
    }
}
/// Flooring at 1e-6 must lift a tiny positive weight to the floor, leave a
/// larger weight untouched, and keep an exact zero at zero.
#[test]
fn floor_positiveworking_weights_preserves_exactzeros() {
    let weights = array![0.0, 1.0e-16, 0.25];
    let floored = floor_positiveworking_weights(&weights, 1.0e-6);

    let expected = [0.0, 1.0e-6, 0.25];
    for (index, want) in expected.iter().enumerate() {
        assert_eq!(floored[index], *want);
    }
}
/// A cached warm start whose `rho` has the same length as the query must be
/// returned intact even when the values differ.
#[test]
fn screened_outer_warm_start_reuses_any_matching_rho_dimension() {
    let cached = ConstrainedWarmStart {
        rho: array![0.0, -0.5],
        block_beta: vec![array![1.0, -1.0]],
        active_sets: vec![None],
        cached_inner: None,
    };
    // Same length as the cached rho, different first coordinate.
    let rho_far = array![2.25, -0.5];

    let retained = screened_outer_warm_start(Some(&cached), &rho_far)
        .expect("matching-dimension warm starts should remain reusable");

    assert_eq!(retained.rho, array![0.0, -0.5]);
    assert_eq!(retained.block_beta[0], array![1.0, -1.0]);
    assert_eq!(retained.active_sets[0], None);
}
/// A flat cached beta must be split into per-block vectors following the block
/// widths, and a beta of the wrong total length must be rejected with a
/// message naming both lengths.
#[test]
fn cached_beta_warm_start_splits_blocks_and_validates_shape() {
    // Three-row all-zero design with `width` columns and no penalties.
    let named_spec = |name: &str, width: usize| {
        let design = Array2::zeros((3, width));
        ParameterBlockSpec {
            name: name.to_string(),
            design: DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(design)),
            offset: Array1::zeros(3),
            penalties: Vec::new(),
            nullspace_dims: Vec::new(),
            initial_log_lambdas: Array1::zeros(0),
            initial_beta: None,
            gauge_priority: 100,
            jacobian_callback: None,
            stacked_design: None,
            stacked_offset: None,
        }
    };
    let specs = vec![named_spec("a", 2), named_spec("b", 3)];

    // Happy path: 2 + 3 = 5 coefficients.
    let full_beta = array![1., 2., 3., 4., 5.];
    let warm = constrained_warm_start_from_cached_beta(4, &specs, &full_beta)
        .expect("matching beta");
    assert_eq!(warm.rho.len(), 4);
    assert_eq!(warm.block_beta, vec![array![1., 2.], array![3., 4., 5.]]);
    assert_eq!(warm.active_sets, vec![None, None]);
    assert!(warm.cached_inner.is_none());

    // Failure path: only three coefficients for five columns.
    let short_beta = array![1., 2., 3.];
    let Err(err) = constrained_warm_start_from_cached_beta(4, &specs, &short_beta) else {
        panic!("wrong beta length should be rejected")
    };
    assert!(
        err.to_string().contains(
            "cached inner beta has length 3, but custom-family blocks require length 5"
        ),
        "{err}"
    );
}
/// A cached beta containing NaN must be rejected with the non-finite message.
#[test]
fn cached_beta_warm_start_rejects_nonfinite_entries() {
    let design = Array2::zeros((3, 2));
    let spec = ParameterBlockSpec {
        name: "a".to_string(),
        design: DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(design)),
        offset: Array1::zeros(3),
        penalties: Vec::new(),
        nullspace_dims: Vec::new(),
        initial_log_lambdas: Array1::zeros(0),
        initial_beta: None,
        gauge_priority: 100,
        jacobian_callback: None,
        stacked_design: None,
        stacked_offset: None,
    };

    // Correct length, but the second entry is NaN.
    let poisoned_beta = array![1.0, f64::NAN];
    let Err(err) = constrained_warm_start_from_cached_beta(1, &[spec], &poisoned_beta) else {
        panic!("non-finite beta should be rejected")
    };
    assert!(
        err.to_string()
            .contains("cached inner beta contains non-finite entries"),
        "{err}"
    );
}
/// After seeding a cached beta, clearing `warm_cache` and calling `reset`
/// must bring the seeded coefficients back.
#[test]
fn custom_outer_state_reset_preserves_seeded_cached_beta() {
    let design = Array2::zeros((3, 2));
    let spec = ParameterBlockSpec {
        name: "a".to_string(),
        design: DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(design)),
        offset: Array1::zeros(3),
        penalties: Vec::new(),
        nullspace_dims: Vec::new(),
        initial_log_lambdas: Array1::zeros(0),
        initial_beta: None,
        gauge_priority: 100,
        jacobian_callback: None,
        stacked_design: None,
        stacked_offset: None,
    };

    let seeded_beta = array![4.0, -2.0];
    let mut state = CustomOuterState::new(None);
    state
        .seed_cached_beta(1, &[spec], &seeded_beta)
        .expect("cached beta seed");

    // Drop the live cache; `reset` has to restore it from the seed.
    state.warm_cache = None;
    state.reset();

    let restored = state
        .warm_cache
        .as_ref()
        .expect("reset should restore cached beta seed");
    assert_eq!(restored.rho.len(), 1);
    assert_eq!(restored.block_beta, vec![array![4.0, -2.0]]);
    assert!(restored.cached_inner.is_none());
}
/// A state constructed with a persistent warm start must get it back from
/// `reset` after `warm_cache` has been cleared.
#[test]
fn custom_outer_state_reset_preserves_existing_persistent_warm_start() {
    let persistent = ConstrainedWarmStart {
        rho: array![0.25],
        block_beta: vec![array![1.0, 2.0]],
        active_sets: vec![None],
        cached_inner: None,
    };
    let mut state = CustomOuterState::new(Some(persistent.clone()));

    // Drop the live cache; `reset` has to restore it.
    state.warm_cache = None;
    state.reset();

    let Some(restored) = state.warm_cache.as_ref() else {
        panic!("reset should restore persistent warm start")
    };
    assert_eq!(restored.rho, persistent.rho);
    assert_eq!(restored.block_beta, persistent.block_beta);
}
/// `compatible_with_rho` must accept any `rho` of matching length, whatever
/// its values, and reject one of a different length.
#[test]
fn public_warm_start_compatibility_checks_rho_dimension() {
    let inner = ConstrainedWarmStart {
        rho: array![0.0, -0.5],
        block_beta: vec![array![1.0, -1.0]],
        active_sets: vec![None],
        cached_inner: None,
    };
    let warm = CustomFamilyWarmStart { inner };

    // Same length (2), differing first coordinate: compatible.
    assert!(warm.compatible_with_rho(&array![0.75, -0.5]));
    assert!(warm.compatible_with_rho(&array![1.75, -0.5]));
    // Length 1 against a length-2 cache: incompatible.
    assert!(!warm.compatible_with_rho(&array![0.0]));
}
/// The psi drift-derivative callback built on top of a psi workspace must hand
/// back the workspace's block-local operator as an operator (with its block
/// metadata intact) rather than a dense matrix.
#[test]
fn psi_drift_deriv_workspace_preserves_block_local_operator() {
// Family with no blocks and zero log-likelihood; only its presence matters.
#[derive(Clone)]
struct ZeroFamily;
impl CustomFamily for ZeroFamily {
fn evaluate(
&self,
block_states: &[ParameterBlockState],
) -> Result<FamilyEvaluation, String> {
// Trivially true; consumes the otherwise unused parameter.
assert!(block_states.len() <= isize::MAX as usize);
Ok(FamilyEvaluation {
log_likelihood: 0.0,
blockworking_sets: vec![],
})
}
}
// Psi workspace that offers no second-order terms and answers the
// directional-derivative query with a block-local operator.
struct BlockLocalPsiWorkspace;
impl ExactNewtonJointPsiWorkspace for BlockLocalPsiWorkspace {
fn second_order_terms(
&self,
idx: usize,
idx2: usize,
) -> Result<Option<ExactNewtonJointPsiSecondOrderTerms>, String> {
// Both indices are only touched by these assertions.
assert!(idx < usize::MAX);
assert!(idx2 < usize::MAX);
Ok(None)
}
fn hessian_directional_derivative(
&self,
psi_index: usize,
arr: &Array1<f64>,
) -> Result<Option<DriftDerivResult>, String> {
assert!(arr.iter().all(|v| !v.is_nan()));
// The test only ever queries psi coordinate 0.
assert_eq!(psi_index, 0);
// 2x2 local matrix placed at coefficients [1, 3) of a 3-dim problem.
Ok(Some(DriftDerivResult::Operator(Arc::new(
crate::solver::estimate::reml::unified::BlockLocalDrift {
local: array![[3.0, 1.0], [1.0, 2.0]],
start: 1,
end: 3,
total_dim: 3,
},
))))
}
}
let callback = build_psi_drift_deriv_callback(
&ZeroFamily,
&[],
&[],
Arc::new(Vec::new()),
false,
Some(Arc::new(BlockLocalPsiWorkspace)),
)
.expect("non-Gaussian psi drift callback should be available");
let result = callback(0, &array![1.0, 2.0, 3.0])
.expect("workspace-backed psi drift derivative should be returned");
match result {
DriftDerivResult::Dense(_) => {
panic!("workspace-backed block-local psi drift derivative was densified")
}
DriftDerivResult::Operator(op) => {
// The block range and local matrix must match what the workspace supplied.
let (local, start, end) = op
.block_local_data()
.expect("block-local operator metadata should be preserved");
assert_eq!((start, end), (1, 3));
assert_eq!(local, &array![[3.0, 1.0], [1.0, 2.0]]);
}
}
}
/// A family that declares only first-order exact outer derivatives must be
/// reported with an analytic gradient and an unavailable Hessian.
#[test]
fn custom_family_outer_derivatives_respects_missing_second_order_capability() {
#[derive(Clone)]
struct OneBlockFirstOrderOnlyFamily;
impl CustomFamily for OneBlockFirstOrderOnlyFamily {
// Zero log-likelihood with a unit-weight, zero-response diagonal working set.
fn evaluate(
&self,
block_states: &[ParameterBlockState],
) -> Result<FamilyEvaluation, String> {
let n = block_states[0].eta.len();
Ok(FamilyEvaluation {
log_likelihood: 0.0,
blockworking_sets: vec![BlockWorkingSet::Diagonal {
working_response: Array1::zeros(n),
working_weights: Array1::ones(n),
}],
})
}
// Capability override under test: always first order.
fn exact_outer_derivative_order(
&self,
block_specs: &[ParameterBlockSpec],
options: &BlockwiseFitOptions,
) -> ExactOuterDerivativeOrder {
// Trivially true; these consume the otherwise unused parameters.
assert!(block_specs.len() <= isize::MAX as usize);
assert!(std::mem::size_of_val(options) > 0);
ExactOuterDerivativeOrder::First
}
}
// Single 1x1 block with one identity penalty.
let specs = vec![ParameterBlockSpec {
name: "x".to_string(),
design: DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(array![[1.0]])),
offset: array![0.0],
penalties: vec![PenaltyMatrix::Dense(array![[1.0]])],
nullspace_dims: vec![],
initial_log_lambdas: array![0.0],
initial_beta: None,
gauge_priority: 100,
jacobian_callback: None,
stacked_design: None,
stacked_offset: None,
}];
let (gradient, hessian) = custom_family_outer_derivatives(
&OneBlockFirstOrderOnlyFamily,
&specs,
&BlockwiseFitOptions::default(),
);
assert_eq!(
gradient,
crate::solver::outer_strategy::Derivative::Analytic
);
assert_eq!(
hessian,
crate::solver::outer_strategy::DeclaredHessianForm::Unavailable
);
}
/// Field-less test family with diagonal working sets whose weights depend on
/// the linear predictor; used to exercise the default exact-Hessian hooks.
#[derive(Clone)]
struct DefaultDiagonalExactHookFamily;
// Single-block family with log-likelihood -0.5 * eta.eta and diagonal working
// weights w_i = 2 + eta_i^2, plus the matching first and second directional
// derivatives of those weights.
impl CustomFamily for DefaultDiagonalExactHookFamily {
// Reads only block 0's linear predictor; the working response is all zeros.
fn evaluate(
&self,
block_states: &[ParameterBlockState],
) -> Result<FamilyEvaluation, String> {
let eta = block_states[0].eta.clone();
let weights = eta.mapv(|value| 2.0 + value * value);
Ok(FamilyEvaluation {
log_likelihood: -0.5 * eta.dot(&eta),
blockworking_sets: vec![BlockWorkingSet::Diagonal {
working_response: Array1::zeros(eta.len()),
working_weights: weights,
}],
})
}
// The weights depend on eta, so the joint Hessian is flagged beta-dependent.
fn exact_newton_joint_hessian_beta_dependent(&self) -> bool {
true
}
// Directional derivative of w = 2 + eta^2 along d_eta: 2 * eta * d_eta.
fn diagonalworking_weights_directional_derivative(
&self,
block_states: &[ParameterBlockState],
idx: usize,
d_eta: &Array1<f64>,
) -> Result<Option<Array1<f64>>, String> {
// Trivially true; consumes the otherwise unused block index.
assert!(idx < usize::MAX);
Ok(Some((&block_states[0].eta * d_eta) * 2.0))
}
// Second directional derivative of the Hessian: the design's signed
// weighted Gram product with weights 2 * (X u) * (X v), where X is the
// design from `default_diagonal_exact_hook_spec`.
fn exact_newton_joint_hessiansecond_directional_derivative(
&self,
block_states: &[ParameterBlockState],
u: &Array1<f64>,
v: &Array1<f64>,
) -> Result<Option<Array2<f64>>, String> {
let spec = default_diagonal_exact_hook_spec();
let u_eta = spec.design.apply(u);
let v_eta = spec.design.apply(v);
// The supplied state must have as many rows as the fixture design.
assert_eq!(block_states[0].eta.len(), u_eta.len());
spec.design
.xt_diag_x_signed_op(SignedWeightsView::from_array(&((&u_eta * &v_eta) * 2.0)))
.map(Some)
}
}
/// Fixture spec for `DefaultDiagonalExactHookFamily`: a dense 3x2 design, zero
/// offset, one 2x2 identity penalty with log-lambda 0, and initial
/// coefficients `[0.2, -0.1]`.
fn default_diagonal_exact_hook_spec() -> ParameterBlockSpec {
    let design_rows = array![[1.0, 0.5], [0.0, 1.0], [2.0, -1.0]];
    let design = DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(design_rows));
    let identity_penalty = PenaltyMatrix::Dense(Array2::eye(2));

    ParameterBlockSpec {
        name: "default_exact".to_string(),
        design,
        offset: Array1::zeros(3),
        penalties: vec![identity_penalty],
        nullspace_dims: vec![],
        initial_log_lambdas: array![0.0],
        initial_beta: Some(array![0.2, -0.1]),
        gauge_priority: 100,
        jacobian_callback: None,
        stacked_design: None,
        stacked_offset: None,
    }
}
/// The default joint-Hessian hooks must reproduce, exactly, the signed
/// weighted Gram products of the fixture design with the family's working
/// weights (H), their directional derivative (dH), and the second directional
/// derivative (d²H).
#[test]
fn default_custom_family_exact_hessian_hooks_assemble_diagonal_working_sets() {
    let family = DefaultDiagonalExactHookFamily;
    let spec = default_diagonal_exact_hook_spec();
    let beta = array![0.2, -0.1];
    let eta = spec.design.apply(&beta);
    let states = vec![ParameterBlockState {
        beta: beta.clone(),
        eta: eta.clone(),
    }];

    // H: weights w = 2 + eta².
    let h = family
        .exact_newton_joint_hessian_with_specs(&states, &[spec.clone()])
        .expect("default joint Hessian hook should succeed")
        .expect("diagonal working sets should assemble an exact joint Hessian");
    let h_weights = eta.mapv(|value| 2.0 + value * value);
    let expected_h = spec
        .design
        .xt_diag_x_signed_op(SignedWeightsView::from_array(&h_weights))
        .unwrap();
    assert_eq!(h, expected_h);

    // dH along `direction`: weights 2 · eta · (X direction).
    let direction = array![0.3, -0.4];
    let dh = family
        .exact_newton_joint_hessian_directional_derivative_with_specs(
            &states,
            &[spec.clone()],
            &direction,
        )
        .expect("default joint dH hook should succeed")
        .expect("diagonal weight derivative should assemble an exact joint dH");
    let d_eta = spec.design.apply(&direction);
    let dh_weights = (&eta * &d_eta) * 2.0;
    let expected_dh = spec
        .design
        .xt_diag_x_signed_op(SignedWeightsView::from_array(&dh_weights))
        .unwrap();
    assert_eq!(dh, expected_dh);

    // d²H along (`direction`, `beta`): weights 2 · (X direction) · (X beta).
    let d2h = family
        .exact_newton_joint_hessiansecond_directional_derivative(&states, &direction, &beta)
        .expect("family second directional hook should succeed")
        .expect("second directional hook should be exact");
    let beta_eta = spec.design.apply(&beta);
    let d2h_weights = (&d_eta * &beta_eta) * 2.0;
    let expected_d2h = spec
        .design
        .xt_diag_x_signed_op(SignedWeightsView::from_array(&d2h_weights))
        .unwrap();
    assert_eq!(d2h, expected_d2h);
}
/// Starting from zero coefficients with a single inner cycle, the joint-hyper
/// evaluation must succeed and return a finite 1x1 analytic outer Hessian.
#[test]
fn default_custom_family_exact_hessian_hooks_drive_profiled_outer_hessian() {
    let mut spec = default_diagonal_exact_hook_spec();
    spec.initial_beta = Some(Array1::zeros(2));

    let options = BlockwiseFitOptions {
        use_remlobjective: true,
        use_outer_hessian: true,
        compute_covariance: false,
        inner_max_cycles: 1,
        ..BlockwiseFitOptions::default()
    };
    let rho = array![0.0];

    let result = evaluate_custom_family_joint_hyper(
        &DefaultDiagonalExactHookFamily,
        &[spec],
        &options,
        &rho,
        &[vec![]],
        None,
        EvalMode::ValueGradientHessian,
    )
    .expect("profiled outer Hessian should use default exact Hessian hooks");

    assert_eq!(result.gradient.len(), 1);
    let crate::solver::outer_strategy::HessianResult::Analytic(hessian) = result.outer_hessian
    else {
        panic!("outer Hessian should be analytic")
    };
    assert_eq!(hessian.dim(), (1, 1));
    assert!(hessian[[0, 0]].is_finite());
}
/// With the fixture's default initial coefficients and a single inner cycle,
/// the joint-hyper evaluation must fail with an error stating that the inner
/// solve did not converge and that derivatives are being withheld.
#[test]
fn nonconverged_inner_refuses_profile_derivatives() {
    let spec = default_diagonal_exact_hook_spec();
    let options = BlockwiseFitOptions {
        use_remlobjective: true,
        use_outer_hessian: true,
        compute_covariance: false,
        inner_max_cycles: 1,
        ..BlockwiseFitOptions::default()
    };
    let rho = array![0.0];

    let result = evaluate_custom_family_joint_hyper(
        &DefaultDiagonalExactHookFamily,
        &[spec],
        &options,
        &rho,
        &[vec![]],
        None,
        EvalMode::ValueGradientHessian,
    );

    let Err(err) = result else {
        panic!("non-converged inner solve must not expose derivatives")
    };
    let msg = err.to_string();
    let names_nonconvergence = msg.contains("inner solve did not converge");
    let names_refusal = msg.contains("refusing to expose");
    assert!(
        names_nonconvergence && names_refusal,
        "unexpected error: {msg}"
    );
}
/// The seed-screening proxy must return a finite score and a usable warm
/// start even when the single permitted inner cycle does not converge.
#[test]
fn custom_family_seed_screening_proxy_accepts_finite_partial_inner_fit() {
    let specs = vec![default_diagonal_exact_hook_spec()];
    let penalty_counts = validate_blockspecs(&specs).expect("valid test spec");
    let layout = penalty_label_layout(&specs, penalty_counts).expect("valid label layout");

    // A single inner cycle: deliberately too few to converge.
    let options = BlockwiseFitOptions {
        use_remlobjective: true,
        use_outer_hessian: true,
        compute_covariance: false,
        inner_max_cycles: 1,
        ..BlockwiseFitOptions::default()
    };
    let rho = array![0.0];

    let screened = custom_family_seed_screening_proxy_labeled(
        &DefaultDiagonalExactHookFamily,
        &specs,
        &options,
        &layout,
        &rho,
        None,
        &crate::types::RhoPrior::Flat,
    )
    .expect("screening proxy should score a finite partial inner solve");
    let (score, warm_start, inner_converged) = screened;

    assert!(score.is_finite());
    assert!(
        !inner_converged,
        "one-cycle screening is expected to be a partial inner fit"
    );
    assert_eq!(warm_start.rho, array![0.0]);
    assert_eq!(warm_start.block_beta.len(), 1);
}
/// A family that keeps the default outer objective must be declared with an
/// analytic gradient and a Hessian form of `Either`.
///
/// Background: RidgedQuadraticReml is the default objective; its analytic
/// outer Hessian is routed to ARC, which handles indefinite Hessians via cubic
/// regularization. The previous behavior forced these families onto
/// BFGS+BfgsApprox and caused benchmark hangs at iter 0.
#[test]
fn custom_family_outer_derivatives_exposes_surrogate_second_order_geometry() {
    use crate::solver::outer_strategy::{DeclaredHessianForm, Derivative};

    #[derive(Clone)]
    struct SurrogateFamily;
    impl CustomFamily for SurrogateFamily {
        // Zero log-likelihood with a unit-weight, zero-response working set.
        fn evaluate(
            &self,
            block_states: &[ParameterBlockState],
        ) -> Result<FamilyEvaluation, String> {
            let rows = block_states[0].eta.len();
            let working_set = BlockWorkingSet::Diagonal {
                working_response: Array1::zeros(rows),
                working_weights: Array1::ones(rows),
            };
            Ok(FamilyEvaluation {
                log_likelihood: 0.0,
                blockworking_sets: vec![working_set],
            })
        }
    }

    // Single 1x1 block with one identity penalty.
    let x_spec = ParameterBlockSpec {
        name: "x".to_string(),
        design: DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(array![[1.0]])),
        offset: array![0.0],
        penalties: vec![PenaltyMatrix::Dense(array![[1.0]])],
        nullspace_dims: vec![],
        initial_log_lambdas: array![0.0],
        initial_beta: None,
        gauge_priority: 100,
        jacobian_callback: None,
        stacked_design: None,
        stacked_offset: None,
    };
    let specs = vec![x_spec];
    let options = BlockwiseFitOptions {
        use_remlobjective: true,
        use_outer_hessian: true,
        ..BlockwiseFitOptions::default()
    };

    let (gradient_kind, hessian_form) =
        custom_family_outer_derivatives(&SurrogateFamily, &specs, &options);
    assert_eq!(gradient_kind, Derivative::Analytic);
    assert_eq!(hessian_form, DeclaredHessianForm::Either);
}
/// A family that opts into `ExactNewtonOuterObjective::StrictPseudoLaplace`
/// must be declared with an analytic gradient and a Hessian form of `Either`.
#[test]
fn custom_family_outer_derivatives_keeps_strict_second_order_geometry() {
    use crate::solver::outer_strategy::{DeclaredHessianForm, Derivative};

    #[derive(Clone)]
    struct StrictFamily;
    impl CustomFamily for StrictFamily {
        // Zero log-likelihood with a unit-weight, zero-response working set.
        fn evaluate(
            &self,
            block_states: &[ParameterBlockState],
        ) -> Result<FamilyEvaluation, String> {
            let rows = block_states[0].eta.len();
            let working_set = BlockWorkingSet::Diagonal {
                working_response: Array1::zeros(rows),
                working_weights: Array1::ones(rows),
            };
            Ok(FamilyEvaluation {
                log_likelihood: 0.0,
                blockworking_sets: vec![working_set],
            })
        }
        // The override that distinguishes this family from the default.
        fn exact_newton_outerobjective(&self) -> ExactNewtonOuterObjective {
            ExactNewtonOuterObjective::StrictPseudoLaplace
        }
    }

    // Single 1x1 block with one identity penalty.
    let x_spec = ParameterBlockSpec {
        name: "x".to_string(),
        design: DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(array![[1.0]])),
        offset: array![0.0],
        penalties: vec![PenaltyMatrix::Dense(array![[1.0]])],
        nullspace_dims: vec![],
        initial_log_lambdas: array![0.0],
        initial_beta: None,
        gauge_priority: 100,
        jacobian_callback: None,
        stacked_design: None,
        stacked_offset: None,
    };
    let specs = vec![x_spec];
    let options = BlockwiseFitOptions {
        use_remlobjective: true,
        use_outer_hessian: true,
        ..BlockwiseFitOptions::default()
    };

    let (gradient_kind, hessian_form) =
        custom_family_outer_derivatives(&StrictFamily, &specs, &options);
    assert_eq!(gradient_kind, Derivative::Analytic);
    assert_eq!(hessian_form, DeclaredHessianForm::Either);
}
/// Scalar single-block test family with a quartic log-likelihood in beta and
/// exact-Newton working sets.
#[derive(Clone)]
struct OneBlockQuarticExactFamily {
// Coefficient of the linear term in the log-likelihood.
linear: f64,
// Strength of the quartic term; also scales the beta-dependence of the Hessian.
curvature: f64,
// Multiplier applied to the reported second directional derivative of the Hessian.
second_scale: f64,
}
// Scalar model with
//   l(beta) = linear*beta - beta^2/2 - curvature*beta^4/12,
// reported gradient  linear - beta - curvature*beta^3/3  and reported Hessian
//   h(beta) = 1 + curvature*beta^2.
impl CustomFamily for OneBlockQuarticExactFamily {
fn exact_newton_joint_hessian_beta_dependent(&self) -> bool {
// h(β) = 1 + curvature·β² genuinely depends on β; the default
// (false for RidgedQuadraticReml) would short-circuit the joint
// d²H aggregator to zeros and drop the per-block override below
// before it ever reaches the outer Hessian's drift contribution.
true
}
// Evaluates the log-likelihood, gradient and Hessian at block 0's scalar beta.
fn evaluate(
&self,
block_states: &[ParameterBlockState],
) -> Result<FamilyEvaluation, String> {
let beta = block_states[0].beta[0];
let log_likelihood =
self.linear * beta - 0.5 * beta * beta - self.curvature * beta.powi(4) / 12.0;
let gradient = self.linear - beta - self.curvature * beta.powi(3) / 3.0;
let hessian = 1.0 + self.curvature * beta * beta;
Ok(FamilyEvaluation {
log_likelihood,
blockworking_sets: vec![BlockWorkingSet::ExactNewton {
gradient: array![gradient],
hessian: SymmetricMatrix::Dense(array![[hessian]]),
}],
})
}
// dh/dbeta applied to `direction`: 2 * curvature * beta * direction.
fn exact_newton_hessian_directional_derivative(
&self,
block_states: &[ParameterBlockState],
block_idx: usize,
direction: &Array1<f64>,
) -> Result<Option<Array2<f64>>, String> {
assert_eq!(block_idx, 0);
let beta = block_states[0].beta[0];
Ok(Some(array![[2.0 * self.curvature * beta * direction[0]]]))
}
// d2h/dbeta2 applied to (u, v): 2 * curvature * u * v, multiplied by
// `second_scale` (1.0 leaves the exact value unchanged).
fn exact_newton_hessian_second_directional_derivative(
&self,
block_states: &[ParameterBlockState],
block_idx: usize,
u: &Array1<f64>,
v: &Array1<f64>,
) -> Result<Option<Array2<f64>>, String> {
// Trivially true; consumes the otherwise unused parameter.
assert!(block_states.len() <= isize::MAX as usize);
assert_eq!(block_idx, 0);
let value = 2.0 * self.curvature * self.second_scale * u[0] * v[0];
Ok(Some(array![[value]]))
}
}
#[test]
fn generic_single_block_fallback_includes_nonzero_d2h_drift() {
    // Two copies of the quartic family that differ only in `second_scale`:
    // 1.0 keeps the second directional derivative of the Hessian, 0.0 zeroes it.
    let family_with_d2 = OneBlockQuarticExactFamily {
        linear: 3.0,
        curvature: 0.5,
        second_scale: 1.0,
    };
    let family_without_d2 = OneBlockQuarticExactFamily {
        second_scale: 0.0,
        ..family_with_d2.clone()
    };
    let block = ParameterBlockSpec {
        name: "quartic".to_string(),
        design: DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(array![[1.0]])),
        offset: array![0.0],
        penalties: vec![PenaltyMatrix::Dense(array![[1.0]])],
        nullspace_dims: vec![],
        initial_log_lambdas: array![0.0],
        initial_beta: Some(array![0.75]),
        gauge_priority: 100,
        jacobian_callback: None,
        stacked_design: None,
        stacked_offset: None,
    };
    let options = BlockwiseFitOptions {
        use_remlobjective: true,
        use_outer_hessian: true,
        compute_covariance: false,
        inner_tol: 1e-11,
        ..BlockwiseFitOptions::default()
    };
    let rho = array![0.0];
    let penalty_counts = vec![1];

    let eval_with = evaluate_custom_family_hyper_internal(
        &family_with_d2,
        std::slice::from_ref(&block),
        &options,
        &penalty_counts,
        &rho,
        &[vec![]],
        None,
        crate::types::RhoPrior::Flat,
        EvalMode::ValueGradientHessian,
    )
    .expect("single-block fallback with exact d2H should evaluate");
    let eval_without = evaluate_custom_family_hyper_internal(
        &family_without_d2,
        std::slice::from_ref(&block),
        &options,
        &penalty_counts,
        &rho,
        &[vec![]],
        None,
        crate::types::RhoPrior::Flat,
        EvalMode::ValueGradientHessian,
    )
    .expect("single-block fallback with zero d2H should evaluate");

    // The two outer Hessians must differ once the d²H term is switched on.
    let h_with = eval_with.outer_hessian.unwrap_analytic();
    let h_without = eval_without.outer_hessian.unwrap_analytic();
    let gap = h_with[[0, 0]] - h_without[[0, 0]];
    assert!(
        gap.abs() > 1e-8,
        "expected nonzero outer Hessian contribution from d2H; with={:?}, without={:?}",
        h_with,
        h_without
    );
}
#[test]
fn custom_family_outer_derivatives_keeps_second_order_for_large_inner_problem() {
    // The size of the inner problem (n, p) must not switch off the analytic
    // outer Hessian. `compute_outer_hessian` assembles a matrix of shape
    // (K+ext_dim)×(K+ext_dim), with K the total number of penalties, so a
    // large inner problem with modest K (typical: n=50000, p=50, K=2) still
    // has a tiny outer Hessian, and ARC needs it to drive the outer
    // iteration. Earlier versions of this test enforced an inner-size cutoff
    // that disabled the Hessian for exactly the benchmark sizes
    // (medium: n=50000,p=50; pathological: n=50000,p=80) that were hanging
    // 45-minute GH jobs on BFGS+BfgsApprox Strong Wolfe failures at iter 0.
    #[derive(Clone)]
    struct StrictFamily;

    impl CustomFamily for StrictFamily {
        fn evaluate(
            &self,
            block_states: &[ParameterBlockState],
        ) -> Result<FamilyEvaluation, String> {
            let rows = block_states[0].eta.len();
            let working_set = BlockWorkingSet::Diagonal {
                working_response: Array1::zeros(rows),
                working_weights: Array1::ones(rows),
            };
            Ok(FamilyEvaluation {
                log_likelihood: 0.0,
                blockworking_sets: vec![working_set],
            })
        }

        fn exact_newton_outerobjective(&self) -> ExactNewtonOuterObjective {
            ExactNewtonOuterObjective::StrictPseudoLaplace
        }
    }

    let (n_rows, n_cols) = (20_100usize, 50usize);
    let large_block = ParameterBlockSpec {
        name: "x".to_string(),
        design: DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(
            Array2::<f64>::zeros((n_rows, n_cols)),
        )),
        offset: Array1::zeros(n_rows),
        penalties: vec![PenaltyMatrix::Dense(Array2::<f64>::eye(n_cols))],
        nullspace_dims: vec![],
        initial_log_lambdas: array![0.0],
        initial_beta: None,
        gauge_priority: 100,
        jacobian_callback: None,
        stacked_design: None,
        stacked_offset: None,
    };
    let specs = vec![large_block];
    let options = BlockwiseFitOptions {
        use_outer_hessian: true,
        use_remlobjective: true,
        ..BlockwiseFitOptions::default()
    };

    let (gradient, hessian) = custom_family_outer_derivatives(&StrictFamily, &specs, &options);
    assert_eq!(
        gradient,
        crate::solver::outer_strategy::Derivative::Analytic
    );
    assert_eq!(
        hessian,
        crate::solver::outer_strategy::DeclaredHessianForm::Either
    );
}
impl CustomFamily for OneBlockIdentityFamily {
    /// Reports a zero log-likelihood and a single diagonal working set whose
    /// response and weights are all ones, sized to block 0's `eta`.
    fn evaluate(
        &self,
        block_states: &[ParameterBlockState],
    ) -> Result<FamilyEvaluation, String> {
        let rows = block_states[0].eta.len();
        let unit_working_set = BlockWorkingSet::Diagonal {
            working_response: Array1::ones(rows),
            working_weights: Array1::ones(rows),
        };
        Ok(FamilyEvaluation {
            log_likelihood: 0.0,
            blockworking_sets: vec![unit_working_set],
        })
    }
}
#[test]
fn fit_custom_family_rejects_invalid_blockspec_before_output_channel_probe() {
    // The design has one column, but the penalty supplied for it is 2x2.
    let single_column_design = array![[1.0], [2.0]];
    let mismatched_block = ParameterBlockSpec {
        name: "bad_penalty".to_string(),
        design: DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(
            single_column_design,
        )),
        offset: Array1::zeros(2),
        penalties: vec![PenaltyMatrix::Dense(Array2::<f64>::eye(2))],
        nullspace_dims: vec![0],
        initial_log_lambdas: array![0.0],
        initial_beta: Some(array![0.0]),
        gauge_priority: 100,
        jacobian_callback: None,
        stacked_design: None,
        stacked_offset: None,
    };
    let default_options = BlockwiseFitOptions::default();

    let message = fit_custom_family(
        &OneBlockIdentityFamily,
        &[mismatched_block],
        &default_options,
    )
    .expect_err("invalid block spec should return a typed error")
    .to_string();

    assert!(
        message.contains("block 0 penalty 0 must be 1x1, got 2x2"),
        "unexpected error: {message}",
    );
}
/// Test family for a single block scored against a fixed response vector `y`
/// with unit working weights.
#[derive(Clone)]
struct OneBlockGaussianFamily {
    y: Array1<f64>,
}

impl CustomFamily for OneBlockGaussianFamily {
    /// Log-likelihood is `-½‖eta − y‖²`; the diagonal working set carries `y`
    /// as the working response and ones as the working weights.
    fn evaluate(
        &self,
        block_states: &[ParameterBlockState],
    ) -> Result<FamilyEvaluation, String> {
        let residual = &block_states[0].eta - &self.y;
        let working_set = BlockWorkingSet::Diagonal {
            working_response: self.y.clone(),
            working_weights: Array1::ones(self.y.len()),
        };
        Ok(FamilyEvaluation {
            log_likelihood: -0.5 * residual.dot(&residual),
            blockworking_sets: vec![working_set],
        })
    }

    /// The working weights are constant, so their directional derivative is a
    /// zero vector with the length of `d_eta`.
    fn diagonalworking_weights_directional_derivative(
        &self,
        block_states: &[ParameterBlockState],
        idx: usize,
        d_eta: &Array1<f64>,
    ) -> Result<Option<Array1<f64>>, String> {
        assert!(block_states.len() <= isize::MAX as usize);
        assert!(idx < usize::MAX);
        let zero_derivative = Array1::zeros(d_eta.len());
        Ok(Some(zero_derivative))
    }

    /// Second directional derivative of the constant weights: zeros with the
    /// length of `d_eta_u`. Panics if `arr` contains a NaN.
    fn diagonalworking_weights_second_directional_derivative(
        &self,
        block_states: &[ParameterBlockState],
        idx: usize,
        d_eta_u: &Array1<f64>,
        arr: &Array1<f64>,
    ) -> Result<Option<Array1<f64>>, String> {
        assert!(block_states.len() <= isize::MAX as usize);
        assert!(idx < usize::MAX);
        assert!(arr.iter().all(|v| !v.is_nan()));
        let zero_derivative = Array1::zeros(d_eta_u.len());
        Ok(Some(zero_derivative))
    }
}
/// One-coefficient exact-Newton test family with log-likelihood
/// `-½(β − target)²` and a single linear constraint row `a = [[1.0]]`,
/// `b = [lower]` on block 0.
#[derive(Clone)]
struct OneBlockConstrainedExactFamily {
    target: f64,
    lower: f64,
}

impl CustomFamily for OneBlockConstrainedExactFamily {
    /// Evaluates the quadratic log-likelihood at the first coefficient of the
    /// first block; returns an error string if either is missing.
    fn evaluate(
        &self,
        block_states: &[ParameterBlockState],
    ) -> Result<FamilyEvaluation, String> {
        let Some(state) = block_states.first() else {
            return Err("missing block 0".to_string());
        };
        let Some(&beta) = state.beta.first() else {
            return Err("missing coefficient".to_string());
        };
        let working_set = BlockWorkingSet::ExactNewton {
            gradient: array![self.target - beta],
            hessian: SymmetricMatrix::Dense(array![[1.0]]),
        };
        Ok(FamilyEvaluation {
            log_likelihood: -0.5 * (beta - self.target) * (beta - self.target),
            blockworking_sets: vec![working_set],
        })
    }

    /// Supplies the constraint for block 0 only; any other block gets `None`.
    fn block_linear_constraints(
        &self,
        block_states: &[ParameterBlockState],
        block_idx: usize,
        block_spec: &ParameterBlockSpec,
    ) -> Result<Option<LinearInequalityConstraints>, String> {
        assert!(block_states.len() <= isize::MAX as usize);
        assert!(!block_spec.name.is_empty());
        Ok((block_idx == 0).then(|| LinearInequalityConstraints {
            a: array![[1.0]],
            b: array![self.lower],
        }))
    }
}
/// Constrained one-coefficient test family whose exact-Newton Hessian is NaN.
#[derive(Clone)]
struct OneBlockConstrainedNaNHessianFamily;

impl CustomFamily for OneBlockConstrainedNaNHessianFamily {
    /// Always returns a zero log-likelihood, zero gradient and a 1×1 NaN Hessian.
    fn evaluate(
        &self,
        block_states: &[ParameterBlockState],
    ) -> Result<FamilyEvaluation, String> {
        assert!(block_states.len() <= isize::MAX as usize);
        let nan_working_set = BlockWorkingSet::ExactNewton {
            gradient: array![0.0],
            hessian: SymmetricMatrix::Dense(array![[f64::NAN]]),
        };
        Ok(FamilyEvaluation {
            log_likelihood: 0.0,
            blockworking_sets: vec![nan_working_set],
        })
    }

    /// Block 0 gets the constraint row `a = [[1.0]]`, `b = [0.0]`; other blocks get `None`.
    fn block_linear_constraints(
        &self,
        block_states: &[ParameterBlockState],
        block_idx: usize,
        block_spec: &ParameterBlockSpec,
    ) -> Result<Option<LinearInequalityConstraints>, String> {
        assert!(block_states.len() <= isize::MAX as usize);
        assert!(!block_spec.name.is_empty());
        Ok((block_idx == 0).then(|| LinearInequalityConstraints {
            a: array![[1.0]],
            b: array![0.0],
        }))
    }
}
/// Constrained one-coefficient test family whose exact-Newton Hessian is the
/// negative scalar `-1.0`.
#[derive(Clone)]
struct OneBlockConstrainedIndefiniteHessianFamily;

impl CustomFamily for OneBlockConstrainedIndefiniteHessianFamily {
    /// Always returns a zero log-likelihood with gradient `-1.0` and Hessian `-1.0`.
    fn evaluate(
        &self,
        block_states: &[ParameterBlockState],
    ) -> Result<FamilyEvaluation, String> {
        assert!(block_states.len() <= isize::MAX as usize);
        let indefinite_working_set = BlockWorkingSet::ExactNewton {
            gradient: array![-1.0],
            hessian: SymmetricMatrix::Dense(array![[-1.0]]),
        };
        Ok(FamilyEvaluation {
            log_likelihood: 0.0,
            blockworking_sets: vec![indefinite_working_set],
        })
    }

    /// Block 0 gets the constraint row `a = [[1.0]]`, `b = [1.0]`; other blocks get `None`.
    fn block_linear_constraints(
        &self,
        block_states: &[ParameterBlockState],
        block_idx: usize,
        block_spec: &ParameterBlockSpec,
    ) -> Result<Option<LinearInequalityConstraints>, String> {
        assert!(block_states.len() <= isize::MAX as usize);
        assert!(!block_spec.name.is_empty());
        Ok((block_idx == 0).then(|| LinearInequalityConstraints {
            a: array![[1.0]],
            b: array![1.0],
        }))
    }
}
/// One-coefficient exact-Newton test family whose log-likelihood is linear in
/// the coefficient (`score · β`), giving a constant gradient and a zero Hessian.
#[derive(Clone)]
struct OneBlockLinearLikelihoodExactFamily {
    score: f64,
}

impl CustomFamily for OneBlockLinearLikelihoodExactFamily {
    /// Evaluates at the first coefficient of the first block; returns an error
    /// string if either is missing.
    fn evaluate(
        &self,
        block_states: &[ParameterBlockState],
    ) -> Result<FamilyEvaluation, String> {
        let Some(state) = block_states.first() else {
            return Err("missing block 0".to_string());
        };
        let Some(&beta) = state.beta.first() else {
            return Err("missing coefficient".to_string());
        };
        let flat_working_set = BlockWorkingSet::ExactNewton {
            gradient: array![self.score],
            hessian: SymmetricMatrix::Dense(array![[0.0]]),
        };
        Ok(FamilyEvaluation {
            log_likelihood: self.score * beta,
            blockworking_sets: vec![flat_working_set],
        })
    }
}
/// Test family that provides joint exact-Newton hooks and makes the per-block
/// Hessian directional derivative fail, so any use of the blockwise hook
/// surfaces as an error.
#[derive(Clone)]
struct PreferJointExactFamily;

impl CustomFamily for PreferJointExactFamily {
    /// Constant evaluation: zero log-likelihood, zero gradient, Hessian `2.0`.
    fn evaluate(
        &self,
        block_states: &[ParameterBlockState],
    ) -> Result<FamilyEvaluation, String> {
        assert!(block_states.len() <= isize::MAX as usize);
        let working_set = BlockWorkingSet::ExactNewton {
            gradient: array![0.0],
            hessian: SymmetricMatrix::Dense(array![[2.0]]),
        };
        Ok(FamilyEvaluation {
            log_likelihood: 0.0,
            blockworking_sets: vec![working_set],
        })
    }

    /// Blockwise hook: always returns an error (after the NaN check on `arr`).
    fn exact_newton_hessian_directional_derivative(
        &self,
        block_states: &[ParameterBlockState],
        idx: usize,
        arr: &Array1<f64>,
    ) -> Result<Option<Array2<f64>>, String> {
        assert!(block_states.len() <= isize::MAX as usize);
        assert!(idx < usize::MAX);
        assert!(arr.iter().all(|v| !v.is_nan()));
        Err(String::from(
            "blockwise exact-newton path should not be used when joint path is available",
        ))
    }

    /// Joint Hessian: the constant 1×1 matrix `[[2.0]]`.
    fn exact_newton_joint_hessian(
        &self,
        block_states: &[ParameterBlockState],
    ) -> Result<Option<Array2<f64>>, String> {
        assert!(block_states.len() <= isize::MAX as usize);
        let joint = array![[2.0]];
        Ok(Some(joint))
    }

    /// Joint Hessian directional derivative: zero, since the Hessian is constant.
    fn exact_newton_joint_hessian_directional_derivative(
        &self,
        block_states: &[ParameterBlockState],
        arr: &Array1<f64>,
    ) -> Result<Option<Array2<f64>>, String> {
        assert!(block_states.len() <= isize::MAX as usize);
        assert!(arr.iter().all(|v| !v.is_nan()));
        let zero = array![[0.0]];
        Ok(Some(zero))
    }
}
/// Two single-coefficient blocks joined by a quadratic log-likelihood with an
/// off-diagonal `coupling` term; each of the two blocks carries the constraint
/// row `a = [[1.0]]`, `b = [0.0]`.
#[derive(Clone)]
struct TwoBlockJointConstrainedFamily {
    coupling: f64,
}

impl CustomFamily for TwoBlockJointConstrainedFamily {
    /// Log-likelihood `-½(β0² + β1² + 2·coupling·β0·β1) + β0 + β1`, with the
    /// matching per-block gradients and unit per-block Hessians.
    fn evaluate(
        &self,
        block_states: &[ParameterBlockState],
    ) -> Result<FamilyEvaluation, String> {
        let (beta0, beta1) = (block_states[0].beta[0], block_states[1].beta[0]);
        let unit_curvature_set = |gradient: f64| BlockWorkingSet::ExactNewton {
            gradient: array![gradient],
            hessian: SymmetricMatrix::Dense(array![[1.0]]),
        };
        let quadratic = beta0 * beta0 + beta1 * beta1 + 2.0 * self.coupling * beta0 * beta1;
        Ok(FamilyEvaluation {
            log_likelihood: -0.5 * quadratic + beta0 + beta1,
            blockworking_sets: vec![
                unit_curvature_set(1.0 - beta0 - self.coupling * beta1),
                unit_curvature_set(1.0 - beta1 - self.coupling * beta0),
            ],
        })
    }

    /// Joint Hessian: ones on the diagonal, `coupling` off the diagonal.
    fn exact_newton_joint_hessian(
        &self,
        block_states: &[ParameterBlockState],
    ) -> Result<Option<Array2<f64>>, String> {
        assert!(block_states.len() <= isize::MAX as usize);
        let c = self.coupling;
        Ok(Some(array![[1.0, c], [c, 1.0]]))
    }

    /// The joint Hessian is constant, so its directional derivative is 2×2 zeros.
    fn exact_newton_joint_hessian_directional_derivative(
        &self,
        block_states: &[ParameterBlockState],
        arr: &Array1<f64>,
    ) -> Result<Option<Array2<f64>>, String> {
        assert!(block_states.len() <= isize::MAX as usize);
        assert!(arr.iter().all(|v| !v.is_nan()));
        let zero = Array2::zeros((2, 2));
        Ok(Some(zero))
    }

    /// Blocks 0 and 1 get the constraint; any higher index gets `None`.
    fn block_linear_constraints(
        &self,
        block_states: &[ParameterBlockState],
        block_idx: usize,
        block_spec: &ParameterBlockSpec,
    ) -> Result<Option<LinearInequalityConstraints>, String> {
        assert!(block_states.len() <= isize::MAX as usize);
        assert!(!block_spec.name.is_empty());
        Ok((block_idx < 2).then(|| LinearInequalityConstraints {
            a: array![[1.0]],
            b: array![0.0],
        }))
    }
}
/// Two-block exact-Newton test family whose per-block gradient is always
/// `1.0`, whatever the coefficients are, paired with an explicit coupled
/// joint Hessian.
#[derive(Clone)]
struct TwoBlockPersistentGradientFamily;

impl CustomFamily for TwoBlockPersistentGradientFamily {
    /// Log-likelihood `β0 + β1`; both blocks report gradient `1.0` and Hessian `1.0`.
    fn evaluate(
        &self,
        block_states: &[ParameterBlockState],
    ) -> Result<FamilyEvaluation, String> {
        let log_likelihood = block_states[0].beta[0] + block_states[1].beta[0];
        let blockworking_sets = (0..2)
            .map(|_| BlockWorkingSet::ExactNewton {
                gradient: array![1.0],
                hessian: SymmetricMatrix::Dense(array![[1.0]]),
            })
            .collect();
        Ok(FamilyEvaluation {
            log_likelihood,
            blockworking_sets,
        })
    }

    /// Constant joint Hessian with `0.25` off the diagonal.
    fn exact_newton_joint_hessian(
        &self,
        block_states: &[ParameterBlockState],
    ) -> Result<Option<Array2<f64>>, String> {
        assert!(block_states.len() <= isize::MAX as usize);
        let joint = array![[1.0, 0.25], [0.25, 1.0]];
        Ok(Some(joint))
    }

    /// The joint Hessian is constant, so its directional derivative is 2×2 zeros.
    fn exact_newton_joint_hessian_directional_derivative(
        &self,
        block_states: &[ParameterBlockState],
        arr: &Array1<f64>,
    ) -> Result<Option<Array2<f64>>, String> {
        assert!(block_states.len() <= isize::MAX as usize);
        assert!(arr.iter().all(|v| !v.is_nan()));
        let zero = Array2::zeros((2, 2));
        Ok(Some(zero))
    }

    /// Declares that this family supplies its own joint Hessian.
    fn has_explicit_joint_hessian(&self) -> bool {
        true
    }
}
/// Two-block test family with diagonal working sets and spec-aware joint
/// Hessian hooks: identity Hessian, zero first and second directional
/// derivatives.
#[derive(Clone)]
struct TwoBlockJointSurrogateFamily;

impl CustomFamily for TwoBlockJointSurrogateFamily {
    /// Zero log-likelihood; each block gets a zero working response and unit
    /// weights sized to its own `eta`. Errors if either block is missing.
    fn evaluate(
        &self,
        block_states: &[ParameterBlockState],
    ) -> Result<FamilyEvaluation, String> {
        let Some(first) = block_states.first() else {
            return Err("missing block 0".to_string());
        };
        let n0 = first.eta.len();
        let Some(second) = block_states.get(1) else {
            return Err("missing block 1".to_string());
        };
        let n1 = second.eta.len();
        let unit_weight_set = |rows: usize| BlockWorkingSet::Diagonal {
            working_response: Array1::zeros(rows),
            working_weights: Array1::ones(rows),
        };
        Ok(FamilyEvaluation {
            log_likelihood: 0.0,
            blockworking_sets: vec![unit_weight_set(n0), unit_weight_set(n1)],
        })
    }

    /// Identity matrix whose size is the total column count over all block designs.
    fn exact_newton_joint_hessian_with_specs(
        &self,
        block_states: &[ParameterBlockState],
        specs: &[ParameterBlockSpec],
    ) -> Result<Option<Array2<f64>>, String> {
        assert!(block_states.len() <= isize::MAX as usize);
        let total_cols: usize = specs.iter().map(|block| block.design.ncols()).sum();
        Ok(Some(Array2::eye(total_cols)))
    }

    /// Zero matrix of the joint size; panics if `arr` contains a NaN.
    fn exact_newton_joint_hessian_directional_derivative_with_specs(
        &self,
        block_states: &[ParameterBlockState],
        specs: &[ParameterBlockSpec],
        arr: &Array1<f64>,
    ) -> Result<Option<Array2<f64>>, String> {
        assert!(block_states.len() <= isize::MAX as usize);
        assert!(arr.iter().all(|v| !v.is_nan()));
        let total_cols: usize = specs.iter().map(|block| block.design.ncols()).sum();
        Ok(Some(Array2::zeros((total_cols, total_cols))))
    }

    /// Zero matrix of the joint size; panics if `arr` or `arr2` contains a NaN.
    fn exact_newton_joint_hessian_second_directional_derivative_with_specs(
        &self,
        block_states: &[ParameterBlockState],
        specs: &[ParameterBlockSpec],
        arr: &Array1<f64>,
        arr2: &Array1<f64>,
    ) -> Result<Option<Array2<f64>>, String> {
        assert!(block_states.len() <= isize::MAX as usize);
        assert!(arr.iter().all(|v| !v.is_nan()));
        assert!(arr2.iter().all(|v| !v.is_nan()));
        let total_cols: usize = specs.iter().map(|block| block.design.ncols()).sum();
        Ok(Some(Array2::zeros((total_cols, total_cols))))
    }
}
/// One-coefficient exact-Newton test family with log-likelihood
/// `-(β − target)²` that selects the `StrictPseudoLaplace` outer objective.
#[derive(Clone)]
struct OneBlockPseudoLaplaceExactFamily {
    target: f64,
}

impl CustomFamily for OneBlockPseudoLaplaceExactFamily {
    /// Evaluates at the first coefficient of the first block; returns an error
    /// string if either is missing.
    fn evaluate(
        &self,
        block_states: &[ParameterBlockState],
    ) -> Result<FamilyEvaluation, String> {
        let Some(state) = block_states.first() else {
            return Err("missing block 0".to_string());
        };
        let Some(&beta) = state.beta.first() else {
            return Err("missing coefficient".to_string());
        };
        let resid = beta - self.target;
        let working_set = BlockWorkingSet::ExactNewton {
            gradient: array![-2.0 * resid],
            hessian: SymmetricMatrix::Dense(array![[2.0]]),
        };
        Ok(FamilyEvaluation {
            log_likelihood: -resid * resid,
            blockworking_sets: vec![working_set],
        })
    }

    fn exact_newton_outerobjective(&self) -> ExactNewtonOuterObjective {
        ExactNewtonOuterObjective::StrictPseudoLaplace
    }

    /// Joint Hessian: the constant 1×1 matrix `[[2.0]]`.
    fn exact_newton_joint_hessian(
        &self,
        block_states: &[ParameterBlockState],
    ) -> Result<Option<Array2<f64>>, String> {
        assert!(block_states.len() <= isize::MAX as usize);
        let joint = array![[2.0]];
        Ok(Some(joint))
    }

    /// Per-block Hessian directional derivative: zero (constant Hessian).
    fn exact_newton_hessian_directional_derivative(
        &self,
        block_states: &[ParameterBlockState],
        idx: usize,
        arr: &Array1<f64>,
    ) -> Result<Option<Array2<f64>>, String> {
        assert!(block_states.len() <= isize::MAX as usize);
        assert!(idx < usize::MAX);
        assert!(arr.iter().all(|v| !v.is_nan()));
        let zero = array![[0.0]];
        Ok(Some(zero))
    }

    /// Joint Hessian directional derivative: zero (constant Hessian).
    fn exact_newton_joint_hessian_directional_derivative(
        &self,
        block_states: &[ParameterBlockState],
        arr: &Array1<f64>,
    ) -> Result<Option<Array2<f64>>, String> {
        assert!(block_states.len() <= isize::MAX as usize);
        assert!(arr.iter().all(|v| !v.is_nan()));
        let zero = array![[0.0]];
        Ok(Some(zero))
    }
}
/// One-coefficient `StrictPseudoLaplace` test family that also implements the
/// joint psi-terms hook, returning a fixed `objective_psi` of `3.5`.
#[derive(Clone)]
struct OneBlockExactPsiHookFamily;

impl CustomFamily for OneBlockExactPsiHookFamily {
    /// Constant evaluation: zero log-likelihood, zero gradient, unit Hessian.
    fn evaluate(
        &self,
        block_states: &[ParameterBlockState],
    ) -> Result<FamilyEvaluation, String> {
        assert!(block_states.len() <= isize::MAX as usize);
        let working_set = BlockWorkingSet::ExactNewton {
            gradient: array![0.0],
            hessian: SymmetricMatrix::Dense(array![[1.0]]),
        };
        Ok(FamilyEvaluation {
            log_likelihood: 0.0,
            blockworking_sets: vec![working_set],
        })
    }

    fn exact_newton_outerobjective(&self) -> ExactNewtonOuterObjective {
        ExactNewtonOuterObjective::StrictPseudoLaplace
    }

    /// Joint Hessian: the constant 1×1 matrix `[[1.0]]`.
    fn exact_newton_joint_hessian(
        &self,
        block_states: &[ParameterBlockState],
    ) -> Result<Option<Array2<f64>>, String> {
        assert!(block_states.len() <= isize::MAX as usize);
        let joint = array![[1.0]];
        Ok(Some(joint))
    }

    /// Per-block Hessian directional derivative: zero (constant Hessian).
    fn exact_newton_hessian_directional_derivative(
        &self,
        block_states: &[ParameterBlockState],
        idx: usize,
        arr: &Array1<f64>,
    ) -> Result<Option<Array2<f64>>, String> {
        assert!(block_states.len() <= isize::MAX as usize);
        assert!(idx < usize::MAX);
        assert!(arr.iter().all(|v| !v.is_nan()));
        let zero = array![[0.0]];
        Ok(Some(zero))
    }

    /// Joint Hessian directional derivative: zero (constant Hessian).
    fn exact_newton_joint_hessian_directional_derivative(
        &self,
        block_states: &[ParameterBlockState],
        arr: &Array1<f64>,
    ) -> Result<Option<Array2<f64>>, String> {
        assert!(block_states.len() <= isize::MAX as usize);
        assert!(arr.iter().all(|v| !v.is_nan()));
        let zero = array![[0.0]];
        Ok(Some(zero))
    }

    /// Psi hook: a fixed objective derivative of `3.5` with zero score and
    /// Hessian parts and no operator form, independent of the inputs.
    fn exact_newton_joint_psi_terms(
        &self,
        block_states: &[ParameterBlockState],
        block_specs: &[ParameterBlockSpec],
        derivative_blocks: &[Vec<CustomFamilyBlockPsiDerivative>],
        idx: usize,
    ) -> Result<Option<ExactNewtonJointPsiTerms>, String> {
        assert!(block_states.len() <= isize::MAX as usize);
        assert!(block_specs.len() <= isize::MAX as usize);
        assert!(derivative_blocks.len() <= isize::MAX as usize);
        assert!(idx < usize::MAX);
        let fixed_terms = ExactNewtonJointPsiTerms {
            objective_psi: 3.5,
            score_psi: array![0.0],
            hessian_psi: array![[0.0]],
            hessian_psi_operator: None,
        };
        Ok(Some(fixed_terms))
    }
}
/// One-coefficient `StrictPseudoLaplace` test family whose per-block and
/// joint Hessians are both the negative scalar `-1.0`.
#[derive(Clone)]
struct OneBlockIndefinitePseudoLaplaceFamily;

impl CustomFamily for OneBlockIndefinitePseudoLaplaceFamily {
    /// Constant evaluation: zero log-likelihood, zero gradient, Hessian `-1.0`.
    fn evaluate(
        &self,
        block_states: &[ParameterBlockState],
    ) -> Result<FamilyEvaluation, String> {
        assert!(block_states.len() <= isize::MAX as usize);
        let negative_curvature_set = BlockWorkingSet::ExactNewton {
            gradient: array![0.0],
            hessian: SymmetricMatrix::Dense(array![[-1.0]]),
        };
        Ok(FamilyEvaluation {
            log_likelihood: 0.0,
            blockworking_sets: vec![negative_curvature_set],
        })
    }

    fn exact_newton_outerobjective(&self) -> ExactNewtonOuterObjective {
        ExactNewtonOuterObjective::StrictPseudoLaplace
    }

    /// Joint Hessian: the constant 1×1 matrix `[[-1.0]]`.
    fn exact_newton_joint_hessian(
        &self,
        block_states: &[ParameterBlockState],
    ) -> Result<Option<Array2<f64>>, String> {
        assert!(block_states.len() <= isize::MAX as usize);
        let joint = array![[-1.0]];
        Ok(Some(joint))
    }
}
/// Two-coefficient `StrictPseudoLaplace` test family built around the fixed
/// matrix `[[2.0, 0.1], [3.0, 2.0]]`, whose off-diagonal entries differ.
#[derive(Clone)]
struct OneBlockNearlySymmetricPseudoLaplaceFamily;

impl CustomFamily for OneBlockNearlySymmetricPseudoLaplaceFamily {
    /// With `h` the fixed matrix: log-likelihood `-½ βᵀ(hβ)`, gradient `-hβ`,
    /// and `h` itself as the Hessian. Errors if block 0 is missing.
    fn evaluate(
        &self,
        block_states: &[ParameterBlockState],
    ) -> Result<FamilyEvaluation, String> {
        let Some(state) = block_states.first() else {
            return Err("missing block 0".to_string());
        };
        let beta = &state.beta;
        let h = array![[2.0, 0.1], [3.0, 2.0]];
        let log_likelihood = -0.5 * beta.dot(&h.dot(beta));
        let gradient = -h.dot(beta);
        Ok(FamilyEvaluation {
            log_likelihood,
            blockworking_sets: vec![BlockWorkingSet::ExactNewton {
                gradient,
                hessian: SymmetricMatrix::Dense(h),
            }],
        })
    }

    fn exact_newton_outerobjective(&self) -> ExactNewtonOuterObjective {
        ExactNewtonOuterObjective::StrictPseudoLaplace
    }

    /// Joint Hessian: the same fixed 2×2 matrix used by `evaluate`.
    fn exact_newton_joint_hessian(
        &self,
        block_states: &[ParameterBlockState],
    ) -> Result<Option<Array2<f64>>, String> {
        assert!(block_states.len() <= isize::MAX as usize);
        let joint = array![[2.0, 0.1], [3.0, 2.0]];
        Ok(Some(joint))
    }
}
/// Test family whose `evaluate` fails unconditionally with a synthetic error.
#[derive(Clone)]
struct OneBlockAlwaysErrorFamily;

impl CustomFamily for OneBlockAlwaysErrorFamily {
    /// Never succeeds; the returned message identifies the synthetic failure.
    fn evaluate(
        &self,
        block_states: &[ParameterBlockState],
    ) -> Result<FamilyEvaluation, String> {
        assert!(block_states.len() <= isize::MAX as usize);
        Err(String::from(
            "synthetic outer objective failure: block[0] evaluate()",
        ))
    }
}
/// Test family that evaluates normally but fails inside the spec-aware joint
/// Hessian hook with a synthetic covariance-assembly error.
#[derive(Clone)]
struct OneBlockCovarianceErrorFamily;

impl CustomFamily for OneBlockCovarianceErrorFamily {
    /// Zero log-likelihood with a zero working response and unit weights,
    /// sized to block 0's `eta`.
    fn evaluate(
        &self,
        block_states: &[ParameterBlockState],
    ) -> Result<FamilyEvaluation, String> {
        let rows = block_states[0].eta.len();
        let working_set = BlockWorkingSet::Diagonal {
            working_response: Array1::zeros(rows),
            working_weights: Array1::ones(rows),
        };
        Ok(FamilyEvaluation {
            log_likelihood: 0.0,
            blockworking_sets: vec![working_set],
        })
    }

    /// Always returns the synthetic failure message.
    fn exact_newton_joint_hessian_with_specs(
        &self,
        block_states: &[ParameterBlockState],
        block_specs: &[ParameterBlockSpec],
    ) -> Result<Option<Array2<f64>>, String> {
        assert!(block_states.len() <= isize::MAX as usize);
        assert!(block_specs.len() <= isize::MAX as usize);
        Err(String::from("synthetic covariance assembly failure"))
    }
}
#[test]
fn effectiveridge_is_never_below_solver_floor() {
    // A zero request is expected to come back as 1e-15.
    let floored = effective_solverridge(0.0);
    assert!((floored - 1e-15).abs() < 1e-30);
    // A request of 1e-8 is expected to come back unchanged.
    let passthrough = effective_solverridge(1e-8);
    assert!((passthrough - 1e-8).abs() < 1e-20);
}
#[test]
fn objective_includes_solverridge_quadratic_term() {
    // Single-parameter block: X=1, y*=1, w=1, and no explicit penalties.
    // The inner solve lands on beta = 1 / (1 + ridge), so the objective has
    // to carry 0.5 * ridge * beta^2 even with no smoothing penalties at all.
    let unpenalized = ParameterBlockSpec {
        name: "b0".to_string(),
        design: DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(array![[1.0]])),
        offset: array![0.0],
        penalties: vec![],
        nullspace_dims: vec![],
        initial_log_lambdas: Array1::zeros(0),
        initial_beta: Some(array![0.0]),
        gauge_priority: 100,
        jacobian_callback: None,
        stacked_design: None,
        stacked_offset: None,
    };
    let options = BlockwiseFitOptions {
        // Iteration budgets and tolerances.
        inner_max_cycles: 1,
        inner_tol: 0.0,
        outer_max_iter: 1,
        outer_tol: 1e-8,
        // Numerical floors and ridge handling.
        minweight: CUSTOM_FAMILY_WEIGHT_FLOOR,
        ridge_floor: 1e-4,
        ridge_policy: RidgePolicy::explicit_stabilization_pospart(),
        // Objective / derivative switches.
        use_remlobjective: false,
        compute_covariance: false,
        use_outer_hessian: false,
        // Screening, subsampling and caching.
        screening_max_inner_iterations: None,
        outer_inner_max_iterations: None,
        seed_screening: false,
        early_exit_threshold: None,
        outer_score_subsample: None,
        auto_outer_subsample: false,
        outer_eval_context: None,
        cache_session: None,
        cache_mirror_sessions: Vec::new(),
        joint_penalties: None,
        screen_initial_rho: true,
    };

    let fit = fit_custom_family(&OneBlockIdentityFamily, &[unpenalized], &options)
        .expect("custom family fit should succeed");

    let ridge = effective_solverridge(options.ridge_floor);
    let beta_hat = fit.block_states[0].beta[0];
    let expected_penalty = 0.5 * ridge * beta_hat * beta_hat;
    let mismatch = (fit.penalized_objective - expected_penalty).abs();
    assert!(
        mismatch < 1e-12,
        "penalized objective should equal ridge quadratic term when ll=0 and S=0; got {}, expected {}",
        fit.penalized_objective,
        expected_penalty
    );
}
#[test]
fn inner_block_accepts_penalty_improving_step_even_if_loglik_drops() {
    // Start at beta = 1 (matching y = 1) under a penalty weight of 10; the
    // same log-lambda seeds the spec and is handed to the inner fit.
    let log_lambda = 10.0_f64.ln();
    let family = OneBlockGaussianFamily { y: array![1.0] };
    let penalized = ParameterBlockSpec {
        name: "b0".to_string(),
        design: DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(array![[1.0]])),
        offset: array![0.0],
        penalties: vec![PenaltyMatrix::Dense(array![[1.0]])],
        nullspace_dims: vec![],
        initial_log_lambdas: array![log_lambda],
        initial_beta: Some(array![1.0]),
        gauge_priority: 100,
        jacobian_callback: None,
        stacked_design: None,
        stacked_offset: None,
    };
    let options = BlockwiseFitOptions {
        // Iteration budgets and tolerances.
        inner_max_cycles: 20,
        inner_tol: 1e-10,
        outer_max_iter: 1,
        outer_tol: 1e-8,
        // Numerical floors and ridge handling.
        minweight: CUSTOM_FAMILY_WEIGHT_FLOOR,
        ridge_floor: 0.0,
        ridge_policy: RidgePolicy::explicit_stabilization_pospart(),
        // Objective / derivative switches.
        use_remlobjective: false,
        compute_covariance: false,
        use_outer_hessian: false,
        // Screening, subsampling and caching.
        screening_max_inner_iterations: None,
        outer_inner_max_iterations: None,
        seed_screening: false,
        early_exit_threshold: None,
        outer_score_subsample: None,
        auto_outer_subsample: false,
        outer_eval_context: None,
        cache_session: None,
        cache_mirror_sessions: Vec::new(),
        joint_penalties: None,
        screen_initial_rho: true,
    };
    let per_block_log_lambdas = vec![array![log_lambda]];

    let fitted = inner_blockwise_fit(
        &family,
        &[penalized],
        &per_block_log_lambdas,
        &options,
        None,
    )
    .expect("inner blockwise fit should succeed");

    let beta_hat = fitted.block_states[0].beta[0];
    assert!(
        beta_hat < 0.5,
        "beta should shrink toward penalized mode; got {}",
        beta_hat
    );
    assert!(
        fitted.log_likelihood < -1e-8,
        "raw log-likelihood should drop for this strongly penalized move; got {}",
        fitted.log_likelihood
    );
}
#[test]
fn exact_newton_backtracking_descent_includes_explicit_ridge() {
    // Linear log-likelihood (score 0.5, zero Hessian), no penalties, and a
    // ridge floor of 1.0; a single inner cycle starting from beta = 1.
    let family = OneBlockLinearLikelihoodExactFamily { score: 0.5 };
    let unpenalized = ParameterBlockSpec {
        name: "b0".to_string(),
        design: DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(array![[1.0]])),
        offset: array![0.0],
        penalties: vec![],
        nullspace_dims: vec![],
        initial_log_lambdas: Array1::zeros(0),
        initial_beta: Some(array![1.0]),
        gauge_priority: 100,
        jacobian_callback: None,
        stacked_design: None,
        stacked_offset: None,
    };
    let options = BlockwiseFitOptions {
        // Iteration budgets and tolerances.
        inner_max_cycles: 1,
        inner_tol: 0.0,
        outer_max_iter: 1,
        outer_tol: 1e-8,
        // Numerical floors and ridge handling.
        minweight: CUSTOM_FAMILY_WEIGHT_FLOOR,
        ridge_floor: 1.0,
        ridge_policy: RidgePolicy::explicit_stabilization_pospart(),
        // Objective / derivative switches.
        use_remlobjective: false,
        compute_covariance: false,
        use_outer_hessian: false,
        // Screening, subsampling and caching.
        screening_max_inner_iterations: None,
        outer_inner_max_iterations: None,
        seed_screening: false,
        early_exit_threshold: None,
        outer_score_subsample: None,
        auto_outer_subsample: false,
        outer_eval_context: None,
        cache_session: None,
        cache_mirror_sessions: Vec::new(),
        joint_penalties: None,
        screen_initial_rho: true,
    };

    let fitted = inner_blockwise_fit(
        &family,
        &[unpenalized],
        &[Array1::zeros(0)],
        &options,
        None,
    )
    .expect("inner blockwise fit should succeed");

    let beta_hat = fitted.block_states[0].beta[0];
    let objective = -fitted.log_likelihood + fitted.penalty_value;
    assert!(
        beta_hat < 1.0 - 1e-12,
        "ridge-aware fallback descent should shrink beta after rejecting the uphill Newton step; got {}",
        beta_hat
    );
    assert!(
        objective < -1e-12,
        "accepted fallback step should lower the penalized objective; got {}",
        objective
    );
}
#[test]
fn outergradient_matches_finite_difference_for_one_block() {
    // Intercept-only design over eight observations with one identity penalty.
    let y = array![0.4, -0.2, 0.8, 1.0, -0.5, 0.3, 0.1, -0.7];
    let n = y.len();
    let family = OneBlockGaussianFamily { y };
    let block = ParameterBlockSpec {
        name: "b0".to_string(),
        design: DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(Array2::from_elem(
            (n, 1),
            1.0,
        ))),
        offset: Array1::zeros(n),
        penalties: vec![PenaltyMatrix::Dense(Array2::eye(1))],
        nullspace_dims: vec![],
        initial_log_lambdas: array![0.2],
        initial_beta: None,
        gauge_priority: 100,
        jacobian_callback: None,
        stacked_design: None,
        stacked_offset: None,
    };
    let options = BlockwiseFitOptions {
        ridge_floor: 1e-10,
        use_remlobjective: true,
        ..BlockwiseFitOptions::default()
    };
    let penalty_counts = vec![1usize];

    // Analytic value and gradient at the centre point.
    let rho = array![0.1];
    let (f0, g0, _) = outerobjective_andgradient(
        &family,
        std::slice::from_ref(&block),
        &options,
        &penalty_counts,
        &rho,
        None,
    )
    .expect("objective/gradient");

    // Central finite difference of the objective in rho.
    let h = 1e-5;
    let rho_plus = array![rho[0] + h];
    let rho_minus = array![rho[0] - h];
    let (f_plus, _, _) = outerobjective_andgradient(
        &family,
        std::slice::from_ref(&block),
        &options,
        &penalty_counts,
        &rho_plus,
        None,
    )
    .expect("objective+");
    let (f_minus, _, _) = outerobjective_andgradient(
        &family,
        std::slice::from_ref(&block),
        &options,
        &penalty_counts,
        &rho_minus,
        None,
    )
    .expect("objective-");
    let gfd = (f_plus - f_minus) / (2.0 * h);
    let rel = (g0[0] - gfd).abs() / gfd.abs().max(1e-8);

    assert!(f0.is_finite());
    assert_eq!(
        g0[0].signum(),
        gfd.signum(),
        "outer gradient sign mismatch: analytic={} fd={}",
        g0[0],
        gfd
    );
    assert!(
        rel < 5e-3,
        "outer gradient mismatch: analytic={} fd={} rel={}",
        g0[0],
        gfd,
        rel
    );
}
#[test]
fn outergradient_prefers_joint_exact_pathwhen_available() {
    // `PreferJointExactFamily` errors out of its blockwise directional
    // derivative hook, so an `Ok` here means that hook's error never surfaced.
    let block = ParameterBlockSpec {
        name: "joint_exact".to_string(),
        design: DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(array![[1.0]])),
        offset: array![0.0],
        penalties: vec![PenaltyMatrix::Dense(Array2::eye(1))],
        nullspace_dims: vec![],
        initial_log_lambdas: array![0.0],
        initial_beta: Some(array![0.0]),
        gauge_priority: 100,
        jacobian_callback: None,
        stacked_design: None,
        stacked_offset: None,
    };
    let options = BlockwiseFitOptions {
        ridge_floor: 1e-10,
        use_remlobjective: true,
        ..BlockwiseFitOptions::default()
    };
    let rho = array![0.0];
    let penalty_counts = vec![1usize];

    let outcome = outerobjective_andgradient(
        &PreferJointExactFamily,
        std::slice::from_ref(&block),
        &options,
        &penalty_counts,
        &rho,
        None,
    );
    assert!(
        outcome.is_ok(),
        "joint exact path should be preferred over blockwise fallback: {:?}",
        outcome.err()
    );
}
#[test]
fn innerfit_uses_joint_exact_path_for_multiblock_constraints() {
    // Both blocks are identical apart from their name: a 1×1 unit design, no
    // penalties, and a zero starting coefficient.
    let unpenalized_block = |name: &str| ParameterBlockSpec {
        name: name.to_string(),
        design: DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(array![[1.0]])),
        offset: array![0.0],
        penalties: vec![],
        nullspace_dims: vec![],
        initial_log_lambdas: Array1::zeros(0),
        initial_beta: Some(array![0.0]),
        gauge_priority: 100,
        jacobian_callback: None,
        stacked_design: None,
        stacked_offset: None,
    };
    let options = BlockwiseFitOptions {
        ridge_floor: CUSTOM_FAMILY_RIDGE_FLOOR,
        inner_tol: 1e-10,
        inner_max_cycles: 1,
        ..BlockwiseFitOptions::default()
    };
    let family = TwoBlockJointConstrainedFamily { coupling: 0.25 };
    let no_log_lambdas = vec![Array1::zeros(0), Array1::zeros(0)];

    let fitted = inner_blockwise_fit(
        &family,
        &[unpenalized_block("block0"), unpenalized_block("block1")],
        &no_log_lambdas,
        &options,
        None,
    )
    .expect("joint constrained inner fit should succeed");

    assert!(
        fitted.converged,
        "joint constrained inner fit should converge in one cycle"
    );
    assert_eq!(fitted.cycles, 1);
    // With coupling 0.25 the stationary point solves 1.25·β = 1 in each block.
    assert!((fitted.block_states[0].beta[0] - 0.8).abs() < 1e-8);
    assert!((fitted.block_states[1].beta[0] - 0.8).abs() < 1e-8);
    assert_eq!(fitted.active_sets, vec![None, None]);
}
#[test]
fn joint_newton_budget_exhaustion_refuses_coupled_exact_inner() {
    // Both blocks are identical apart from their name: a 1×1 unit design, no
    // penalties, and a zero starting coefficient.
    let unpenalized_block = |name: &str| ParameterBlockSpec {
        name: name.to_string(),
        design: DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(array![[1.0]])),
        offset: array![0.0],
        penalties: vec![],
        nullspace_dims: vec![],
        initial_log_lambdas: Array1::zeros(0),
        initial_beta: Some(array![0.0]),
        gauge_priority: 100,
        jacobian_callback: None,
        stacked_design: None,
        stacked_offset: None,
    };
    // A single inner cycle: the family's gradient is constant at 1.0, and the
    // fit is expected to report the exhausted budget as an error.
    let options = BlockwiseFitOptions {
        ridge_floor: CUSTOM_FAMILY_RIDGE_FLOOR,
        inner_tol: 1e-12,
        inner_max_cycles: 1,
        ..BlockwiseFitOptions::default()
    };
    let no_log_lambdas = vec![Array1::zeros(0), Array1::zeros(0)];

    let err = inner_blockwise_fit(
        &TwoBlockPersistentGradientFamily,
        &[unpenalized_block("block0"), unpenalized_block("block1")],
        &no_log_lambdas,
        &options,
        None,
    )
    .expect_err("coupled exact-joint max-budget exhaustion must fail loudly");

    let names_budget_exhaustion =
        err.contains("exhausted the joint Newton budget without KKT convergence");
    assert!(
        names_budget_exhaustion,
        "budget exhaustion should be named explicitly: {err}"
    );
    let carries_block_residuals = err.contains("block_residual_inf");
    assert!(
        carries_block_residuals,
        "error should carry per-block residual diagnostics: {err}"
    );
}
/// Regression for gam#787 (binary matern, centers=12). Close to a
/// flat-objective optimum the joint-Newton proposal shrinks to the
/// step-tolerance floor, and `predicted_reduction = rhs·δ − ½δᵀHδ` takes a
/// round-off-determined sign. In that regime the `predicted_reduction ≤ 0`
/// branch must NOT trigger the preconditioned-descent substitution: doing so
/// would swap the tiny KKT-polishing step for an objective-descent step that
/// throws the residual off the near-converged iterate.
/// `joint_proposal_at_step_floor` is the gate that suppresses it.
#[test]
fn joint_proposal_at_step_floor_suppresses_descent_substitution_near_optimum() {
    // Step tolerance observed at the c12 cycle-10 operating point.
    let step_tol: f64 = 1.355e-5;

    // At that operating point proposal_inf = 1.413e-5, i.e. about 1.04× the
    // tolerance. The iterate is polishing KKT, so pred ≤ 0 is round-off and
    // the gate has to fire.
    let at_operating_point = joint_proposal_at_step_floor(1.413e-5, step_tol);
    assert!(
        at_operating_point,
        "a proposal within 4× step_tol is at the convergence floor; \
         the descent substitution must be suppressed"
    );

    // The edge of the 4× band still counts as "at the floor".
    let at_band_edge = joint_proposal_at_step_floor(4.0 * step_tol, step_tol);
    assert!(at_band_edge);

    // A genuinely large proposal (a model-invalid direction far from the
    // optimum) is not at the floor, so the descent substitution stays live.
    let far_from_floor = joint_proposal_at_step_floor(1.182e-2, step_tol);
    assert!(
        !far_from_floor,
        "an O(1e-2) proposal is far above the step floor; the \
         preconditioned-descent fallback must remain active there"
    );

    // Non-finite inputs never certify the floor, which leaves the
    // substitution path's own non-finite handling in charge.
    let nan_proposal = joint_proposal_at_step_floor(f64::NAN, 1.0e-5);
    assert!(!nan_proposal);
    let infinite_tol = joint_proposal_at_step_floor(1.0e-6, f64::INFINITY);
    assert!(!infinite_tol);
}
/// Independent derivation and direct numerical proof of the
/// ρ ≈ 2 inner-PIRLS pathology pinned by the large-scale saturated-probit
/// failure trace.
///
/// # Mechanism
///
/// Inner Newton on the penalized objective `f(β) = -ℓ(β) + ½βᵀSβ`
/// uses two different ridge values:
/// * **APPLY** path (`apply_joint_penalized_hessian_into`, called
/// inside `joint_quadratic_predicted_reduction`) uses
/// `joint_solver_diagonal_ridge`, which equals
/// `joint_mode_diagonal_ridge + JOINT_TRACE_STABILITY_RIDGE +
/// stabilizing_shift`, where the stabilizing shift is whatever
/// positive quantity `stabilized_joint_solver_diagonal_ridge`
/// adds to lift a negative-eigenvalue joint Hessian above the
/// SPD floor.
/// * **TRIAL OBJECTIVE** path (`total_quadratic_penalty`) uses
/// only `joint_mode_diagonal_ridge` (= `effective_solverridge`),
/// which is the true penalty in the objective `f` and does NOT
/// include the stabilizing shift.
///
/// Let `Δ = joint_solver_diagonal_ridge - joint_mode_diagonal_ridge`
/// (the gap between the SOLVE / APPLY matrix and the TRUE Hessian).
/// For a Newton step `δ = (H_NLL + S + joint_solver_diagonal_ridge·I)⁻¹·rhs`,
/// the Newton identity gives `δᵀ·H_used·δ = rhs·δ`, so:
///
/// predicted = rhs·δ − ½·δᵀ·H_used·δ = ½·rhs·δ
/// actual = rhs·δ − ½·δᵀ·H_true·δ
/// = rhs·δ − ½·(δᵀ·H_used·δ − Δ·‖δ‖²)
/// = ½·rhs·δ + ½·Δ·‖δ‖²
/// ρ = actual / predicted = 1 + Δ·‖δ‖² / (rhs·δ)
///
/// When `δ ∈ null(H_true)` (e.g. the marginal-block cancellation
/// direction from `marginal_block_hessian_cancels_in_saturated_regime`
/// combined with an unpenalized direction in the smoothing penalty's
/// null space), `H_true·δ = 0`, so `H_used·δ = Δ·δ` and therefore
/// `rhs = Δ·δ`, giving `rhs·δ = Δ·‖δ‖²`. Substituting:
///
/// ρ = 1 + Δ·‖δ‖² / (Δ·‖δ‖²) = 2 EXACTLY.
///
/// This is independent of `Δ`, of the data size, and of `‖δ‖` — it
/// is a structural consequence of "SOLVE/APPLY add a stabilizing
/// shift that TRIAL OBJECTIVE doesn't see" combined with "Newton
/// step lies in the null space of the true Hessian".
///
/// # Test
///
/// We construct a 3D synthetic case with H_NLL = diag(−1, 1, 0):
/// one negative eigenvalue (mimicking the entry-survival concave term),
/// one positive eigenvalue, and one exactly-zero eigenvalue. We take
/// `S = 0` and `joint_mode_diagonal_ridge = 0` (i.e. the policy
/// does NOT include the ridge in the objective). The stabilizing
/// shift lifts the negative eigenvalue to the SPD floor; the Newton
/// step is aimed entirely along dim 2, the null direction of `H_true`;
/// `predicted` comes from `joint_quadratic_predicted_reduction` and both
/// Hessian applications from `apply_joint_penalized_hessian_into`;
/// ρ is asserted equal to 2.0 within an absolute tolerance of 1e-10.
#[test]
fn ridge_stabilization_gap_produces_exact_rho_two_in_null_direction() {
// Synthetic 3D joint Hessian with the structure of the
// saturated-probit failure case at large scale:
// - dim 0: indefinite contribution (eigenvalue −1) from the
// concave entry-survival term `+w·log Φ(−η₀)`. This triggers
// the SPD stabilizer in the solver.
// - dim 1: positive contribution (+1) from a non-saturated
// coefficient direction.
// - dim 2: ZERO from the marginal-block Hessian cancellation
// proven separately in `marginal_block_hessian_cancels_in_saturated_regime`.
// This is the saturating direction that sits in null(H_true).
let h_nll = array![[-1.0, 0.0, 0.0], [0.0, 1.0, 0.0], [0.0, 0.0, 0.0]];
let source = JointHessianSource::Dense(h_nll.clone());
let ranges = vec![(0, 3)];
// Smoothing penalty `S` is zero in the saturating direction
// (dim 2) — mirrors the duchon-smooth polynomial null space
// containing constants/linears. (Here `S` is the full 3×3 zero matrix.)
let s_lambdas = vec![Array2::<f64>::zeros((3, 3))];
// Stabilized solver ridge: should add ~1.0 to lift the
// -1 eigenvalue to the SPD floor (~ridge_floor).
let base = JOINT_TRACE_STABILITY_RIDGE;
let ridge_floor = 1.0e-12_f64;
let joint_mode_diagonal_ridge = 0.0_f64; // policy: ridge NOT in objective
// `stabilized_joint_solver_diagonal_ridge` consults the family only
// for `use_exact_newton_strict_spd`, which defaults to false; we
// simulate that branch by computing the shift directly via
// `exact_newton_stabilizing_shift`.
let mut lhs = h_nll.clone();
add_joint_penalty_to_matrix(&mut lhs, &ranges, &s_lambdas, base, None);
let shift = exact_newton_stabilizing_shift(&lhs, ridge_floor)
.expect("indefinite Hessian must yield a positive stabilizing shift");
assert!(
shift > 0.9,
"shift should lift the -1 eigenvalue; got {shift}"
);
let joint_solver_diagonal_ridge = base + shift;
let big_delta = joint_solver_diagonal_ridge - joint_mode_diagonal_ridge;
// True Hessian (what TRIAL OBJECTIVE sees):
// H_true = H_NLL + S + joint_mode_diagonal_ridge·I
// = diag(-1, 1, 0)
// ⇒ dim 2 is a null direction of H_true.
// Used Hessian (what SOLVE / APPLY uses):
// H_used = H_NLL + S + joint_solver_diagonal_ridge·I
// = diag(-1+Δ, 1+Δ, Δ) where Δ ≈ 1.0
// ⇒ dim 2 has curvature Δ (purely from the stabilizing shift,
// which fires because dim 0 is negative).
// rhs aimed entirely in dim 2 puts the Newton step in null(H_true).
let rhs = array![0.0_f64, 0.0, 1.0];
// H_used is diagonal here, so the Newton solve reduces to a scalar
// division along dim 2.
let h_used_22 = 0.0 + joint_solver_diagonal_ridge;
let delta = array![0.0, 0.0, rhs[2] / h_used_22];
// Compute hpen_delta via the SAME helper the inner solver uses.
let mut hpen_delta = Array1::<f64>::zeros(3);
apply_joint_penalized_hessian_into(
&source,
&ranges,
&s_lambdas,
joint_solver_diagonal_ridge,
&delta,
&mut hpen_delta,
None,
)
.expect("apply joint penalized hessian must succeed");
// Predicted = the exact formula the inner solver uses.
let predicted = joint_quadratic_predicted_reduction(&rhs, &hpen_delta, &delta);
// Actual (true) reduction: f(β_start) − f(β_start + δ) for the true objective
// f(β) = ½·βᵀ·H_NLL·β + ½·βᵀ·S·β + ½·joint_mode_diagonal_ridge·‖β‖² + bᵀ·β
// taking β_start = 0 and using the Newton identity for the truth:
// actual = rhs·δ − ½·δᵀ·H_true·δ
// where H_true = H_NLL + S + joint_mode_diagonal_ridge·I.
let mut h_true_delta = Array1::<f64>::zeros(3);
apply_joint_penalized_hessian_into(
&source,
&ranges,
&s_lambdas,
joint_mode_diagonal_ridge,
&delta,
&mut h_true_delta,
None,
)
.expect("apply true (un-stabilized) hessian must succeed");
let actual = rhs.dot(&delta) - 0.5 * delta.dot(&h_true_delta);
let rho = actual / predicted;
eprintln!(
"[rho-2 proof] Δ = {big_delta:.6e}, rhs·δ = {rd:.6e}, Δ·‖δ‖² = {dn:.6e}, predicted = {predicted:.6e}, actual = {actual:.6e}, ρ = {rho:.10}",
rd = rhs.dot(&delta),
dn = big_delta * delta.dot(&delta),
);
// ρ must be 2 up to floating-point round-off (absolute tolerance 1e-10),
// not merely "close to 2". This is the structural fingerprint of the
// SOLVE/APPLY-vs-OBJECTIVE ridge-stabilization gap in the saturated regime.
assert!(
(rho - 2.0).abs() <= 1e-10,
"ρ should be EXACTLY 2 when Newton step lies in null(H_true) with stabilizing-shift gap; got {rho}",
);
// Sanity: the identity rhs·δ = Δ·‖δ‖² must hold (this is the
// mathematical core of why ρ = 2 specifically and not 1.5 or 3).
let rhs_dot_delta = rhs.dot(&delta);
let delta_sq_times_big_delta = big_delta * delta.dot(&delta);
assert!(
(rhs_dot_delta - delta_sq_times_big_delta).abs() <= 1e-10 * rhs_dot_delta.abs(),
"Newton-identity null-space condition: rhs·δ ({rhs_dot_delta}) should equal Δ·‖δ‖² ({delta_sq_times_big_delta})",
);
// And ρ = 2 holds AT ALL MAGNITUDES of δ — verify by scaling rhs
// (and therefore δ, since the solve is linear) by several factors:
for scale in [0.001_f64, 0.029, 1.0, 988.0] {
let scaled_rhs = &rhs * scale;
let scaled_delta = &delta * scale;
let mut scaled_hpen = Array1::<f64>::zeros(3);
apply_joint_penalized_hessian_into(
&source,
&ranges,
&s_lambdas,
joint_solver_diagonal_ridge,
&scaled_delta,
&mut scaled_hpen,
None,
)
.expect("apply scaled");
let scaled_predicted =
joint_quadratic_predicted_reduction(&scaled_rhs, &scaled_hpen, &scaled_delta);
let mut scaled_h_true_delta = Array1::<f64>::zeros(3);
apply_joint_penalized_hessian_into(
&source,
&ranges,
&s_lambdas,
joint_mode_diagonal_ridge,
&scaled_delta,
&mut scaled_h_true_delta,
None,
)
.expect("apply scaled true");
let scaled_actual =
scaled_rhs.dot(&scaled_delta) - 0.5 * scaled_delta.dot(&scaled_h_true_delta);
let scaled_rho = scaled_actual / scaled_predicted;
assert!(
(scaled_rho - 2.0).abs() <= 1e-10,
"ρ invariance under step rescaling broke at scale {scale}: got {scaled_rho}",
);
}
}
/// The dense coupled Hessian `[[1, 2], [2, 1]]` is indefinite (eigenvalues
/// 3 and −1). `stabilized_joint_solver_diagonal_ridge` must return a ridge
/// above 1.0, and adding that ridge through `add_joint_penalty_to_matrix`
/// must leave a matrix whose smallest eigenvalue is strictly positive.
#[test]
fn joint_solver_ridge_stabilizes_dense_indefinite_coupled_hessian() {
    let ranges = vec![(0, 1), (1, 2)];
    let s_lambdas = vec![Array2::zeros((1, 1)), Array2::zeros((1, 1))];
    let source = JointHessianSource::Dense(array![[1.0, 2.0], [2.0, 1.0]]);
    let family = TwoBlockJointConstrainedFamily { coupling: 2.0 };

    let ridge = stabilized_joint_solver_diagonal_ridge(
        &family,
        &source,
        &ranges,
        &s_lambdas,
        JOINT_TRACE_STABILITY_RIDGE,
        1e-12,
        None,
    );
    assert!(
        ridge > 1.0,
        "dense joint solver ridge should lift the negative eigenvalue; got {ridge}"
    );

    // Recover the dense matrix from the fixture so the ridge can be applied.
    let JointHessianSource::Dense(mut stabilized) = source else {
        panic!("dense joint solver fixture must use a dense Hessian source")
    };
    add_joint_penalty_to_matrix(&mut stabilized, &ranges, &s_lambdas, ridge, None);

    // Closed-form smallest eigenvalue of a symmetric 2×2 matrix:
    // ½·(trace − √((a − d)² + 4b²)).
    let trace = stabilized[[0, 0]] + stabilized[[1, 1]];
    let discriminant = ((stabilized[[0, 0]] - stabilized[[1, 1]]).powi(2)
        + 4.0 * stabilized[[0, 1]].powi(2))
    .sqrt();
    let min_eval = 0.5 * (trace - discriminant);
    assert!(
        min_eval > 0.0,
        "stabilized dense joint Hessian should be SPD; min_eval={min_eval}"
    );
}
/// Calls `outerobjective_andgradient` on two single-penalty blocks (two
/// rows each, identity penalty) with `TwoBlockJointSurrogateFamily` and
/// requires only that the evaluation returns `Ok`.
#[test]
fn outergradient_uses_joint_surrogate_formultiblock_diagonal_family() {
    // Both blocks share one shape: a two-row column of ones with a 1×1
    // identity penalty.
    let penalized_block = |name: &str| ParameterBlockSpec {
        name: name.to_string(),
        design: DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(array![
            [1.0],
            [1.0]
        ])),
        offset: array![0.0, 0.0],
        penalties: vec![PenaltyMatrix::Dense(Array2::eye(1))],
        nullspace_dims: vec![],
        initial_log_lambdas: array![0.0],
        initial_beta: Some(array![0.0]),
        gauge_priority: 100,
        jacobian_callback: None,
        stacked_design: None,
        stacked_offset: None,
    };
    let specs = [penalized_block("block0"), penalized_block("block1")];
    let penalty_counts = vec![1usize, 1usize];
    let rho = array![0.0, 0.0];
    let options = BlockwiseFitOptions {
        use_remlobjective: true,
        ridge_floor: 1e-10,
        outer_max_iter: 1,
        ..BlockwiseFitOptions::default()
    };

    let result = outerobjective_andgradient(
        &TwoBlockJointSurrogateFamily,
        &specs,
        &options,
        &penalty_counts,
        &rho,
        None,
    );

    assert!(
        result.is_ok(),
        "default joint multi-block surrogate path should succeed without blockwise dW callbacks: {:?}",
        result.err()
    );
}
/// Fits `OneBlockPseudoLaplaceExactFamily { target: 1.5 }` on a single
/// unpenalized 1×1 block and requires the reported penalized objective to
/// equal `½·ln 2` within 1e-8.
#[test]
fn exact_newton_pseudo_laplace_objective_uses_logdet_h_without_logdet_s() {
    let options = BlockwiseFitOptions {
        use_remlobjective: true,
        ridge_floor: CUSTOM_FAMILY_RIDGE_FLOOR,
        compute_covariance: false,
        ..BlockwiseFitOptions::default()
    };
    let spec = ParameterBlockSpec {
        name: "pseudo_laplace".to_string(),
        design: DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(array![[1.0]])),
        offset: array![0.0],
        penalties: vec![],
        nullspace_dims: vec![],
        initial_log_lambdas: Array1::zeros(0),
        initial_beta: Some(array![0.0]),
        gauge_priority: 100,
        jacobian_callback: None,
        stacked_design: None,
        stacked_offset: None,
    };
    let family = OneBlockPseudoLaplaceExactFamily { target: 1.5 };

    let fit = fit_custom_family(&family, &[spec], &options)
        .expect("pseudo-laplace exact-newton fit");

    let expected = 0.5 * 2.0_f64.ln();
    let deviation = (fit.penalized_objective - expected).abs();
    assert!(
        deviation < 1e-8,
        "pseudo-Laplace objective mismatch: got {}, expected {}",
        fit.penalized_objective,
        expected
    );
}
/// Evaluates `evaluate_custom_family_joint_hyper` in value-and-gradient
/// mode for `OneBlockExactPsiHookFamily` with one ψ derivative whose
/// `x_psi` and `s_psi` are zero and whose optional parts are absent. The
/// gradient must have a single entry equal to 3.5 (to 1e-12).
#[test]
fn exact_newton_joint_psi_hook_can_supply_fixed_beta_termswithout_quadratic_spsi() {
    // A ψ derivative whose design and penalty parts are all zero or absent.
    let deriv = CustomFamilyBlockPsiDerivative {
        penalty_index: None,
        x_psi: Array2::zeros((1, 1)),
        s_psi: Array2::zeros((1, 1)),
        s_psi_components: None,
        s_psi_penalty_components: None,
        x_psi_psi: None,
        s_psi_psi: None,
        s_psi_psi_components: None,
        s_psi_psi_penalty_components: None,
        implicit_operator: None,
        implicit_axis: 0,
        implicit_group_id: None,
    };
    let psi_blocks = [vec![deriv]];
    let spec = ParameterBlockSpec {
        name: "psi_hook".to_string(),
        design: DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(array![[1.0]])),
        offset: array![0.0],
        penalties: vec![],
        nullspace_dims: vec![],
        initial_log_lambdas: Array1::zeros(0),
        initial_beta: Some(array![0.0]),
        gauge_priority: 100,
        jacobian_callback: None,
        stacked_design: None,
        stacked_offset: None,
    };
    let options = BlockwiseFitOptions {
        use_remlobjective: true,
        compute_covariance: false,
        ..BlockwiseFitOptions::default()
    };
    let empty_rho = Array1::<f64>::zeros(0);

    let result = evaluate_custom_family_joint_hyper(
        &OneBlockExactPsiHookFamily,
        &[spec],
        &options,
        &empty_rho,
        &psi_blocks,
        None,
        EvalMode::ValueAndGradient,
    )
    .expect("joint hyper eval with exact joint psi hook");

    assert_eq!(result.gradient.len(), 1);
    let psi_gradient = result.gradient[0];
    assert!(
        (psi_gradient - 3.5).abs() < 1e-12,
        "expected family-supplied joint psi term, got {}",
        psi_gradient
    );
}
/// `fit_custom_family` on `OneBlockIndefinitePseudoLaplaceFamily` must
/// return an error whose text mentions "indefinite" or "below -tol".
#[test]
fn pseudo_laplace_exact_newton_rejects_indefinite_hessian() {
    // #748: an indefinite joint coefficient Hessian (a 1×1 block with H=-1
    // here) is a genuine defect: the curvature is mis-signed / non-convex,
    // or β is not at the inner block optimum. The strict pseudo-Laplace REML
    // logdet has to REJECT such a ρ-trial rather than mask it. Previously the
    // path returned `log|H + δI|` with δ escalated to 10 (H+δI=[[9]],
    // logdet=log 9) and the fit "succeeded", while the analytic REML gradient
    // kept using `tr((H+S_λ)⁻¹·)` on the un-ridged H, so value and gradient
    // described two different objectives. Rejection is the honest signal: the
    // outer optimizer steps back instead of optimizing a biased, δ-shifted
    // surface. Hence the fit now ERRORS where it used to return a masked
    // result.
    let options = BlockwiseFitOptions {
        use_remlobjective: true,
        compute_covariance: false,
        ..BlockwiseFitOptions::default()
    };
    let spec = ParameterBlockSpec {
        name: "indefinite".to_string(),
        design: DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(array![[1.0]])),
        offset: array![0.0],
        penalties: vec![],
        nullspace_dims: vec![],
        initial_log_lambdas: Array1::zeros(0),
        initial_beta: Some(array![0.0]),
        gauge_priority: 100,
        jacobian_callback: None,
        stacked_design: None,
        stacked_offset: None,
    };

    let err = fit_custom_family(&OneBlockIndefinitePseudoLaplaceFamily, &[spec], &options)
        .expect_err(
            "strict pseudo-Laplace must reject the indefinite Hessian H=[[-1]], not δ-ridge mask it",
        )
        .to_string();

    let names_indefiniteness = ["indefinite", "below -tol"]
        .iter()
        .any(|needle| err.contains(*needle));
    assert!(
        names_indefiniteness,
        "rejection error should name the indefiniteness; got: {err}",
    );
}
/// For a fixed 3×3 matrix and ridge 1e-8, `stable_logdet_with_ridge_policy`
/// must return the same value (to 1e-12) under
/// `RidgePolicy::explicit_stabilization_full()` as under
/// `RidgePolicy::explicit_stabilization_full_exact()`.
#[test]
fn auto_determinant_mode_is_exact_full_logdet_policy() {
    let h = array![[6.0, 0.8, 0.1], [0.8, 4.5, 0.4], [0.1, 0.4, 3.2]];
    let ridge = 1e-8;

    let auto_policy = RidgePolicy::explicit_stabilization_full();
    let auto = stable_logdet_with_ridge_policy(&h, ridge, auto_policy).expect("auto logdet");

    let exact_policy = RidgePolicy::explicit_stabilization_full_exact();
    let exact = stable_logdet_with_ridge_policy(&h, ridge, exact_policy).expect("exact logdet");

    let gap = (auto - exact).abs();
    assert!(gap < 1e-12, "auto={auto}, exact={exact}");
}
/// For `H = diag(-1, 2)` with ridge 1e-12 under the pospart stabilization
/// policy, `stable_logdet_with_ridge_policy` must return a finite value
/// equal (to 1e-10) to `Σ ln spectral_regularize(σ_j + ridge, ε)`.
#[test]
fn indefinite_hessian_uses_smooth_regularized_logdet() {
    // H is indefinite with eigenvalues {-1, 2}.
    //
    // Previous behaviour: the -1 direction was silently dropped from the
    // logdet with a warning, and enough repeats escalated to an EFS abort
    // (the first-order fallback marker).
    //
    // Current behaviour: each eigenvalue enters through the smooth
    // regularizer r_ε(σ) = ½(σ + √(σ² + 4ε²)). Nothing is ignored, nothing
    // escalates, and the logdet agrees with what the downstream
    // `DenseSpectralOperator` gradient computes, which removes the
    // cost/gradient mismatch that broke the BFGS line search.
    let ridge = 1e-12_f64;
    let h = array![[-1.0, 0.0], [0.0, 2.0]];
    let policy = RidgePolicy::explicit_stabilization_pospart();
    let logdet = stable_logdet_with_ridge_policy(&h, 1e-12, policy)
        .expect("smooth-regularized logdet must be finite for indefinite H");
    assert!(
        logdet.is_finite(),
        "smooth-regularized logdet should be finite, got {logdet}"
    );

    // Reference: the same formula applied directly to the eigenvalues of
    // H + ridge·I (ridge = 1e-12). Because ε ≫ ridge (spectral_epsilon
    // floors at √(eps_mach) ≈ 1.5e-8 for p=2), the ridge is absorbed into ε
    // and the expected value is Σ log r_ε(σ_j).
    let eps = spectral_epsilon(&[-1.0_f64, 2.0]).max(ridge.max(1e-14));
    // The eigenvalues of A + ridge·I move by 1e-12, negligible next to ε.
    let shifted_eigenvalues = [-1.0_f64 + 1e-12, 2.0 + 1e-12];
    let expected: f64 = shifted_eigenvalues
        .iter()
        .copied()
        .map(|sigma| spectral_regularize(sigma, eps).ln())
        .sum();
    let gap = (logdet - expected).abs();
    assert!(gap < 1e-10, "logdet={logdet}, expected={expected}");
}
/// `fit_custom_family` on `OneBlockNearlySymmetricPseudoLaplaceFamily`
/// (one unpenalized block with a 2×2 identity design) must succeed and
/// report a finite penalized objective.
#[test]
fn pseudo_laplace_exact_newton_symmetrizes_nearly_symmetrichessian() {
    let options = BlockwiseFitOptions {
        use_remlobjective: true,
        compute_covariance: false,
        ..BlockwiseFitOptions::default()
    };
    let identity_design = array![[1.0, 0.0], [0.0, 1.0]];
    let spec = ParameterBlockSpec {
        name: "nearly_symmetric".to_string(),
        design: DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(identity_design)),
        offset: array![0.0, 0.0],
        penalties: vec![],
        nullspace_dims: vec![],
        initial_log_lambdas: Array1::zeros(0),
        initial_beta: Some(array![0.0, 0.0]),
        gauge_priority: 100,
        jacobian_callback: None,
        stacked_design: None,
        stacked_offset: None,
    };

    let fit = fit_custom_family(
        &OneBlockNearlySymmetricPseudoLaplaceFamily,
        &[spec],
        &options,
    )
    .expect("nearly symmetric pseudo-laplace Hessian should be accepted after symmetrization");

    let objective = fit.penalized_objective;
    assert!(
        objective.is_finite(),
        "expected finite pseudo-laplace objective, got {}",
        objective
    );
}
/// Compares the analytic gradient returned by `outerobjective_andgradient`
/// on the binomial location-scale wiggle fixture against a central finite
/// difference of the objective (step 1e-5) in every ρ coordinate.
/// Coordinates where both estimates sit below the round-off noise floor
/// are skipped; otherwise sign and a 2% relative agreement are enforced.
#[test]
fn outer_lamlgradient_matches_finite_differencewhen_joint_exact_path_is_active() {
    crate::solver::visualizer::init_logging();
    let BinomialLocationScaleWiggleOuterFixture {
        family,
        specs,
        penalty_counts,
        rho,
        options: base_options,
    } = binomial_location_scale_wiggle_outer_fixture();
    // The FD/analytic noise floor used below is `EPS·|cost|/h`, which only
    // holds when PIRLS converges to f64 precision. HardPseudo with
    // σ_min~1e-10 amplifies the default 1e-6 inner residual into ~1e-7 of
    // cost slack, lifting both estimators above the machine-precision floor,
    // so the inner tolerance is tightened here.
    let options = BlockwiseFitOptions {
        inner_tol: 1e-12,
        inner_max_cycles: 500,
        ..base_options
    };
    let (f0, g0, _) =
        outerobjective_andgradient(&family, &specs, &options, &penalty_counts, &rho, None)
            .expect("objective/gradient");
    assert!(f0.is_finite());
    assert_eq!(g0.len(), rho.len());

    let h = 1e-5;
    // Noise floor for FD-vs-analytic comparisons.
    //
    // At a rank-deficient optimum (σ_min(H) ≲ ε_machine) the outer REML
    // gradient is a DIFFERENCE of two nearly-equal O(1) quantities,
    // ½ λ_k (H⁺[k,k] − S⁺[k,k]), so the true gradient is very close to
    // zero. The FD estimator `(f_p − f_m)/(2h)` then measures cost-sum
    // round-off: each f64 cost value carries an uncertainty of about
    // EPS · |cost|, and the symmetric difference inflates that by 1/(2h),
    // giving a floor of roughly `EPS · |cost| / h` on |gfd|. Below that
    // floor `|gfd|`, `|g0|` and `sign(gfd)` reflect arithmetic noise, not
    // the true derivative.
    //
    // Concretely, for this test `|cost| ~ 6` and `h = 1e-5`, so the floor
    // is ~1.3e-10 (≈ f64::EPSILON · 6 / 1e-5). It is rounded up to a
    // problem-scale-derived value, and pairs where BOTH |g0| and |gfd| lie
    // below it count as a pass: the assertion is a claim about the TRUE
    // derivative, and a true derivative smaller than the noise is
    // indistinguishable from zero, so its sign is not a correctness
    // property. The floor depends only on `f0` and `h`, so it is computed
    // once.
    let cost_magnitude = f0.abs().max(1.0);
    let noise_floor = (10.0 * f64::EPSILON * cost_magnitude / h).max(1e-9);

    for k in 0..rho.len() {
        let (rho_p, rho_m) = {
            let mut up = rho.clone();
            let mut down = rho.clone();
            up[k] += h;
            down[k] -= h;
            (up, down)
        };
        let (fp, _, _) =
            outerobjective_andgradient(&family, &specs, &options, &penalty_counts, &rho_p, None)
                .expect("objective+");
        let (fm, _, _) =
            outerobjective_andgradient(&family, &specs, &options, &penalty_counts, &rho_m, None)
                .expect("objective-");
        let gfd = (fp - fm) / (2.0 * h);
        let analytic = g0[k];

        let both_in_noise = analytic.abs() < noise_floor && gfd.abs() < noise_floor;
        if both_in_noise {
            continue;
        }
        assert_eq!(
            analytic.signum(),
            gfd.signum(),
            "outer LAML gradient sign mismatch at {}: analytic={} fd={} noise_floor={:.3e}",
            k,
            analytic,
            gfd,
            noise_floor,
        );
        let rel = (analytic - gfd).abs() / gfd.abs().max(noise_floor);
        assert!(
            rel < 2e-2,
            "outer LAML gradient mismatch at {}: analytic={} fd={} rel={} noise_floor={:.3e}",
            k,
            analytic,
            gfd,
            rel,
            noise_floor,
        );
    }
}
/// With no ψ derivatives, `evaluate_custom_family_joint_hyper` must agree
/// with `super::test_support::outerobjectivegradienthessian` on the
/// objective, the gradient and the dense outer Hessian, each to 1e-12.
#[test]
fn rho_only_outer_objective_matches_joint_hyper_when_psi_is_empty() {
    let BinomialLocationScaleWiggleOuterFixture {
        family,
        specs,
        penalty_counts,
        rho,
        options,
    } = binomial_location_scale_wiggle_outer_fixture();

    // ρ-only evaluation.
    let (outer_obj, outer_grad, outer_hessian, _) =
        super::test_support::outerobjectivegradienthessian(
            &family,
            &specs,
            &options,
            &penalty_counts,
            &rho,
            None,
            EvalMode::ValueGradientHessian,
        )
        .expect("rho-only outer objective");

    // Joint evaluation with one empty ψ-derivative list per block.
    let derivative_blocks: Vec<Vec<CustomFamilyBlockPsiDerivative>> =
        (0..specs.len()).map(|_| Vec::new()).collect();
    let joint_result = evaluate_custom_family_joint_hyper(
        &family,
        &specs,
        &options,
        &rho,
        &derivative_blocks,
        None,
        EvalMode::ValueGradientHessian,
    )
    .expect("joint hyper objective with empty psi");

    let objective_gap = (outer_obj - joint_result.objective).abs();
    assert!(
        objective_gap < 1e-12,
        "objective mismatch: rho-only={} joint={}",
        outer_obj,
        joint_result.objective
    );

    assert_eq!(outer_grad.len(), joint_result.gradient.len());
    let max_grad_diff = outer_grad
        .iter()
        .zip(joint_result.gradient.iter())
        .fold(0.0_f64, |worst, (lhs, rhs)| worst.max((lhs - rhs).abs()));
    assert!(
        max_grad_diff < 1e-12,
        "gradient mismatch: max diff={}",
        max_grad_diff
    );

    let outer_hessian = outer_hessian.expect("rho-only outer Hessian");
    let joint_hessian = joint_result
        .outer_hessian
        .materialize_dense()
        .expect("joint outer Hessian should materialize")
        .expect("joint outer Hessian");
    assert_eq!(outer_hessian.dim(), joint_hessian.dim());
    let max_hessian_diff = outer_hessian
        .iter()
        .zip(joint_hessian.iter())
        .fold(0.0_f64, |worst, (lhs, rhs)| worst.max((lhs - rhs).abs()));
    assert!(
        max_hessian_diff < 1e-12,
        "outer Hessian mismatch: max diff={}",
        max_hessian_diff
    );
}
/// Common fixture for the probit binomial location-scale outer-derivative
/// tests (`outer_laml*_binomial_location_scale_*`).
///
/// Builds the `threshold` and `log_sigma` block specs (each a column of
/// ones with a 1×1 identity penalty), the matching
/// `BinomialLocationScaleFamily` with unit weights, the penalty counts
/// `[1, 1]`, and REML outer options. Callers vary only the response `y`
/// and the two blocks' initial coefficients.
///
/// Returns `(family, specs, penalty_counts, options)`, with `specs` ordered
/// as `[threshold, log_sigma]`.
fn binomial_location_scale_outer_fixture(
    y: Array1<f64>,
    threshold_initial_beta: f64,
    log_sigma_initial_beta: f64,
) -> (
    BinomialLocationScaleFamily,
    Vec<ParameterBlockSpec>,
    Vec<usize>,
    BlockwiseFitOptions,
) {
    let n = y.len();
    // Both blocks are intercept-only with one identity penalty; they differ
    // in name and starting coefficient.
    let intercept_block = |name: &str, start: f64| ParameterBlockSpec {
        name: name.to_string(),
        design: DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(Array2::from_elem(
            (n, 1),
            1.0,
        ))),
        offset: Array1::zeros(n),
        penalties: vec![PenaltyMatrix::Dense(Array2::eye(1))],
        nullspace_dims: vec![],
        initial_log_lambdas: array![0.0],
        initial_beta: Some(array![start]),
        gauge_priority: 100,
        jacobian_callback: None,
        stacked_design: None,
        stacked_offset: None,
    };
    let thresholdspec = intercept_block("threshold", threshold_initial_beta);
    let log_sigmaspec = intercept_block("log_sigma", log_sigma_initial_beta);

    // The family keeps its own copies of the two designs.
    let family = BinomialLocationScaleFamily {
        weights: Array1::from_elem(n, 1.0),
        y,
        link_kind: crate::types::InverseLink::Standard(crate::types::StandardLink::Probit),
        threshold_design: Some(thresholdspec.design.clone()),
        log_sigma_design: Some(log_sigmaspec.design.clone()),
        policy: crate::resource::ResourcePolicy::default_library(),
    };
    let options = BlockwiseFitOptions {
        use_remlobjective: true,
        ridge_floor: 1e-10,
        outer_max_iter: 1,
        ..BlockwiseFitOptions::default()
    };

    (
        family,
        vec![thresholdspec, log_sigmaspec],
        vec![1usize, 1usize],
        options,
    )
}
/// Checks the analytic outer gradient from `outerobjective_andgradient`
/// against a central finite difference (step 1e-5) at ρ = (0, 0) on a
/// balanced 8-observation response. Agreement is accepted at absolute or
/// relative error below 2e-3; signs are compared only once the absolute
/// error reaches 2e-3.
#[test]
fn outer_lamlgradient_diagonal_binomial_location_scale_matchesfd() {
    let y = Array1::from_vec(vec![0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0]);
    let (family, specs, penalty_counts, options) =
        binomial_location_scale_outer_fixture(y, 0.0, 0.0);
    let rho = array![0.0, 0.0];

    let (f0, g0, _) =
        outerobjective_andgradient(&family, &specs, &options, &penalty_counts, &rho, None)
            .expect("objective/gradient");
    assert!(f0.is_finite());
    assert_eq!(g0.len(), rho.len());

    let h = 1e-5;
    for k in 0..rho.len() {
        let (rho_p, rho_m) = {
            let mut up = rho.clone();
            let mut down = rho.clone();
            up[k] += h;
            down[k] -= h;
            (up, down)
        };
        let (fp, _, _) =
            outerobjective_andgradient(&family, &specs, &options, &penalty_counts, &rho_p, None)
                .expect("objective+");
        let (fm, _, _) =
            outerobjective_andgradient(&family, &specs, &options, &penalty_counts, &rho_m, None)
                .expect("objective-");

        let gfd = (fp - fm) / (2.0 * h);
        let analytic = g0[k];
        let abs = (analytic - gfd).abs();
        let rel = abs / gfd.abs().max(1e-8);
        if abs >= 2e-3 {
            assert_eq!(
                analytic.signum(),
                gfd.signum(),
                "outer diagonal LAML gradient sign mismatch at {}: analytic={} fd={}",
                k,
                analytic,
                gfd
            );
        }
        assert!(
            abs < 2e-3 || rel < 2e-3,
            "outer diagonal LAML gradient mismatch at {}: analytic={} fd={} abs={} rel={}",
            k,
            analytic,
            gfd,
            abs,
            rel
        );
    }
}
/// Checks the analytic outer gradient from `outerobjective_andgradient`
/// against a central finite difference (step 1e-5) at ρ = (0.15, −0.25)
/// on a 9-observation response with initial betas (0.2, −0.1). Agreement
/// is accepted at absolute or relative error below 2e-3; signs are
/// compared only once the absolute error reaches 2e-3.
#[test]
fn outer_lamlgradient_diagonal_binomial_location_scale_hard_case_matchesfd() {
    let y = Array1::from_vec(vec![0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0]);
    let (family, specs, penalty_counts, options) =
        binomial_location_scale_outer_fixture(y, 0.2, -0.1);
    let rho = array![0.15, -0.25];

    let (f0, g0, _) =
        outerobjective_andgradient(&family, &specs, &options, &penalty_counts, &rho, None)
            .expect("objective/gradient");
    assert!(f0.is_finite());
    assert_eq!(g0.len(), rho.len());

    let h = 1e-5;
    for k in 0..rho.len() {
        let (rho_p, rho_m) = {
            let mut up = rho.clone();
            let mut down = rho.clone();
            up[k] += h;
            down[k] -= h;
            (up, down)
        };
        let (fp, _, _) =
            outerobjective_andgradient(&family, &specs, &options, &penalty_counts, &rho_p, None)
                .expect("objective+");
        let (fm, _, _) =
            outerobjective_andgradient(&family, &specs, &options, &penalty_counts, &rho_m, None)
                .expect("objective-");

        let gfd = (fp - fm) / (2.0 * h);
        let analytic = g0[k];
        let abs = (analytic - gfd).abs();
        let rel = abs / gfd.abs().max(1e-8);
        if abs >= 2e-3 {
            assert_eq!(
                analytic.signum(),
                gfd.signum(),
                "outer diagonal hard-case LAML gradient sign mismatch at {}: analytic={} fd={}",
                k,
                analytic,
                gfd
            );
        }
        assert!(
            abs < 2e-3 || rel < 2e-3,
            "outer diagonal hard-case LAML gradient mismatch at {}: analytic={} fd={} abs={} rel={}",
            k,
            analytic,
            gfd,
            abs,
            rel
        );
    }
}
/// Validates the analytic outer Hessian from
/// `super::test_support::outerobjectivegradienthessian` against central
/// finite differences of the analytic gradient (step 1e-5), entry by
/// entry, and then checks that the analytic Hessian is symmetric to 1e-8.
#[test]
fn outer_lamlhessian_joint_exact_binomial_location_scale_matchesfd() {
    // The response is deliberately asymmetric (6 ones / 4 zeros). A balanced
    // 5/5 vector forces β̂_threshold = 0 by probit-link symmetry, which makes
    // the joint observed Hessian block-diagonal in (threshold, log_sigma) at
    // the inner mode. The outer LAML Hessian off-diagonals are then ~1e-11,
    // below the central-FD noise floor (≈ pirls_tol / h) at h=1e-5, so
    // FD-vs-analytic agreement could not be enforced. With an asymmetric y,
    // β̂_threshold ≠ 0 couples the (β_0, β_1) blocks through the
    // observed-information weights and all four entries become validatable.
    let y = Array1::from_vec(vec![0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0]);
    let (family, specs, penalty_counts, options) =
        binomial_location_scale_outer_fixture(y, 0.15, -0.05);
    let rho = array![0.1, -0.2];

    let (_, _, h0_opt, _) = super::test_support::outerobjectivegradienthessian(
        &family,
        &specs,
        &options,
        &penalty_counts,
        &rho,
        None,
        EvalMode::ValueGradientHessian,
    )
    .expect("objective/gradient/hessian");
    let h0 = h0_opt.expect("analytic outer Hessian should be available");
    assert_eq!(h0.nrows(), rho.len());
    assert_eq!(h0.ncols(), rho.len());

    let h = 1e-5;
    for l in 0..rho.len() {
        // Gradients at ρ ± h·e_l give column l of the FD Hessian.
        let (rho_p, rho_m) = {
            let mut up = rho.clone();
            let mut down = rho.clone();
            up[l] += h;
            down[l] -= h;
            (up, down)
        };
        let (_, gp, _, _) = super::test_support::outerobjectivegradienthessian(
            &family,
            &specs,
            &options,
            &penalty_counts,
            &rho_p,
            None,
            EvalMode::ValueAndGradient,
        )
        .expect("objective/gradient +");
        let (_, gm, _, _) = super::test_support::outerobjectivegradienthessian(
            &family,
            &specs,
            &options,
            &penalty_counts,
            &rho_m,
            None,
            EvalMode::ValueAndGradient,
        )
        .expect("objective/gradient -");

        for k in 0..rho.len() {
            let analytic = h0[[k, l]];
            let hfd = (gp[k] - gm[k]) / (2.0 * h);
            let abs_err = (analytic - hfd).abs();
            let rel = (analytic - hfd).abs() / hfd.abs().max(1e-7);
            // Signs are compared only when either estimate exceeds 1e-10.
            if analytic.abs().max(hfd.abs()) > 1e-10 {
                assert_eq!(
                    analytic.signum(),
                    hfd.signum(),
                    "outer Hessian sign mismatch at ({k},{l}): analytic={} fd={}",
                    analytic,
                    hfd
                );
            }
            assert!(
                abs_err < 1e-8 || rel < 2e-2,
                "outer Hessian mismatch at ({k},{l}): analytic={} fd={} abs={} rel={}",
                analytic,
                hfd,
                abs_err,
                rel
            );
        }
    }

    // Symmetry of the analytic Hessian, strictly-lower triangle vs. upper.
    for i in 0..h0.nrows() {
        for j in 0..i {
            let asym = (h0[[i, j]] - h0[[j, i]]).abs();
            assert!(
                asym < 1e-8,
                "outer Hessian not symmetric at ({i},{j}): {asym}"
            );
        }
    }
}
/// Solving the same weighted, penalized block system through a dense and a
/// sparse design representation must produce matching coefficients.
#[test]
fn block_solve_sparse_matches_dense() {
    let x_dense = array![
        [1.0, 0.0, 2.0],
        [0.0, 3.0, 0.0],
        [4.0, 0.0, 5.0],
        [0.0, 6.0, 0.0]
    ];
    let y_star = array![1.0, -1.0, 0.5, 2.0];
    let w = array![1.0, 0.5, 2.0, 1.5];
    let s_lambda = Array2::<f64>::eye(3) * 0.1;

    // Gather the structural non-zeros of the dense design in row-major order.
    let triplets: Vec<_> = x_dense
        .indexed_iter()
        .filter_map(|((row, col), &value)| {
            (value != 0.0).then(|| Triplet::new(row, col, value))
        })
        .collect();
    let x_sparse = SparseColMat::try_new_from_triplets(4, 3, &triplets)
        .expect("sparse matrix build should succeed");

    let dense_design =
        DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(x_dense.clone()));
    let beta_dense = solve_blockweighted_system(
        &dense_design,
        &y_star,
        &w,
        &s_lambda,
        1e-12,
        RidgePolicy::explicit_stabilization_pospart(),
    )
    .expect("dense solve should succeed");

    let sparse_design = DesignMatrix::from(x_sparse);
    let beta_sparse = solve_blockweighted_system(
        &sparse_design,
        &y_star,
        &w,
        &s_lambda,
        1e-12,
        RidgePolicy::explicit_stabilization_pospart(),
    )
    .expect("sparse solve should succeed");

    // Compare coefficient by coefficient.
    for j in 0..beta_dense.len() {
        let (from_dense, from_sparse) = (beta_dense[j], beta_sparse[j]);
        let gap = (from_dense - from_sparse).abs();
        assert!(
            gap < 1e-10,
            "dense/sparse mismatch at {}: {} vs {}",
            j,
            from_dense,
            from_sparse
        );
    }
}
/// Compares the analytic outer Hessian against central finite differences of
/// the outer gradient for the nine-observation binomial location-scale fixture.
#[test]
fn outer_lamlhessian_joint_exact_binomial_location_scale_hard_case_matchesfd() {
    let y = Array1::from_vec(vec![0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0]);
    let (family, specs, penalty_counts, options) =
        binomial_location_scale_outer_fixture(y, 0.2, -0.1);
    let rho = array![0.15, -0.25];
    // Central finite-difference step.
    let h = 1e-5;

    let (_, _, h0_opt, _) = super::test_support::outerobjectivegradienthessian(
        &family,
        &specs,
        &options,
        &penalty_counts,
        &rho,
        None,
        EvalMode::ValueGradientHessian,
    )
    .expect("objective/gradient/hessian");
    let h0 = h0_opt.expect("analytic outer Hessian should be available");
    assert_eq!(h0.nrows(), rho.len());
    assert_eq!(h0.ncols(), rho.len());

    // Outer gradient at a perturbed rho; `failure` is the panic message used
    // if the evaluation errors.
    let gradient_at = |point: &Array1<f64>, failure: &str| {
        let (_, gradient, _, _) = super::test_support::outerobjectivegradienthessian(
            &family,
            &specs,
            &options,
            &penalty_counts,
            point,
            None,
            EvalMode::ValueAndGradient,
        )
        .expect(failure);
        gradient
    };

    for l in 0..rho.len() {
        let mut rho_p = rho.clone();
        rho_p[l] += h;
        let mut rho_m = rho.clone();
        rho_m[l] -= h;
        let gp = gradient_at(&rho_p, "objective/gradient +");
        let gm = gradient_at(&rho_m, "objective/gradient -");

        for k in 0..rho.len() {
            let analytic = h0[[k, l]];
            let hfd = (gp[k] - gm[k]) / (2.0 * h);
            let abs_err = (analytic - hfd).abs();
            let rel = abs_err / hfd.abs().max(1e-7);

            // Signs are only compared when at least one side is non-negligible.
            let significant = analytic.abs().max(hfd.abs()) > 1e-10;
            if significant {
                assert_eq!(
                    analytic.signum(),
                    hfd.signum(),
                    "hard-case outer Hessian sign mismatch at ({k},{l}): analytic={} fd={}",
                    analytic,
                    hfd
                );
            }
            let within_tolerance = abs_err < 1e-8 || rel < 2e-2;
            assert!(
                within_tolerance,
                "hard-case outer Hessian mismatch at ({k},{l}): analytic={} fd={} abs={} rel={}",
                analytic,
                hfd,
                abs_err,
                rel
            );
        }
    }
}
/// A block system with a tiny negative penalty entry and an all-zero design
/// column must still be solved: the result is finite, recovers the identified
/// coefficient, and leaves the unidentified coefficient near zero.
#[test]
fn block_solve_falls_backwhen_llt_rejects_indefinite_system() {
    let design = DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(array![
        [1.0, 0.0],
        [0.0, 0.0]
    ]));
    let working_response = array![2.0, 0.0];
    let weights = array![1.0, 1.0];
    // Slightly negative second diagonal entry.
    let penalty = array![[0.0, 0.0], [0.0, -1e-12]];

    let beta = solve_blockweighted_system(
        &design,
        &working_response,
        &weights,
        &penalty,
        1e-12,
        RidgePolicy::explicit_stabilization_pospart(),
    )
    .expect("fallback solve should succeed");

    assert!(beta.iter().all(|v| v.is_finite()));
    let identified_gap = (beta[0] - 2.0).abs();
    assert!(identified_gap < 1e-10, "unexpected solved coefficient");
    let null_magnitude = beta[1].abs();
    assert!(
        null_magnitude < 1e-8,
        "null-space coefficient should stay near zero"
    );
}
/// Fits a one-coefficient exact-Newton family constructed with `target = 0.0`
/// and `lower = 1.0`, starting from 1.5, and checks the fitted coefficient is 1.0.
#[test]
fn exact_newton_block_enforces_linear_constraints() {
    let family = OneBlockConstrainedExactFamily {
        lower: 1.0,
        target: 0.0,
    };
    let design = DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(array![[1.0]]));
    let spec = ParameterBlockSpec {
        name: "exact_block".to_string(),
        design,
        offset: array![0.0],
        penalties: vec![],
        nullspace_dims: vec![],
        initial_log_lambdas: Array1::zeros(0),
        initial_beta: Some(array![1.5]),
        gauge_priority: 100,
        jacobian_callback: None,
        stacked_design: None,
        stacked_offset: None,
    };
    let options = BlockwiseFitOptions::default();

    let fit = fit_custom_family(&family, &[spec], &options)
        .expect("constrained exact-newton fit");

    let beta = fit.block_states[0].beta[0];
    let distance_to_bound = (beta - 1.0).abs();
    assert!(
        distance_to_bound < 1e-8,
        "expected constrained optimum at lower bound, got {beta}"
    );
}
/// Rows with a single non-zero entry are extracted as per-coefficient lower
/// bounds; the expected bound is 0.5 for both coefficients, mapped to rows 2
/// and 1 respectively.
#[test]
fn extract_simple_lower_bounds_accepts_axis_aligned_rows() {
    let a = array![[1.0, 0.0], [0.0, 2.0], [3.0, 0.0]];
    let b = array![0.25, 1.0, 1.5];
    let constraints = LinearInequalityConstraints { a, b };

    let extraction = extract_simple_lower_bounds(&constraints, 2)
        .expect("lower-bound extraction should succeed");
    let bounds = extraction.expect("axis-aligned rows should map to lower bounds");

    assert_relative_eq!(bounds.lower_bounds[0], 0.5, epsilon = 1e-12);
    assert_relative_eq!(bounds.lower_bounds[1], 0.5, epsilon = 1e-12);
    let expected_rows = vec![Some(2), Some(1)];
    assert_eq!(bounds.coeff_to_row, expected_rows);
}
/// A constraint row touching two coefficients is not a simple lower bound, so
/// extraction must succeed but yield `None`.
#[test]
fn extract_simple_lower_bounds_rejects_coupled_rows() {
    let a = array![[1.0, 1.0]];
    let b = array![0.0];
    let constraints = LinearInequalityConstraints { a, b };

    let extracted = extract_simple_lower_bounds(&constraints, 2)
        .expect("lower-bound extraction should not error on valid shapes");

    assert!(
        extracted.is_none(),
        "coupled rows must stay on the generic linear-constraint path"
    );
}
/// With a negative 1x1 Hessian and the constraint `beta >= 1`, the constrained
/// exact-Newton update starting at 1.5 must still succeed and return 1.0 with
/// constraint row 0 active.
#[test]
fn constrained_exact_newton_indefinite_hessian_uses_stabilized_delta_solve() {
    let design = DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(array![[1.0]]));
    let spec = ParameterBlockSpec {
        name: "exact_block".to_string(),
        design,
        offset: array![0.0],
        penalties: vec![],
        nullspace_dims: vec![],
        initial_log_lambdas: Array1::zeros(0),
        initial_beta: Some(array![1.5]),
        gauge_priority: 100,
        jacobian_callback: None,
        stacked_design: None,
        stacked_offset: None,
    };
    let states = vec![ParameterBlockState {
        beta: array![1.5],
        eta: array![1.5],
    }];
    let constraints = LinearInequalityConstraints {
        a: array![[1.0]],
        b: array![1.0],
    };

    // Indefinite (negative) curvature for the single coefficient.
    let gradient = array![-1.0];
    let hessian = SymmetricMatrix::Dense(array![[-1.0]]);
    let updater = ExactNewtonBlockUpdater {
        gradient: &gradient,
        hessian: &hessian,
    };

    let s_lambda = Array2::zeros((1, 1));
    let options = BlockwiseFitOptions::default();
    let context = BlockUpdateContext {
        family: &OneBlockConstrainedIndefiniteHessianFamily,
        states: &states,
        spec: &spec,
        block_idx: 0,
        s_lambda: &s_lambda,
        options: &options,
        linear_constraints: Some(&constraints),
        cached_active_set: None,
    };
    let update = updater
        .compute_update_step(&context)
        .expect("indefinite constrained exact-newton update should be stabilized");

    assert_relative_eq!(update.beta_new_raw[0], 1.0, epsilon = 1e-12);
    assert_eq!(update.active_set, Some(vec![0]));
}
/// Starting on the lower bound of `0 <= x <= 0.1`, the solver must leave that
/// bound and finish on the upper bound with exactly one active row.
#[test]
fn quadratic_linear_constraints_release_positive_kkt_systemmultiplier() {
    // Maximizing the log-likelihood with exact Newton is equivalent to
    // minimizing 0.5 * x^2 - rhs * x with rhs = 1 under 0 <= x <= 0.1.
    // At x = 0 the active-set KKT solve gives lambda_sys = +1 for the lower
    // bound, which must be released (lambda_true = -lambda_sys).
    let constraints = LinearInequalityConstraints {
        a: array![[1.0], [-1.0]],
        b: array![0.0, -0.1],
    };
    let hessian = array![[1.0]];
    let rhs = array![1.0];
    let beta_start = array![0.0];

    let (beta, active) = solve_quadratic_with_linear_constraints(
        &hessian,
        &rhs,
        &beta_start,
        &constraints,
        None,
    )
    .expect("constrained quadratic solve should succeed");

    let distance_to_upper = (beta[0] - 0.1).abs();
    assert!(
        distance_to_upper <= 1e-10,
        "expected constrained optimum at upper bound 0.1, got {}",
        beta[0]
    );
    assert_eq!(active.len(), 1);
}
/// A single inactive constraint row whose first coefficient is -1e-16 must not
/// stop the solver from reaching the unconstrained solution (1, 0).
#[test]
fn quadratic_linear_constraints_ignore_near_tangential_inactiverows() {
    let constraints = LinearInequalityConstraints {
        a: array![[-1e-16, 1.0]],
        b: array![-1.0],
    };
    let hessian = array![[1.0, 0.0], [0.0, 1.0]];
    let rhs = array![1.0, 0.0];
    let beta_start = array![0.0, 0.0];

    let (beta, active) = solve_quadratic_with_linear_constraints(
        &hessian,
        &rhs,
        &beta_start,
        &constraints,
        None,
    )
    .expect("near-tangential inactive row should not block the quadratic step");

    let x_gap = (beta[0] - 1.0).abs();
    assert!(
        x_gap <= 1e-12,
        "expected unconstrained x-solution of 1.0, got {}",
        beta[0]
    );
    let y_magnitude = beta[1].abs();
    assert!(
        y_magnitude <= 1e-12,
        "expected zero y-solution, got {}",
        beta[1]
    );
    assert!(active.is_empty(), "no row should become active");
}
/// A warm-started active row with a start point 1e-9 inside the feasible
/// region must end exactly on the boundary, with the row still active.
#[test]
fn quadratic_linear_constraints_projectwarm_activerows_back_to_boundary() {
    let constraints = LinearInequalityConstraints {
        a: array![[1.0]],
        b: array![0.0],
    };
    let hessian = array![[2.0]];
    let rhs = array![0.0];
    // Slightly off the boundary x = 0.
    let beta_start = array![1e-9];

    let (beta, active) = solve_quadratic_with_linear_constraints(
        &hessian,
        &rhs,
        &beta_start,
        &constraints,
        Some(&[0]),
    )
    .expect("constrained quadratic solve should project back to the boundary");

    assert_relative_eq!(beta[0], 0.0, epsilon = 1e-14);
    assert_eq!(active, vec![0]);
}
/// Three warm-started active rows in R^2, the third almost the sum of the
/// first two: the solve must land on the origin and report at most two
/// active rows.
#[test]
fn quadratic_linear_constraints_handles_near_dependent_rows() {
    // Constraints: x1 >= 0, x2 >= 0, and (1 + eps) * x1 + x2 >= 0 with a tiny
    // eps, so the third row is nearly a linear combination of the other two
    // and a naive KKT system would be ill-conditioned.
    //
    // Objective: minimize 0.5 * ||x - [-1, -1]||^2, whose constrained optimum
    // is the origin.
    let eps = 1e-14;
    let constraints = LinearInequalityConstraints {
        a: array![[1.0, 0.0], [0.0, 1.0], [1.0 + eps, 1.0]],
        b: array![0.0, 0.0, 0.0],
    };
    let hessian = Array2::eye(2);
    // The right-hand side pulls toward (-1, -1).
    let rhs = array![-1.0, -1.0];
    let beta_start = array![0.0, 0.0];

    let (beta, active) = solve_quadratic_with_linear_constraints(
        &hessian,
        &rhs,
        &beta_start,
        &constraints,
        // Warm start with every row marked active.
        Some(&[0, 1, 2]),
    )
    .expect("near-dependent constraint QP should converge");

    let (x1, x2) = (beta[0], beta[1]);
    let at_origin = x1.abs() <= 1e-10 && x2.abs() <= 1e-10;
    assert!(at_origin, "expected optimum at origin, got ({}, {})", x1, x2);

    let active_count = active.len();
    assert!(
        active_count <= 2,
        "at most 2 independent constraints should remain active, got {}",
        active_count
    );
}
/// Two redundant lower-bound rows are warm-started as active; the solve must
/// end on the upper bound with only row 2 active.
#[test]
fn quadratic_linear_constraints_release_merged_constraint_group_by_id() {
    // Rows 0 and 1 describe the same lower bound and compress into a single
    // active KKT row. Releasing that merged row has to drop both original
    // constraint ids rather than transient positions in the active vector.
    let constraints = LinearInequalityConstraints {
        a: array![[1.0], [2.0], [-1.0]],
        b: array![0.0, 0.0, -0.1],
    };
    let hessian = array![[1.0]];
    let rhs = array![1.0];
    let beta_start = array![0.0];

    let (beta, active) = solve_quadratic_with_linear_constraints(
        &hessian,
        &rhs,
        &beta_start,
        &constraints,
        Some(&[0, 1]),
    )
    .expect("merged active constraint group should release cleanly");

    let distance_to_upper = (beta[0] - 0.1).abs();
    assert!(
        distance_to_upper <= 1e-10,
        "expected constrained optimum at upper bound 0.1, got {}",
        beta[0]
    );
    assert_eq!(active, vec![2]);
}
/// Same redundant-lower-bound problem as the by-id release test, but the warm
/// active set is passed as `[2, 0, 1]`; the result must still be the upper
/// bound with only row 2 active.
#[test]
fn quadratic_linear_constraints_release_merged_group_with_unsorted_active_positions() {
    let constraints = LinearInequalityConstraints {
        a: array![[1.0], [2.0], [-1.0]],
        b: array![0.0, 0.0, -0.1],
    };
    let hessian = array![[1.0]];
    let rhs = array![1.0];
    let beta_start = array![0.0];

    let (beta, active) = solve_quadratic_with_linear_constraints(
        &hessian,
        &rhs,
        &beta_start,
        &constraints,
        // Deliberately not in ascending order.
        Some(&[2, 0, 1]),
    )
    .expect("merged active group release should handle unsorted active positions");

    let distance_to_upper = (beta[0] - 0.1).abs();
    assert!(
        distance_to_upper <= 1e-10,
        "expected constrained optimum at upper bound 0.1, got {}",
        beta[0]
    );
    assert_eq!(active, vec![2]);
}
/// Four positively scaled copies of the same bound `x >= 0`, all warm-started
/// as active: the solve must return x = 0 and report at most one active row.
#[test]
fn quadratic_linear_constraints_accept_boundary_kkt_after_rank_reduction() {
    let constraints = LinearInequalityConstraints {
        a: array![[1.0], [1.0 + 1e-13], [2.0], [3.0]],
        b: array![0.0, 0.0, 0.0, 0.0],
    };
    let hessian = array![[2.0]];
    let rhs = array![0.0];
    let beta_start = array![1e-9];

    let (beta, active) = solve_quadratic_with_linear_constraints(
        &hessian,
        &rhs,
        &beta_start,
        &constraints,
        Some(&[0, 1, 2, 3]),
    )
    .expect("degenerate boundary KKT point should be accepted");

    assert_relative_eq!(beta[0], 0.0, epsilon = 1e-14);
    let few_enough = active.len() <= 1;
    assert!(
        few_enough,
        "rank-reduced boundary solution should keep at most one representative, got {:?}",
        active
    );
}
/// With an all-zero Hessian and one active coupled row, the solve must still
/// return a finite point: the origin, with row 0 kept active.
#[test]
fn quadratic_linear_constraints_singular_kkt_uses_pseudoinverse_fallback() {
    let constraints = LinearInequalityConstraints {
        a: array![[1.0, 1.0]],
        b: array![0.0],
    };
    // Zero curvature.
    let hessian = Array2::<f64>::zeros((2, 2));
    let rhs = array![0.0, 0.0];
    let beta_start = array![0.0, 0.0];

    let (beta, active) = solve_quadratic_with_linear_constraints(
        &hessian,
        &rhs,
        &beta_start,
        &constraints,
        Some(&[0]),
    )
    .expect("singular KKT system should fall back to a finite pseudoinverse solve");

    assert!(beta.iter().all(|value| value.is_finite()));
    assert_relative_eq!(beta[0], 0.0, epsilon = 1e-14);
    assert_relative_eq!(beta[1], 0.0, epsilon = 1e-14);
    assert_eq!(active, vec![0]);
}
/// A 3x3 constraint matrix whose last row is the exact sum of the first two
/// must be reduced to two rows while every original constraint id is kept.
#[test]
fn rank_reduce_drops_exactly_dependent_row() {
    // Row 3 = Row 1 + Row 2 exactly, so rank reduction should drop one row.
    let rows = array![[1.0, 0.0, 0.0], [0.0, 1.0, 0.0], [1.0, 1.0, 0.0],];
    let offsets = array![0.0, 0.0, 0.0];
    let ids = vec![vec![0], vec![1], vec![2]];

    let (a_out, b_out, member_constraint_ids_out, _) =
        crate::solver::active_set::rank_reduce_rows_pivoted_qr_with_dependence(
            rows, offsets, ids,
        );

    let kept_rows = a_out.nrows();
    assert_eq!(
        kept_rows, 2,
        "should keep 2 independent rows, got {}",
        kept_rows
    );
    assert_eq!(b_out.len(), 2);

    // Ids of dropped rows must be folded into the surviving groups.
    let total_constraint_ids = member_constraint_ids_out
        .iter()
        .fold(0usize, |count, group| count + group.len());
    assert_eq!(
        total_constraint_ids, 3,
        "all original constraint ids must be preserved"
    );
}
/// A 3x2 constraint matrix of column rank 2 must come back with two rows and
/// with all three original constraint ids still accounted for.
#[test]
fn rank_reduce_preserves_full_rank_matrix() {
    let rows = array![[1.0, 0.0], [0.0, 1.0], [1.0, 1.0],];
    let offsets = array![0.0, 0.0, 0.0];
    let ids = vec![vec![0], vec![1], vec![2]];

    let (a_out, b_out, member_constraint_ids_out, _) =
        crate::solver::active_set::rank_reduce_rows_pivoted_qr_with_dependence(
            rows, offsets, ids,
        );

    // Three rows in R^2 have rank at most 2: the first two span R^2 and the
    // third equals their sum, so only two rows can remain.
    assert_eq!(a_out.nrows(), 2);
    assert_eq!(b_out.len(), 2);

    let total_constraint_ids = member_constraint_ids_out
        .iter()
        .fold(0usize, |count, group| count + group.len());
    assert_eq!(total_constraint_ids, 3);
}
/// With a NaN 1x1 Hessian and the constraint `beta >= 0` while sitting at 0,
/// the constrained exact-Newton update must succeed, leave beta at 0, and
/// report constraint row 0 as active.
#[test]
fn constrained_exact_newton_nan_hessian_returns_feasible_noop_instead_of_failing() {
    let design = DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(array![[1.0]]));
    let spec = ParameterBlockSpec {
        name: "exact_block".to_string(),
        design,
        offset: array![0.0],
        penalties: vec![],
        nullspace_dims: vec![],
        initial_log_lambdas: Array1::zeros(0),
        initial_beta: Some(array![0.0]),
        gauge_priority: 100,
        jacobian_callback: None,
        stacked_design: None,
        stacked_offset: None,
    };
    let states = vec![ParameterBlockState {
        beta: array![0.0],
        eta: array![0.0],
    }];
    let constraints = LinearInequalityConstraints {
        a: array![[1.0]],
        b: array![0.0],
    };

    // Non-finite curvature paired with a zero gradient.
    let gradient = array![0.0];
    let hessian = SymmetricMatrix::Dense(array![[f64::NAN]]);
    let updater = ExactNewtonBlockUpdater {
        gradient: &gradient,
        hessian: &hessian,
    };

    let s_lambda = Array2::zeros((1, 1));
    let options = BlockwiseFitOptions::default();
    let context = BlockUpdateContext {
        family: &OneBlockConstrainedNaNHessianFamily,
        states: &states,
        spec: &spec,
        block_idx: 0,
        s_lambda: &s_lambda,
        options: &options,
        linear_constraints: Some(&constraints),
        cached_active_set: None,
    };
    let update = updater
        .compute_update_step(&context)
        .expect("constrained exact-newton NaN Hessian should produce a no-op update");

    assert_relative_eq!(update.beta_new_raw[0], 0.0, epsilon = 1e-14);
    assert_eq!(update.active_set, Some(vec![0]));
}
/// When the family's evaluation always errors, the fit error must carry the
/// underlying objective failure text.
#[test]
fn outerobjective_failure_context_is_preserved() {
    // A single penalty makes the outer rho optimizer run; the fit error is
    // expected to surface the real evaluation failure rather than an opaque
    // line-search message.
    let design = DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(array![
        [1.0],
        [1.0]
    ]));
    let spec = ParameterBlockSpec {
        name: "err_block".to_string(),
        design,
        offset: array![0.0, 0.0],
        penalties: vec![PenaltyMatrix::Dense(Array2::eye(1))],
        nullspace_dims: vec![],
        initial_log_lambdas: array![0.0],
        initial_beta: Some(array![0.0]),
        gauge_priority: 100,
        jacobian_callback: None,
        stacked_design: None,
        stacked_offset: None,
    };
    let options = BlockwiseFitOptions {
        outer_max_iter: 3,
        ..BlockwiseFitOptions::default()
    };

    let outcome = fit_custom_family(&OneBlockAlwaysErrorFamily, &[spec], &options);
    let err = if let Err(failure) = outcome {
        failure
    } else {
        panic!("fit should fail when family evaluate always errors")
    };

    let rendered = err.to_string();
    let has_root_cause = rendered
        .contains("last objective error: synthetic outer objective failure: block[0] evaluate()");
    assert!(
        has_root_cause,
        "expected preserved root-cause context in error, got: {err}"
    );
}
/// With `compute_covariance` enabled and a family whose covariance assembly
/// fails, the fit must return an error naming that failure.
#[test]
fn fit_fails_when_requested_covariance_cannot_be_computed() {
    let design = DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(array![
        [1.0],
        [1.0]
    ]));
    let spec = ParameterBlockSpec {
        name: "cov_block".to_string(),
        design,
        offset: array![0.0, 0.0],
        penalties: vec![],
        nullspace_dims: vec![],
        initial_log_lambdas: Array1::zeros(0),
        initial_beta: Some(array![0.0]),
        gauge_priority: 100,
        jacobian_callback: None,
        stacked_design: None,
        stacked_offset: None,
    };
    let options = BlockwiseFitOptions {
        use_remlobjective: false,
        compute_covariance: true,
        ..BlockwiseFitOptions::default()
    };

    let outcome = fit_custom_family(&OneBlockCovarianceErrorFamily, &[spec], &options);
    let err = if let Err(failure) = outcome {
        failure
    } else {
        panic!("fit should fail when covariance computation fails")
    };

    let rendered = err.to_string();
    let has_root_cause = rendered.contains("synthetic covariance assembly failure");
    assert!(
        has_root_cause,
        "expected covariance root cause in fit error, got: {err}"
    );
}
// Exact analytic Hessians must be finite. Non-finite Hessians are rejected
// loudly instead of being masked by a surrogate update.
/// Two-block test family whose second (log_sigma) block reports an
/// exact-Newton Hessian with NaN in its top-left entry, mimicking an
/// `exp(eta_sigma)` overflow during location-scale fitting.
#[derive(Clone)]
struct TwoBlockNaNHessianFamily;
impl CustomFamily for TwoBlockNaNHessianFamily {
    /// Returns a quadratic log-likelihood in block 0's linear predictor, a
    /// unit-weight diagonal working set for block 0, and a zero-gradient
    /// exact-Newton working set with a NaN-poisoned identity Hessian for block 1.
    fn evaluate(
        &self,
        block_states: &[ParameterBlockState],
    ) -> Result<FamilyEvaluation, String> {
        let mu_rows = block_states[0].eta.len();
        let sigma_dim = block_states[1].beta.len();

        // Block 1 (log_sigma): identity Hessian with the first diagonal
        // entry replaced by NaN to simulate overflow.
        let mut poisoned = Array2::<f64>::eye(sigma_dim);
        poisoned[[0, 0]] = f64::NAN;

        // Block 0 (mu): well-behaved diagonal working set.
        let mu_working_set = BlockWorkingSet::Diagonal {
            working_response: Array1::zeros(mu_rows),
            working_weights: Array1::ones(mu_rows),
        };
        let sigma_working_set = BlockWorkingSet::ExactNewton {
            gradient: Array1::zeros(sigma_dim),
            hessian: SymmetricMatrix::Dense(poisoned),
        };

        let log_likelihood = -0.5 * block_states[0].eta.iter().map(|&v| v * v).sum::<f64>();
        Ok(FamilyEvaluation {
            log_likelihood,
            blockworking_sets: vec![mu_working_set, sigma_working_set],
        })
    }
}
/// Control counterpart of the NaN-Hessian family: the same two-block layout,
/// but every returned Hessian entry is finite.
#[derive(Clone)]
struct TwoBlockFiniteHessianFamily;
impl CustomFamily for TwoBlockFiniteHessianFamily {
    /// Returns a quadratic log-likelihood in block 0's linear predictor and
    /// block 1's coefficients, a unit-weight diagonal working set for block 0,
    /// and an identity-Hessian exact-Newton working set for block 1.
    fn evaluate(
        &self,
        block_states: &[ParameterBlockState],
    ) -> Result<FamilyEvaluation, String> {
        let mu_rows = block_states[0].eta.len();
        let sigma_dim = block_states[1].beta.len();
        let sigma_beta = &block_states[1].beta;
        let sigma_sq_norm: f64 = sigma_beta.iter().map(|&b| b * b).sum();

        let mu_working_set = BlockWorkingSet::Diagonal {
            working_response: Array1::zeros(mu_rows),
            working_weights: Array1::ones(mu_rows),
        };
        let sigma_working_set = BlockWorkingSet::ExactNewton {
            gradient: sigma_beta.mapv(|b| -b),
            hessian: SymmetricMatrix::Dense(Array2::eye(sigma_dim)),
        };

        let log_likelihood = -0.5 * block_states[0].eta.iter().map(|&v| v * v).sum::<f64>()
            - 0.5 * sigma_sq_norm;
        Ok(FamilyEvaluation {
            log_likelihood,
            blockworking_sets: vec![mu_working_set, sigma_working_set],
        })
    }
}
/// Variant of `TwoBlockNaNHessianFamily` that reports the
/// `StrictPseudoLaplace` exact-Newton outer objective; the evaluation itself
/// is delegated unchanged.
#[derive(Clone)]
struct TwoBlockNaNHessianPseudoLaplaceFamily;
impl CustomFamily for TwoBlockNaNHessianPseudoLaplaceFamily {
    /// Forwards to `TwoBlockNaNHessianFamily::evaluate`.
    fn evaluate(
        &self,
        block_states: &[ParameterBlockState],
    ) -> Result<FamilyEvaluation, String> {
        let delegate = TwoBlockNaNHessianFamily;
        delegate.evaluate(block_states)
    }
    /// Selects the strict pseudo-Laplace outer objective.
    fn exact_newton_outerobjective(&self) -> ExactNewtonOuterObjective {
        ExactNewtonOuterObjective::StrictPseudoLaplace
    }
}
/// Builds the two unpenalized block specs used by the NaN/finite Hessian
/// tests: "mu" with one all-ones design column and "log_sigma" with two, each
/// with `n` rows, zero offsets, and zero initial coefficients.
fn make_two_block_specs(n: usize) -> Vec<ParameterBlockSpec> {
    // Both blocks differ only in name and column count.
    let ones_block = |name: &str, cols: usize| {
        ParameterBlockSpec {
            name: name.to_string(),
            design: DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(
                Array2::from_elem((n, cols), 1.0),
            )),
            offset: Array1::zeros(n),
            penalties: vec![],
            nullspace_dims: vec![],
            initial_log_lambdas: Array1::zeros(0),
            initial_beta: Some(Array1::zeros(cols)),
            gauge_priority: 100,
            jacobian_callback: None,
            stacked_design: None,
            stacked_offset: None,
        }
    };
    vec![ones_block("mu", 1), ones_block("log_sigma", 2)]
}
/// An inner blockwise fit over a family returning a NaN exact-Newton Hessian
/// must error with the explicit non-finite-Hessian message.
#[test]
fn exact_newton_nan_hessian_fails_loudly_before_eigendecomposition() {
    // Exact Newton Hessians are part of the mathematical contract: a NaN in a
    // block Hessian means the family derivative is invalid, so the fit is
    // expected to reject it with an explicit error instead of masking it.
    let options = BlockwiseFitOptions {
        inner_max_cycles: 1,
        use_remlobjective: false,
        compute_covariance: false,
        ..BlockwiseFitOptions::default()
    };
    let specs = make_two_block_specs(4);
    let per_block_log_lambdas = vec![Array1::zeros(0), Array1::zeros(0)];

    let err = inner_blockwise_fit(
        &TwoBlockNaNHessianFamily,
        &specs,
        &per_block_log_lambdas,
        &options,
        None,
    )
    .expect_err("NaN exact Hessian must fail loudly");

    let names_non_finite_hessian =
        err.contains("smooth-regularized logdet Hessian contains non-finite entry");
    assert!(
        names_non_finite_hessian,
        "expected explicit non-finite Hessian error, got: {err}"
    );
}
/// Control for the NaN-Hessian test: the same specs and options with a
/// finite-Hessian family must fit without error.
#[test]
fn exact_newton_finite_hessian_succeeds_where_nan_hessian_fails() {
    // Identical block layout, penalties, and solver configuration; only the
    // Hessian is finite here, which isolates NaN as the failure trigger.
    let options = BlockwiseFitOptions {
        inner_max_cycles: 1,
        use_remlobjective: false,
        compute_covariance: false,
        ..BlockwiseFitOptions::default()
    };
    let specs = make_two_block_specs(4);
    let per_block_log_lambdas = vec![Array1::zeros(0), Array1::zeros(0)];

    let result = inner_blockwise_fit(
        &TwoBlockFiniteHessianFamily,
        &specs,
        &per_block_log_lambdas,
        &options,
        None,
    );

    if let Err(failure) = result {
        panic!(
            "inner fit should succeed with finite Hessian: {:?}",
            Some(failure)
        );
    }
}
/// Passing a NaN third argument to `checked_penalizedobjective` must produce
/// an error mentioning the non-finite penalized objective.
#[test]
fn checked_penalizedobjective_rejects_non_finite_values() {
    let outcome = checked_penalizedobjective(-1.0, 0.5, f64::NAN, "test objective");
    let err = outcome.expect_err("non-finite objective should fail loudly");
    let mentions_non_finite = err.contains("non-finite penalized objective");
    assert!(mentions_non_finite, "unexpected error: {err}");
}
/// The closure built by `exact_newton_dh_closure` must return an error when
/// the family's joint-Hessian directional derivative contains NaN.
#[test]
fn exact_newton_dh_closure_rejects_non_finite_directional_derivative() {
    /// One-block family with a finite joint Hessian but a NaN directional
    /// derivative of that Hessian.
    #[derive(Clone)]
    struct OneBlockNonFiniteJointDhFamily;
    impl CustomFamily for OneBlockNonFiniteJointDhFamily {
        fn evaluate(
            &self,
            block_states: &[ParameterBlockState],
        ) -> Result<FamilyEvaluation, String> {
            let beta = match block_states.first() {
                Some(state) => state.beta.clone(),
                None => return Err("missing block 0".to_string()),
            };
            let log_likelihood = -0.5 * beta.dot(&beta);
            let working_set = BlockWorkingSet::ExactNewton {
                gradient: beta.mapv(|v| -v),
                hessian: SymmetricMatrix::Dense(array![[1.0]]),
            };
            Ok(FamilyEvaluation {
                log_likelihood,
                blockworking_sets: vec![working_set],
            })
        }
        fn exact_newton_joint_hessian(
            &self,
            block_states: &[ParameterBlockState],
        ) -> Result<Option<Array2<f64>>, String> {
            assert!(block_states.len() <= isize::MAX as usize);
            let joint = array![[1.0]];
            Ok(Some(joint))
        }
        fn exact_newton_joint_hessian_directional_derivative(
            &self,
            block_states: &[ParameterBlockState],
            arr: &Array1<f64>,
        ) -> Result<Option<Array2<f64>>, String> {
            assert!(block_states.len() <= isize::MAX as usize);
            assert!(arr.iter().all(|v| !v.is_nan()));
            // Deliberately non-finite derivative.
            let poisoned = array![[f64::NAN]];
            Ok(Some(poisoned))
        }
    }

    let family = OneBlockNonFiniteJointDhFamily;
    let design = DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(Array2::from_elem(
        (2, 1),
        1.0,
    )));
    let specs = vec![ParameterBlockSpec {
        name: "beta".to_string(),
        design,
        offset: Array1::zeros(2),
        penalties: vec![],
        nullspace_dims: vec![],
        initial_log_lambdas: Array1::zeros(0),
        initial_beta: Some(array![0.0]),
        gauge_priority: 100,
        jacobian_callback: None,
        stacked_design: None,
        stacked_offset: None,
    }];
    let states = vec![ParameterBlockState {
        beta: array![0.0],
        eta: Array1::zeros(2),
    }];

    let flat_beta = array![0.0];
    let synced = synchronized_states_from_flat_beta(&family, &specs, &states, &flat_beta)
        .expect("sync states for exact_newton_dh_closure");
    let synced_states = Arc::new(synced);

    let compute_dh =
        exact_newton_dh_closure(&family, synced_states, &specs, 1, false, 1.0, None);
    let direction = array![1.0];
    let err = compute_dh(&direction).expect_err("non-finite dH should fail loudly");
    let mentions_non_finite = err.contains("non-finite");
    assert!(mentions_non_finite, "unexpected error: {err}");
}
/// If a symmetric eigendecomposition of a NaN-contaminated matrix returns
/// eigenvalues at all, a NaN-propagating minimum over them must be non-finite.
#[test]
fn nan_propagating_min_detects_nan_eigenvalues() {
    use crate::faer_ndarray::FaerEigh;

    // Identity with a symmetric pair of NaN off-diagonal entries.
    let mut mat = Array2::<f64>::eye(3);
    mat[[1, 0]] = f64::NAN;
    mat[[0, 1]] = f64::NAN;

    // When `eigh` itself reports an error there is nothing further to check.
    if let Ok((evals, _)) = FaerEigh::eigh(&mat, faer::Side::Lower) {
        // Unlike a plain `f64::min` reduction, this fold yields NaN as soon as
        // either the running value or the next eigenvalue is NaN.
        let new_min = evals
            .iter()
            .copied()
            .fold(f64::INFINITY, |running, candidate| {
                if running.is_nan() || candidate.is_nan() {
                    f64::NAN
                } else {
                    running.min(candidate)
                }
            });
        assert!(
            !new_min.is_finite(),
            "NaN-propagating min should detect NaN eigenvalues, got {new_min}"
        );
    }
}
/// Evaluating the outer objective for a two-block family that offers no joint
/// outer path must return a descriptive error instead of panicking.
#[test]
fn multiblock_generic_outer_fallback_returns_error_instead_of_panicking() {
    let family = TwoBlockFiniteHessianFamily;
    let specs = make_two_block_specs(4);
    let options = BlockwiseFitOptions {
        use_remlobjective: true,
        outer_max_iter: 1,
        ..BlockwiseFitOptions::default()
    };
    let penalty_counts = vec![0usize, 0usize];
    let rho = Array1::zeros(0);

    // Run under `catch_unwind` so a panic is reported as a test failure with
    // a clear message.
    let guarded = std::panic::AssertUnwindSafe(|| {
        super::test_support::outerobjectivegradienthessian(
            &family,
            &specs,
            &options,
            &penalty_counts,
            &rho,
            None,
            EvalMode::ValueGradientHessian,
        )
    });
    let outcome = std::panic::catch_unwind(guarded)
        .expect("multi-block outer fallback must return an error, not panic");

    let err = if let Err(failure) = outcome {
        failure.to_string()
    } else {
        panic!("multi-block family without a joint path should fail loudly")
    };
    let names_missing_joint_path =
        err.contains("multi-block families must provide a joint outer path");
    assert!(names_missing_joint_path, "unexpected error: {err}");
}
/// For the strict pseudo-Laplace variant of the NaN-Hessian family, the inner
/// fit may succeed or fail, but a failure must not be the exact-Newton
/// eigendecomposition error.
#[test]
fn pseudo_laplace_path_skips_eigendecomposition_avoiding_nan_crash() {
    // The family reports `StrictPseudoLaplace` as its outer objective. With a
    // NaN Hessian the fit may still fail downstream; the only requirement
    // checked here is that any error is not the eigendecomposition failure.
    let options = BlockwiseFitOptions {
        inner_max_cycles: 1,
        use_remlobjective: false,
        compute_covariance: false,
        ..BlockwiseFitOptions::default()
    };
    let specs = make_two_block_specs(4);
    let per_block_log_lambdas = vec![Array1::zeros(0), Array1::zeros(0)];

    let result = inner_blockwise_fit(
        &TwoBlockNaNHessianPseudoLaplaceFamily,
        &specs,
        &per_block_log_lambdas,
        &options,
        None,
    );

    // A successful result is accepted as-is; only the error text is inspected.
    if let Err(msg) = &result {
        let hit_eigendecomposition = msg.contains("exact-newton eigendecomposition failed");
        assert!(
            !hit_eigendecomposition,
            "PseudoLaplace path should NOT hit eigendecomposition; \
             got eigendecomposition error anyway: {msg}"
        );
    }
}
/// Regression check: when `strict_solve_spd_with_lm_continuation` is given a
/// strongly negative-definite matrix whose `|λ_min|` exceeds the LM δ-ridge
/// schedule's terminal δ (≈ ε · trace_scale · 10¹⁶), the bare schedule can't
/// rescue Cholesky and the terminal eigen-floor fallback must return a
/// finite solution equal to `Q diag(1/Λ̃) Qᵀ rhs`, with
/// `Λ̃_i = max(Λ_i, ε λ_max)`.
///
/// NOTE(review): an earlier version of this comment also promised a
/// schedule-success scenario with a milder matrix. The body below only
/// covers the terminal eigen-floor branch; add that case or keep this note.
#[test]
fn strict_solve_spd_falls_back_to_eigen_floor_on_indefinite_matrix() {
// δ schedule from `delta0 = max(ε·tr/p, 1e-12)`, growth 10×, 16 steps.
// With `tr = 4·1e30` we get `delta0 ≈ ε·1e30 ≈ 2.2e14`; terminal δ at
// escalation 16 is `2.2e14 · 1e16 = 2.2e30`. Set `λ_min ≈ -1e32` to
// outpace the schedule and force the eigen-floor branch.
let p = 4usize;
// Diagonal entries are -1e32 - i·1e30, plus one symmetric off-diagonal pair.
let mut h = Array2::<f64>::zeros((p, p));
for i in 0..p {
h[[i, i]] = -1e32 - (i as f64) * 1e30;
}
h[[0, 1]] = 5e29;
h[[1, 0]] = 5e29;
let rhs = Array1::from_vec(vec![1e30, -5e29, 2.5e29, 7.5e29]);
let (x, stats) = strict_solve_spd_with_lm_continuation(&h, &rhs)
.expect("eigen-floor fallback must succeed on the negative-definite matrix");
// More than 16 escalations is taken as the marker of the terminal fallback.
assert!(
stats.escalations > 16,
"expected eigen-floor terminal fallback (escalations > MAX_ESCALATIONS), got {}",
stats.escalations,
);
// Every component of the returned solution must be finite.
for &v in x.iter() {
assert!(
v.is_finite(),
"eigen-floor solve returned non-finite component {v}"
);
}
// Reconstruct the analytic floored solve and compare component-wise.
let mut sym = h.clone();
symmetrize_dense_in_place(&mut sym);
let (evals, evecs) = FaerEigh::eigh(&sym, Side::Lower).expect("eigh");
// Floor = CUSTOM_FAMILY_EVAL_FLOOR times the largest |eigenvalue|, and never below 1e-300.
let max_abs_eval = evals.iter().fold(0.0_f64, |a, &b| a.max(b.abs()));
let eps_floor = (CUSTOM_FAMILY_EVAL_FLOOR * max_abs_eval).max(1e-300);
// Accumulate want = Σ_k q_k · (q_kᵀ rhs) / max(λ_k, floor), one eigenpair at a time.
let mut want = Array1::<f64>::zeros(p);
for k in 0..p {
// Projection of rhs onto the k-th eigenvector.
let mut q_t_rhs = 0.0;
for i in 0..p {
q_t_rhs += evecs[[i, k]] * rhs[i];
}
let scaled = q_t_rhs / evals[k].max(eps_floor);
for i in 0..p {
want[i] += evecs[[i, k]] * scaled;
}
}
// Mixed tolerance: 1e-9 relative to max(|want_i|, 1) plus 1e-9 absolute.
for i in 0..p {
let tol = 1e-9 * want[i].abs().max(1.0) + 1e-9;
assert!(
(want[i] - x[i]).abs() <= tol,
"eigen-floor solve component {i}: want={:.6e}, got={:.6e}",
want[i],
x[i],
);
}
}
// ---------- eta_backup heterogeneous-shape regression tests ----------
//
// Regression note: a previous `inner_blockwise_fit` implementation
// reused a single `eta_backup` buffer across blocks during line search.
// With heterogeneous eta lengths (e.g. survival time block = 3n,
// threshold/log-sigma = n), that buffer could be left at the wrong
// shape for the next block update and trigger an ndarray broadcast
// panic:
// "could not broadcast array from shape: [n] to: [3n]"
/// Two-block exact-Newton test family whose blocks have linear predictors of
/// different lengths: block 0 must have `3 * n` rows and block 1 must have
/// `n` rows. Both gradients are `0.1 - beta`, so they are non-zero at the
/// all-ones start used by the accompanying specs.
#[derive(Clone)]
struct HeterogeneousEtaLengthFamily {
    n: usize,
}
impl CustomFamily for HeterogeneousEtaLengthFamily {
    /// Checks both predictor lengths, then returns a quadratic log-likelihood
    /// in the two predictors with identity-Hessian exact-Newton working sets.
    fn evaluate(
        &self,
        block_states: &[ParameterBlockState],
    ) -> Result<FamilyEvaluation, String> {
        let n = self.n;
        let big_eta = &block_states[0].eta;
        let small_eta = &block_states[1].eta;
        assert_eq!(big_eta.len(), 3 * n, "block 0 eta must be 3n");
        assert_eq!(small_eta.len(), n, "block 1 eta must be n");

        let big_dim = block_states[0].beta.len();
        let small_dim = block_states[1].beta.len();

        // Quadratic log-likelihood in the linear predictors.
        let log_likelihood = -0.5 * big_eta.dot(big_eta) - 0.5 * small_eta.dot(small_eta);

        // Gradient -beta + 0.1 in each block.
        let big_working_set = BlockWorkingSet::ExactNewton {
            gradient: block_states[0].beta.mapv(|b| -b + 0.1),
            hessian: SymmetricMatrix::Dense(Array2::eye(big_dim)),
        };
        let small_working_set = BlockWorkingSet::ExactNewton {
            gradient: block_states[1].beta.mapv(|b| -b + 0.1),
            hessian: SymmetricMatrix::Dense(Array2::eye(small_dim)),
        };

        Ok(FamilyEvaluation {
            log_likelihood,
            blockworking_sets: vec![big_working_set, small_working_set],
        })
    }
}
/// Builds two unpenalized block specs with different row counts:
/// "big_block" has `3 * n` rows and "small_block" has `n` rows. Each has two
/// all-ones design columns, a zero offset, and an all-ones initial beta.
fn make_heterogeneous_eta_specs(n: usize) -> Vec<ParameterBlockSpec> {
    let cols = 2;
    // The two blocks differ only in name and row count.
    let ones_block = |name: &str, rows: usize| {
        ParameterBlockSpec {
            name: name.to_string(),
            design: DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(
                Array2::from_elem((rows, cols), 1.0),
            )),
            offset: Array1::zeros(rows),
            penalties: vec![],
            nullspace_dims: vec![],
            initial_log_lambdas: Array1::zeros(0),
            initial_beta: Some(Array1::from_elem(cols, 1.0)),
            gauge_priority: 100,
            jacobian_callback: None,
            stacked_design: None,
            stacked_offset: None,
        }
    };
    vec![
        // 3n rows, mimicking a stacked survival time block.
        ones_block("big_block", 3 * n),
        // n rows, mimicking a threshold/log-sigma block.
        ones_block("small_block", n),
    ]
}
/// Control case for the eta-buffer regression: both blocks have the same
/// number of design rows, so no shape mismatch between per-block eta
/// vectors is possible and the inner fit must simply succeed.
#[test]
fn uniform_eta_lengths_do_not_panic() {
    /// Same synthetic working sets as the heterogeneous family (identity
    /// Hessian, gradient `0.1 - beta`) but without any eta-length checks.
    #[derive(Clone)]
    struct UniformEtaFamily;
    impl CustomFamily for UniformEtaFamily {
        fn evaluate(
            &self,
            block_states: &[ParameterBlockState],
        ) -> Result<FamilyEvaluation, String> {
            let (first, second) = (&block_states[0], &block_states[1]);
            let log_likelihood =
                -0.5 * first.eta.dot(&first.eta) - 0.5 * second.eta.dot(&second.eta);
            let blockworking_sets = [first, second]
                .iter()
                .map(|state| BlockWorkingSet::ExactNewton {
                    gradient: state.beta.mapv(|b| -b + 0.1),
                    hessian: SymmetricMatrix::Dense(Array2::eye(state.beta.len())),
                })
                .collect();
            Ok(FamilyEvaluation {
                log_likelihood,
                blockworking_sets,
            })
        }
    }

    let rows: usize = 10;
    // Every block gets the same `rows x 2` all-ones design.
    let same_shape_block = |name: &str| ParameterBlockSpec {
        name: name.to_string(),
        design: DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(
            Array2::from_elem((rows, 2), 1.0),
        )),
        offset: Array1::zeros(rows),
        penalties: vec![],
        nullspace_dims: vec![],
        initial_log_lambdas: Array1::zeros(0),
        initial_beta: Some(Array1::from_elem(2, 1.0)),
        gauge_priority: 100,
        jacobian_callback: None,
        stacked_design: None,
        stacked_offset: None,
    };
    let specs = vec![same_shape_block("block_a"), same_shape_block("block_b")];
    let per_block = vec![Array1::zeros(0), Array1::zeros(0)];
    let options = BlockwiseFitOptions {
        inner_max_cycles: 3,
        use_remlobjective: false,
        compute_covariance: false,
        ..BlockwiseFitOptions::default()
    };

    // With equal eta lengths a shared backup buffer would have been
    // shape-compatible with every block, so this must succeed.
    let result = inner_blockwise_fit(&UniformEtaFamily, &specs, &per_block, &options, None);
    assert!(
        result.is_ok(),
        "uniform eta lengths should not panic: {result:?}"
    );
}
/// Regression guard for the eta_backup swap bug: with eta lengths of
/// `3n` and `n`, three inner cycles must finish with `Ok`. Older code
/// could panic with
/// "could not broadcast array from shape: [n] to: [3n]".
#[test]
fn heterogeneous_eta_lengths_inner_fit_completes() {
    let rows = 10;
    let options = BlockwiseFitOptions {
        inner_max_cycles: 3,
        use_remlobjective: false,
        compute_covariance: false,
        ..BlockwiseFitOptions::default()
    };
    let specs = make_heterogeneous_eta_specs(rows);
    let family = HeterogeneousEtaLengthFamily { n: rows };
    // Neither block has penalties, hence the empty per-block vectors.
    let per_block = vec![Array1::zeros(0), Array1::zeros(0)];

    let result = inner_blockwise_fit(&family, &specs, &per_block, &options, None);
    assert!(result.is_ok(), "inner fit should complete: {result:?}");
}
/// Single-cycle variant of the heterogeneous-eta regression guard: one
/// inner cycle is enough to visit both blocks, so it must already
/// complete with `Ok` when the eta lengths are `3n` and `n`.
#[test]
fn heterogeneous_eta_single_cycle_completes() {
    let rows = 10;
    let options = BlockwiseFitOptions {
        inner_max_cycles: 1,
        use_remlobjective: false,
        compute_covariance: false,
        ..BlockwiseFitOptions::default()
    };
    let specs = make_heterogeneous_eta_specs(rows);
    let family = HeterogeneousEtaLengthFamily { n: rows };
    let per_block = vec![Array1::zeros(0), Array1::zeros(0)];

    let result = inner_blockwise_fit(&family, &specs, &per_block, &options, None);
    assert!(
        result.is_ok(),
        "single-cycle inner fit should complete: {result:?}"
    );
}
/// Regression guard for the already-converged case: every block reports
/// a zero gradient and starts at `beta = 0`, so no block should need a
/// line search. The fit must stay safe even though the eta lengths
/// differ (`3n` vs `n`).
#[test]
fn heterogeneous_eta_no_panic_when_all_blocks_converged() {
    /// Family whose working sets always carry a zero gradient.
    #[derive(Clone)]
    struct AllConvergedFamily {
        n: usize,
    }
    impl CustomFamily for AllConvergedFamily {
        fn evaluate(
            &self,
            block_states: &[ParameterBlockState],
        ) -> Result<FamilyEvaluation, String> {
            let (big, small) = (&block_states[0], &block_states[1]);
            assert_eq!(big.eta.len(), 3 * self.n);
            assert_eq!(small.eta.len(), self.n);
            let log_likelihood =
                -0.5 * big.eta.dot(&big.eta) - 0.5 * small.eta.dot(&small.eta);
            let blockworking_sets = [big, small]
                .iter()
                .map(|state| {
                    let width = state.beta.len();
                    BlockWorkingSet::ExactNewton {
                        gradient: Array1::zeros(width),
                        hessian: SymmetricMatrix::Dense(Array2::eye(width)),
                    }
                })
                .collect();
            Ok(FamilyEvaluation {
                log_likelihood,
                blockworking_sets,
            })
        }
    }

    let rows = 10;
    let mut specs = make_heterogeneous_eta_specs(rows);
    // Start both blocks at zero coefficients.
    for spec in &mut specs {
        spec.initial_beta = Some(Array1::zeros(2));
    }
    let family = AllConvergedFamily { n: rows };
    let per_block = vec![Array1::zeros(0), Array1::zeros(0)];
    let options = BlockwiseFitOptions {
        inner_max_cycles: 1,
        use_remlobjective: false,
        compute_covariance: false,
        ..BlockwiseFitOptions::default()
    };

    // Zero gradient -> zero step for every block, so the backup buffer
    // is expected never to take part and no broadcast can go wrong.
    let result = inner_blockwise_fit(&family, &specs, &per_block, &options, None);
    assert!(
        result.is_ok(),
        "should not panic when all blocks are converged: {result:?}"
    );
}
/// Regression guard for the asymmetric case: the large block (`3n`
/// rows) reports a zero gradient while the small block (`n` rows)
/// reports a nonzero one. Earlier code could still panic here after
/// reusing an oversized eta_backup buffer across blocks; the fit must
/// now complete.
#[test]
fn heterogeneous_eta_completes_when_only_small_block_steps() {
    /// Family where only block 1 carries a nonzero gradient.
    #[derive(Clone)]
    struct OnlySmallBlockStepsFamily {
        n: usize,
    }
    impl CustomFamily for OnlySmallBlockStepsFamily {
        fn evaluate(
            &self,
            block_states: &[ParameterBlockState],
        ) -> Result<FamilyEvaluation, String> {
            let (big, small) = (&block_states[0], &block_states[1]);
            assert_eq!(big.eta.len(), 3 * self.n);
            assert_eq!(small.eta.len(), self.n);
            let log_likelihood =
                -0.5 * big.eta.dot(&big.eta) - 0.5 * small.eta.dot(&small.eta);
            // Block 0: zero gradient, nothing to do.
            let idle_big_block = BlockWorkingSet::ExactNewton {
                gradient: Array1::zeros(big.beta.len()),
                hessian: SymmetricMatrix::Dense(Array2::eye(big.beta.len())),
            };
            // Block 1: synthetic gradient `0.1 - beta`, nonzero at the start.
            let stepping_small_block = BlockWorkingSet::ExactNewton {
                gradient: small.beta.mapv(|b| -b + 0.1),
                hessian: SymmetricMatrix::Dense(Array2::eye(small.beta.len())),
            };
            Ok(FamilyEvaluation {
                log_likelihood,
                blockworking_sets: vec![idle_big_block, stepping_small_block],
            })
        }
    }

    let rows = 10;
    let mut specs = make_heterogeneous_eta_specs(rows);
    // Put block 0 at zero coefficients; block 1 keeps its all-ones start.
    specs[0].initial_beta = Some(Array1::zeros(2));
    let family = OnlySmallBlockStepsFamily { n: rows };
    let per_block = vec![Array1::zeros(0), Array1::zeros(0)];
    let options = BlockwiseFitOptions {
        inner_max_cycles: 1,
        use_remlobjective: false,
        compute_covariance: false,
        ..BlockwiseFitOptions::default()
    };

    let result = inner_blockwise_fit(&family, &specs, &per_block, &options, None);
    assert!(
        result.is_ok(),
        "fit should complete when only small block steps: {result:?}"
    );
}
/// Exercises the KKT-aware projection performed by
/// `projected_stationarity_inf_norm`.
///
/// Expected behaviour, case by case:
///   (i)   no constraints: the result is the plain inf-norm of the
///         residual;
///   (ii)  coordinate on its lower bound (`β_j == lb_j`) with a
///         multiplier-signed residual (`residual_j > 0`): dropped;
///   (iii) coordinate on its lower bound with a wrong-signed residual
///         (`residual_j < 0`): still counted;
///   (iv)  interior coordinate: always counted, whatever the sign.
///
/// These are the convergence semantics the joint-Newton loop relies on:
/// a genuine constrained-KKT optimum scores zero, while infeasible
/// multipliers and interior non-stationarity stay visible.
#[test]
fn projected_stationarity_inf_norm_respects_kkt_multipliers() {
    assert!(file!().ends_with(".rs"));

    // (i) Unconstrained: the largest |residual| entry wins.
    let free_beta = array![1.0, 2.0, -0.5];
    let free_residual = array![0.3, -0.1, 0.2];
    let unconstrained_norm =
        projected_stationarity_inf_norm(&free_residual, &free_beta, None, None);
    assert_relative_eq!(unconstrained_norm, 0.3_f64, epsilon = 1e-12);

    // (ii) β_0 sits on its lower bound of 0 with residual_0 > 0, which is
    // a valid KKT multiplier and is projected away. β_1 is not pinned by
    // any row of the single-row constraint `β_0 ≥ 0`, so its residual
    // (|-0.1| = 0.1) is what remains in the norm.
    let bound_beta = array![0.0, 2.0];
    let bound_residual = array![0.5, -0.1];
    let beta0_nonneg = LinearInequalityConstraints {
        a: array![[1.0, 0.0]],
        b: array![0.0],
    };
    let single_row_norm =
        projected_stationarity_inf_norm(&bound_residual, &bound_beta, Some(&beta0_nonneg), None);
    assert_relative_eq!(single_row_norm, 0.1_f64, epsilon = 1e-12);
    let single_row_vector = projected_linear_constraint_stationarity_vector(
        &bound_residual,
        &bound_beta,
        &beta0_nonneg,
        None,
    )
    .expect("active lower-bound projection should succeed");
    assert_relative_eq!(single_row_vector[0], 0.0_f64, epsilon = 1e-10);
    assert_relative_eq!(single_row_vector[1], -0.1_f64, epsilon = 1e-12);

    // Same state, but with an explicit second row whose right-hand side
    // is -inf, i.e. β_1 is nominally listed yet effectively unbounded.
    // The non-finite bound must be treated as "no lower bound" for that
    // coordinate while the active-bound drop still applies to β_0, so the
    // answer matches the single-row case. The point being documented is
    // that the per-coordinate finiteness gate handles this, rather than
    // the whole constraint set being rejected by
    // `extract_simple_lower_bounds`.
    let beta0_nonneg_beta1_free = LinearInequalityConstraints {
        a: array![[1.0, 0.0], [0.0, 1.0]],
        b: array![0.0, f64::NEG_INFINITY], // only β_0 has a finite lower bound
    };
    let two_row_norm = projected_stationarity_inf_norm(
        &bound_residual,
        &bound_beta,
        Some(&beta0_nonneg_beta1_free),
        None,
    );
    assert_relative_eq!(two_row_norm, 0.1_f64, epsilon = 1e-12);

    // (iii) On the bound but with residual < 0: dual feasibility λ ≥ 0 is
    // violated (the bound wants to release), so the coordinate must stay
    // in the norm and block a false convergence claim.
    let scalar_nonneg = LinearInequalityConstraints {
        a: array![[1.0]],
        b: array![0.0],
    };
    let releasing_beta = array![0.0];
    let releasing_residual = array![-0.2];
    let releasing_norm = projected_stationarity_inf_norm(
        &releasing_residual,
        &releasing_beta,
        Some(&scalar_nonneg),
        None,
    );
    assert_relative_eq!(releasing_norm, 0.2_f64, epsilon = 1e-12);

    // (iv) Strictly inside the feasible region: the residual counts in
    // full even though a lower bound exists.
    let inside_beta = array![1.5];
    let inside_residual = array![0.4];
    let inside_norm = projected_stationarity_inf_norm(
        &inside_residual,
        &inside_beta,
        Some(&scalar_nonneg),
        None,
    );
    assert_relative_eq!(inside_norm, 0.4_f64, epsilon = 1e-12);
}
/// Pins the three local signals behind the constrained-stationary
/// certificate, using numbers from the large-scale
/// survival-marginal-slope failure
/// (`old_kkt ≈ 8.6e5`, `linearized_next ≈ 8.6e5`,
/// `actual ≈ pred ≈ 1.6e-2`):
///
/// 1. `linearized_rel = ‖g + Hδ‖∞ / (1 + ‖g‖∞) ≥ 0.5`: the linear solve
///    left most of `g` in place. The unreduced part lives in the
///    constraint-active subspace and is a Lagrange multiplier, not a
///    defect of the solve.
/// 2. `scalar_model_relative_error() ≤ 1e-3`: the quadratic Newton model
///    matches the observed objective change, so Hessian and gradient are
///    trustworthy at this β and the residual is not model mismatch.
/// 3. `|Δobjective| ≤ objective_tol`: the objective has stopped moving.
///
/// The test only checks these inequalities on the recorded numbers; it
/// does not invoke the certificate decision itself.
#[test]
fn joint_newton_math_constrained_stationary_signature_matches_aou_failure() {
    let recorded = JointNewtonMathDiagnostic {
        old_kkt_inf: 8.613e5,
        linearized_next_kkt_inf: 8.580e5,
        predicted_reduction: 1.589e-2,
        actual_reduction: 1.589e-2,
        trust_ratio: 1.000,
        step_inf: 1.270e-2,
        proposal_inf: 1.270e-2,
    };

    // Signal 1: less than 1% of g was neutralised by the linearized solve.
    let linearized_rel = recorded.linearized_next_kkt_inf / (1.0 + recorded.old_kkt_inf);
    assert!(
        linearized_rel >= 0.5,
        "large-scale exit has linearized_rel = {:.3e}, must be >= 0.5 for the \
         constrained-stationary certificate to fire",
        linearized_rel,
    );

    // Signal 2: predicted and actual reduction agree to roundoff.
    let model_rel_err = recorded.scalar_model_relative_error();
    assert!(
        model_rel_err <= 1e-3,
        "large-scale exit has scalar_model_relerr = {:.3e}, must be <= 1e-3 \
         (model agrees with actual ⇒ residual is a real multiplier)",
        model_rel_err,
    );

    // Signal 3: with |obj| ~ 3.5e5 and inner_tol ~ 1e-6 the objective
    // tolerance is ≈ 0.348, well above the observed Δobj ≈ 1.6e-2.
    let objective_tol = 1e-6 * (1.0 + 3.484783e5_f64);
    let objective_change = 1.589e-2_f64;
    assert!(
        objective_change <= objective_tol,
        "large-scale exit has |Δobj| = {:.3e}, must be <= obj_tol {:.3e}",
        objective_change,
        objective_tol,
    );
}
/// Replays the post-diagnostic large-scale trace in which the scalar
/// Newton model and the objective plateau, taken alone, resemble a
/// constrained-stationary point. The projected KKT residual is still
/// hundreds of times above tolerance and the accepted step is still
/// macroscopic, so this is an ordinary in-progress Newton cycle: the
/// decision must be `NotCandidate`, neither acceptance nor refusal.
#[test]
fn constrained_stationary_certificate_keeps_iterating_when_step_is_large() {
    let trace = JointNewtonMathDiagnostic {
        old_kkt_inf: 2.708e4,
        linearized_next_kkt_inf: 2.707e4,
        predicted_reduction: 3.421e-1,
        actual_reduction: 3.421e-1,
        trust_ratio: 1.0,
        step_inf: 2.891e-2,
        proposal_inf: 2.891e-2,
    };
    let step_tol = 1.2e-5;
    let residual = 8.102;
    let residual_tol = 2.707e-2;
    let objective_change = 3.421e-1;
    let objective_tol = 3.479e-1;

    // The three non-step conditions all hold. These are what made
    // 0.1.126 reject a seed as soon as the objective change touched
    // tolerance.
    let linearized_rel = trace.linearized_next_kkt_inf / (1.0 + trace.old_kkt_inf);
    assert!(linearized_rel >= 0.5);
    assert!(trace.scalar_model_relative_error() <= 1e-3);
    assert!(objective_change <= objective_tol);
    // ...but the step itself is far from exhausted,
    assert!(trace.step_inf > step_tol);
    // and the projected residual is too large to call this converged.
    assert!(residual > residual_tol);

    // Large residual rules out acceptance; large step rules out a
    // terminal refusal. The loop has to keep going.
    let decision = constrained_stationary_certificate_decision(
        &trace,
        objective_change,
        objective_tol,
        step_tol,
        None,
        residual,
        residual_tol,
    );
    assert_eq!(decision, ConstrainedStationaryCertificate::NotCandidate);
}
/// Checks `residual_in_steady_geometric_descent` on four residual
/// histories: a steadily shrinking one (must qualify), a flat/oscillating
/// plateau, a window with one isolated drop, and a history that is too
/// short (none of which may qualify).
#[test]
fn residual_steady_geometric_descent_distinguishes_converging_from_plateau() {
    use std::collections::VecDeque;

    // gam#787 duchon centers≥20: the logslope block was converging
    // geometrically (~0.33×/cycle), yet `linearized_rel ≥ 0.5` plus a flat
    // objective sent it into the plateau-refusal break a few cycles short
    // of tolerance. The steady-descent guard has to keep it iterating.
    let converging: VecDeque<f64> = VecDeque::from([6.985e-4, 2.388e-4, 7.987e-5, 2.597e-5]);
    assert!(
        residual_in_steady_geometric_descent(&converging),
        "a steadily ~0.33x/cycle descending residual must be recognized as converging"
    );

    // Real multiplier/null plateau: the residual hovers above tolerance.
    let plateau: VecDeque<f64> = VecDeque::from([2.066e0, 2.063e0, 2.066e0, 2.063e0]);
    assert!(
        !residual_in_steady_geometric_descent(&plateau),
        "a flat/oscillating residual plateau must NOT be treated as converging"
    );

    // One lucky drop after a flat stretch is not a trend.
    let noisy: VecDeque<f64> = VecDeque::from([2.0e0, 2.0e0, 1.0e-3]);
    assert!(
        !residual_in_steady_geometric_descent(&noisy),
        "a single-cycle drop must not be mistaken for steady descent"
    );

    // Two samples are not enough history to call the descent steady.
    let short: VecDeque<f64> = VecDeque::from([1.0e-3, 3.0e-4]);
    assert!(
        !residual_in_steady_geometric_descent(&short),
        "fewer than the window of cycles must not assert steady descent"
    );
}
/// With the step exhausted (`step_inf` below `step_tol`) the certificate
/// decision depends on the projected residual alone: at, or a hair
/// above, `residual_tol` it accepts; just past `4 * residual_tol` it
/// refuses with `RefusePhantomMultiplier`.
#[test]
fn constrained_stationary_certificate_refuses_only_when_step_is_exhausted() {
    let exhausted = JointNewtonMathDiagnostic {
        old_kkt_inf: 2.708e4,
        linearized_next_kkt_inf: 2.707e4,
        predicted_reduction: 3.421e-1,
        actual_reduction: 3.421e-1,
        trust_ratio: 1.0,
        step_inf: 2.891e-7,
        proposal_inf: 2.891e-7,
    };
    let objective_change = 3.421e-1;
    let objective_tol = 3.479e-1;
    let step_tol = 1.0e-6;
    let residual_tol = 2.707e-2;

    // Everything except the projected residual is held fixed.
    let decide = |residual| {
        constrained_stationary_certificate_decision(
            &exhausted,
            objective_change,
            objective_tol,
            step_tol,
            None,
            residual,
            residual_tol,
        )
    };

    // Within the certification band (`residual <= 4x residual_tol`, the
    // documented gam#797 conditioning/round-off allowance) a fully
    // stationary iterate is accepted.
    assert_eq!(
        decide(residual_tol),
        ConstrainedStationaryCertificate::Accept,
    );
    // Still within 4x: a residual marginally above 1x must remain
    // accepted, because the active-projected residual genuinely floors
    // just above the scale-relative tolerance.
    assert_eq!(
        decide(residual_tol + 1.0e-12),
        ConstrainedStationaryCertificate::Accept,
    );
    // Past the 4x band the residual is too large to be a conditioning
    // floor: refuse the phantom multiplier instead of faking convergence.
    assert_eq!(
        decide(4.0 * residual_tol + 1.0e-6),
        ConstrainedStationaryCertificate::RefusePhantomMultiplier,
    );
}
/// Negative case for the constrained-stationary signature: numbers from
/// an unconstrained Newton cycle in which the linear solve neutralises
/// essentially all of `g`. The test checks that `linearized_rel` then
/// falls below the 0.5 threshold, i.e. the first precondition of the
/// certificate is not met. It does not call the certificate decision
/// itself.
#[test]
fn joint_newton_math_unconstrained_progress_does_not_match_certificate() {
    let progressing = JointNewtonMathDiagnostic {
        // Linearized residual is ~1e-12 of the starting gradient norm.
        old_kkt_inf: 1.0e3,
        linearized_next_kkt_inf: 1.0e-9,
        predicted_reduction: 5.0e-1,
        actual_reduction: 5.0e-1,
        trust_ratio: 1.0,
        step_inf: 1.0e-1,
        proposal_inf: 1.0e-1,
    };
    let linearized_rel =
        progressing.linearized_next_kkt_inf / (1.0 + progressing.old_kkt_inf);
    assert!(
        linearized_rel < 0.5,
        "unconstrained Newton must have linearized_rel < 0.5 (was {:.3e})",
        linearized_rel,
    );
}
/// Projection against a coupled constraint `β_0 + β_1 ≥ 1` (one row
/// touching both coordinates):
/// - active constraint, residual `[3, 3]` along the constraint normal
///   with the multiplier sign: projected to zero;
/// - active constraint, residual `[-3, -3]` (wrong sign): kept in full;
/// - inactive constraint (`β_0 + β_1 = 1.5`): residual kept in full.
#[test]
fn projected_stationarity_inf_norm_projects_coupled_linear_kkt_multipliers() {
    assert!(file!().ends_with(".rs"));
    let sum_at_least_one = LinearInequalityConstraints {
        a: array![[1.0, 1.0]],
        b: array![1.0],
    };
    // 0.25 + 0.75 = 1: the constraint is active here.
    let on_boundary = array![0.25, 0.75];
    let multiplier_residual = array![3.0, 3.0];

    let norm_valid = projected_stationarity_inf_norm(
        &multiplier_residual,
        &on_boundary,
        Some(&sum_at_least_one),
        None,
    );
    assert_relative_eq!(norm_valid, 0.0_f64, epsilon = 1e-10);

    let vector_valid = projected_linear_constraint_stationarity_vector(
        &multiplier_residual,
        &on_boundary,
        &sum_at_least_one,
        None,
    )
    .expect("coupled active projection should succeed");
    assert_relative_eq!(vector_valid[0], 0.0_f64, epsilon = 1e-10);
    assert_relative_eq!(vector_valid[1], 0.0_f64, epsilon = 1e-10);

    // Same direction, opposite sign: not a valid multiplier.
    let releasing_residual = array![-3.0, -3.0];
    let norm_wrong = projected_stationarity_inf_norm(
        &releasing_residual,
        &on_boundary,
        Some(&sum_at_least_one),
        None,
    );
    assert_relative_eq!(norm_wrong, 3.0_f64, epsilon = 1e-12);

    // 0.75 + 0.75 = 1.5 > 1: constraint inactive, nothing is projected.
    let strictly_feasible = array![0.75, 0.75];
    let norm_interior = projected_stationarity_inf_norm(
        &multiplier_residual,
        &strictly_feasible,
        Some(&sum_at_least_one),
        None,
    );
    assert_relative_eq!(norm_interior, 3.0_f64, epsilon = 1e-12);
}
/// End-to-end check that the gradient-based joint stationarity helpers
/// project out a valid multiplier of a coupled constraint
/// (`β_0 + β_1 ≥ 1`, active at `β = [0.25, 0.75]`) and keep a
/// wrong-signed residual visible.
#[test]
fn joint_stationarity_from_gradient_projects_coupled_linear_constraints() {
    assert!(file!().ends_with(".rs"));
    let spec = ParameterBlockSpec {
        name: "coupled".to_string(),
        design: DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(array![
            [1.0, 0.0],
            [0.0, 1.0]
        ])),
        offset: array![0.0, 0.0],
        penalties: Vec::new(),
        nullspace_dims: Vec::new(),
        initial_log_lambdas: Array1::zeros(0),
        initial_beta: None,
        gauge_priority: 100,
        jacobian_callback: None,
        stacked_design: None,
        stacked_offset: None,
    };
    // One-element arrays, bound once and borrowed by every call below.
    let states = [ParameterBlockState {
        beta: array![0.25, 0.75],
        eta: array![0.25, 0.75],
    }];
    let block_constraints = [Some(LinearInequalityConstraints {
        a: array![[1.0, 1.0]],
        b: array![1.0],
    })];
    let s_lambdas = vec![Array2::<f64>::zeros((2, 2))];

    // residual = S beta - gradient = [4, 4] = A_active^T lambda with
    // lambda = 4: a valid constrained KKT point, which must not show up
    // as a large free-gradient residual.
    let residual_multiplier = array![4.0, 4.0];
    let gradient = -&residual_multiplier;

    let projected = exact_newton_joint_stationarity_inf_norm_from_gradient(
        &gradient,
        &states,
        std::slice::from_ref(&spec),
        &s_lambdas,
        0.0,
        RidgePolicy::explicit_stabilization_full(),
        &block_constraints,
        None,
    )
    .expect("stationarity projection should succeed");
    assert_relative_eq!(projected, 0.0_f64, epsilon = 1e-10);

    let kkt_residual = exact_newton_joint_projected_kkt_residual_for_ift_from_gradient(
        &gradient,
        std::slice::from_ref(&spec),
        &states,
        &s_lambdas,
        0.0,
        RidgePolicy::explicit_stabilization_full(),
        &block_constraints,
        None,
    )
    .expect("KKT residual assembly should succeed")
    .expect("exact-gradient path should produce residual");
    assert_relative_eq!(kkt_residual.as_array()[0], 0.0_f64, epsilon = 1e-10);
    assert_relative_eq!(kkt_residual.as_array()[1], 0.0_f64, epsilon = 1e-10);

    // Flipping the gradient sign makes the normal residual wrong-signed:
    // the active constraint wants to release. That is not convergence and
    // has to stay visible at full magnitude.
    let unprojected = exact_newton_joint_stationarity_inf_norm_from_gradient(
        &residual_multiplier,
        &states,
        std::slice::from_ref(&spec),
        &s_lambdas,
        0.0,
        RidgePolicy::explicit_stabilization_full(),
        &block_constraints,
        None,
    )
    .expect("stationarity projection should succeed");
    assert_relative_eq!(unprojected, 4.0_f64, epsilon = 1e-12);
}
/// When a cached joint gradient is supplied, the KKT residual must be
/// assembled from it directly. The family passed in is
/// `OneBlockAlwaysErrorFamily`, so the `expect` on the outer result
/// would fail if the cached path fell back to `family.evaluate()`
/// (assuming, as the name suggests, that this family always errors).
#[test]
fn kkt_residual_uses_cached_joint_gradient_without_re_evaluating_family() {
    assert!(file!().ends_with(".rs"));
    let block_spec = ParameterBlockSpec {
        name: "cached-gradient".to_string(),
        design: DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(array![
            [1.0, 0.0],
            [0.0, 1.0]
        ])),
        offset: array![0.0, 0.0],
        penalties: Vec::new(),
        nullspace_dims: Vec::new(),
        initial_log_lambdas: Array1::zeros(0),
        initial_beta: None,
        gauge_priority: 100,
        jacobian_callback: None,
        stacked_design: None,
        stacked_offset: None,
    };
    let block_state = ParameterBlockState {
        beta: array![2.0, -1.0],
        eta: array![2.0, -1.0],
    };
    let penalty = Array2::<f64>::eye(2);
    // Choose the gradient so that `S beta - gradient` equals the target.
    let expected_residual = array![0.25, -0.5];
    let cached_gradient = penalty.dot(&block_state.beta) - &expected_residual;

    let residual = exact_newton_joint_kkt_residual_for_ift_from_cached_gradient(
        &OneBlockAlwaysErrorFamily,
        std::slice::from_ref(&block_spec),
        std::slice::from_ref(&block_state),
        std::slice::from_ref(&penalty),
        0.0,
        RidgePolicy::explicit_stabilization_full(),
        None,
        Some(&cached_gradient),
    )
    .expect("cached gradient path should not call family.evaluate()")
    .expect("cached gradient should produce a KKT residual");

    let assembled = residual.as_array();
    assert_relative_eq!(assembled[0], expected_residual[0], epsilon = 1e-12);
    assert_relative_eq!(assembled[1], expected_residual[1], epsilon = 1e-12);
}
/// The projected stationarity vector must be the penalized residual
/// `S beta - gradient`, not the raw score. With `S = diag(2, 3)`,
/// `beta = [10, -4]` and `gradient = [19.5, -12.25]`, the large terms
/// cancel and the residual is `[0.5, 0.25]`.
#[test]
fn projected_stationarity_vector_uses_penalized_residual_not_raw_score() {
    let block_spec = ParameterBlockSpec {
        name: "score-cancellation".to_string(),
        design: DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(array![
            [1.0, 0.0],
            [0.0, 1.0]
        ])),
        offset: array![0.0, 0.0],
        penalties: Vec::new(),
        nullspace_dims: Vec::new(),
        initial_log_lambdas: Array1::zeros(0),
        initial_beta: None,
        gauge_priority: 100,
        jacobian_callback: None,
        stacked_design: None,
        stacked_offset: None,
    };
    let block_state = ParameterBlockState {
        beta: array![10.0, -4.0],
        eta: array![10.0, -4.0],
    };
    let penalty = array![[2.0, 0.0], [0.0, 3.0]];
    let score = array![19.5, -12.25];

    // No constraints on the single block, so nothing is projected away.
    let stationarity = exact_newton_joint_projected_stationarity_vector_from_gradient(
        &score,
        std::slice::from_ref(&block_state),
        std::slice::from_ref(&block_spec),
        std::slice::from_ref(&penalty),
        0.0,
        RidgePolicy::explicit_stabilization_full(),
        &[None],
        None,
    )
    .expect("projected stationarity residual should assemble");

    assert_relative_eq!(stationarity[0], 0.5, epsilon = 1e-12);
    assert_relative_eq!(stationarity[1], 0.25, epsilon = 1e-12);
}
/// `ZeroPsiDerivativeOperator::new(n, p)` must report the requested
/// dimensions and return exact zeros of the right shape from every
/// product and row-chunk entry point, without offering a dense
/// materialization.
#[test]
fn zero_psi_derivative_operator_acts_as_zero_map() {
    /// True when every visited entry is exactly zero.
    fn all_zero<'a>(mut entries: impl Iterator<Item = &'a f64>) -> bool {
        entries.all(|entry| *entry == 0.0)
    }

    let n = 17usize;
    let p = 5usize;
    let op = ZeroPsiDerivativeOperator::new(n, p);
    assert_eq!(op.n_data(), n);
    assert_eq!(op.p_out(), p);

    // Nonzero probe vectors, so a zero output cannot come from the input.
    let coef_probe: Array1<f64> = (0..p).map(|k| 1.0 + k as f64).collect();
    let data_probe: Array1<f64> = (0..n).map(|k| 1.0 - 0.5 * k as f64).collect();

    // First-order products.
    let forward = op.forward_mul(0, &coef_probe.view()).expect("forward_mul");
    assert_eq!(forward.len(), n);
    assert!(all_zero(forward.iter()));
    let transposed = op
        .transpose_mul(0, &data_probe.view())
        .expect("transpose_mul");
    assert_eq!(transposed.len(), p);
    assert!(all_zero(transposed.iter()));

    // Second-order diagonal products.
    let forward_diag = op
        .forward_mul_second_diag(0, &coef_probe.view())
        .expect("forward_mul_second_diag");
    assert_eq!(forward_diag.len(), n);
    assert!(all_zero(forward_diag.iter()));
    let transposed_diag = op
        .transpose_mul_second_diag(0, &data_probe.view())
        .expect("transpose_mul_second_diag");
    assert_eq!(transposed_diag.len(), p);
    assert!(all_zero(transposed_diag.iter()));

    // Second-order cross products.
    let forward_cross = op
        .forward_mul_second_cross(0, 1, &coef_probe.view())
        .expect("forward_mul_second_cross");
    assert_eq!(forward_cross.len(), n);
    assert!(all_zero(forward_cross.iter()));
    let transposed_cross = op
        .transpose_mul_second_cross(0, 1, &data_probe.view())
        .expect("transpose_mul_second_cross");
    assert_eq!(transposed_cross.len(), p);
    assert!(all_zero(transposed_cross.iter()));

    // Row windows: shape is (window length, p) and contents are zero.
    let window_first = op.row_chunk_first(0, 3..7).expect("row_chunk_first");
    assert_eq!(window_first.dim(), (4, p));
    assert!(all_zero(window_first.iter()));
    let window_diag = op
        .row_chunk_second_diag(0, 0..n)
        .expect("row_chunk_second_diag");
    assert_eq!(window_diag.dim(), (n, p));
    assert!(all_zero(window_diag.iter()));
    let window_cross = op
        .row_chunk_second_cross(0, 1, 1..3)
        .expect("row_chunk_second_cross");
    assert_eq!(window_cross.dim(), (2, p));
    assert!(all_zero(window_cross.iter()));

    // Writing a single row must overwrite whatever the buffer held.
    let mut row_buffer = Array1::from_elem(p, 9.5);
    op.row_vector_first_into(0, 4, row_buffer.view_mut())
        .expect("row_vector_first_into");
    assert!(all_zero(row_buffer.iter()));

    // No dense materialization may be advertised: production hot paths
    // rely on this to avoid forming an (n, p) buffer.
    assert!(op.as_materializable().is_none());
}
/// A `(0, 0)`-shaped `x_psi` with no implicit operator is the sentinel
/// for "this ψ-derivative slot is identically zero" and must resolve to
/// `PsiDesignMap::Zero` carrying the requested `(n, p)`.
///
/// Motivation: at n = 320 000, p = 101 a dense `Array2::zeros((n, p))`
/// costs ≈ 0.24 GiB, and the spatial-adaptive baseline used to allocate
/// one per ψ coordinate (≈ 1.4 GiB of guaranteed zeros at six coords).
/// The sentinel gives the same exact-zero semantics in O(1) memory.
#[test]
fn spatial_adaptive_zero_xpsi_uses_zero_map_without_dense_allocation() {
    let n = 320_000usize;
    let p = 101usize;
    let policy = ResourcePolicy::default_library();
    let sentinel_deriv = CustomFamilyBlockPsiDerivative {
        penalty_index: None,
        x_psi: Array2::<f64>::zeros((0, 0)),
        s_psi: Array2::<f64>::zeros((0, 0)),
        s_psi_components: None,
        s_psi_penalty_components: None,
        x_psi_psi: None,
        s_psi_psi: None,
        s_psi_psi_components: None,
        s_psi_psi_penalty_components: None,
        implicit_operator: None,
        implicit_axis: 0,
        implicit_group_id: None,
    };

    let map = resolve_custom_family_x_psi_map(
        &sentinel_deriv,
        n,
        p,
        0..n,
        "spatial-adaptive zero sentinel",
        &policy,
    )
    .expect("resolve x_psi map for (0, 0)-sentinel deriv");

    if let PsiDesignMap::Zero { nrows, ncols } = &map {
        assert_eq!(*nrows, n);
        assert_eq!(*ncols, p);
    } else {
        panic!(
            "(0, 0) x_psi sentinel must resolve to PsiDesignMap::Zero, got {:?}",
            std::mem::discriminant(&map)
        );
    }
}
/// A derivative whose implicit operator is a `ZeroPsiDerivativeOperator`
/// must resolve to design maps (first and second order) that return
/// exact zeros of the expected shape.
#[test]
fn zero_psi_derivative_operator_resolves_to_zero_design_map() {
    let n = 12usize;
    let p = 4usize;
    let policy = ResourcePolicy::default_library();
    let zero_operator: Arc<dyn CustomFamilyPsiDerivativeOperator> =
        Arc::new(ZeroPsiDerivativeOperator::new(n, p));
    let deriv = CustomFamilyBlockPsiDerivative {
        penalty_index: None,
        x_psi: Array2::<f64>::zeros((0, 0)),
        s_psi: Array2::<f64>::zeros((0, 0)),
        s_psi_components: None,
        s_psi_penalty_components: None,
        x_psi_psi: None,
        s_psi_psi: None,
        s_psi_psi_components: None,
        s_psi_psi_penalty_components: None,
        implicit_operator: Some(zero_operator),
        implicit_axis: 0,
        implicit_group_id: None,
    };
    // Nonzero probe, so zero outputs are attributable to the map.
    let probe: Array1<f64> = (0..p).map(|k| 1.0 + k as f64).collect();

    // First-order map: full product and a three-row window.
    let first_order = resolve_custom_family_x_psi_map(&deriv, n, p, 0..n, "zero", &policy)
        .expect("resolve x_psi map");
    let product = first_order
        .forward_mul(probe.view())
        .expect("forward_mul map");
    assert_eq!(product.len(), n);
    assert!(product.iter().all(|&entry| entry == 0.0));
    let window = first_order.row_chunk(2..5).expect("row_chunk map");
    assert_eq!(window.dim(), (3, p));
    assert!(window.iter().all(|&entry| entry == 0.0));

    // Second-order map built from the same derivative on both sides.
    let second_order =
        resolve_custom_family_x_psi_psi_map(&deriv, &deriv, 0, n, p, 0..n, "zero", &policy)
            .expect("resolve x_psi_psi map");
    let product_second = second_order
        .forward_mul(probe.view())
        .expect("forward_mul second");
    assert_eq!(product_second.len(), n);
    assert!(product_second.iter().all(|&entry| entry == 0.0));
}
/// For a row-wise Kronecker ψ operator, asking for a row window must
/// give exactly the corresponding rows of the full result. The
/// first-order chunk is compared with the dense materialization; the
/// second-order diagonal and cross chunks are compared with their own
/// full-range (`0..n_data`) chunks.
#[test]
fn rowwise_kronecker_psi_row_chunks_are_window_consistent() {
    let first_order = array![[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]];
    let second_order_diag = array![[0.5, 1.0], [1.5, 2.0], [2.5, 3.0]];
    let second_order_cross = array![[-1.0, 0.25], [-1.5, 0.5], [-2.0, 0.75]];
    let embedded = build_embedded_dense_psi_operator(
        &first_order,
        &second_order_diag,
        Some(&vec![(1, second_order_cross)]),
        0..2,
        2,
        0,
    )
    .expect("embedded dense base");
    let time_factors = vec![
        Arc::new(array![[1.0, 0.0], [0.5, 1.0], [1.5, -0.5]]),
        Arc::new(array![[0.25, 2.0], [-1.0, 0.75], [0.0, 1.25]]),
    ];
    let op = build_rowwise_kronecker_psi_operator(embedded, time_factors)
        .expect("rowwise kronecker psi operator");
    let dense_reference = op
        .as_materializable()
        .expect("rowwise operator dense reference");
    let window = 1..5;

    // First order: window vs. rows of the dense materialization.
    let first_dense = dense_reference.materialize_first(0).expect("dense first");
    let first_chunk = op.row_chunk_first(0, window.clone()).expect("chunk first");
    assert_eq!(
        first_chunk,
        first_dense.slice(ndarray::s![window.clone(), ..]).to_owned()
    );

    // Second-order diagonal: window vs. rows of the full-range chunk.
    let diag_full = op
        .row_chunk_second_diag(0, 0..op.n_data())
        .expect("full row-chunk diag");
    let diag_chunk = op
        .row_chunk_second_diag(0, window.clone())
        .expect("chunk diag");
    assert_eq!(
        diag_chunk,
        diag_full.slice(ndarray::s![window.clone(), ..]).to_owned()
    );

    // Second-order cross (0, 1): window vs. rows of the full-range chunk.
    let cross_full = op
        .row_chunk_second_cross(0, 1, 0..op.n_data())
        .expect("full row-chunk cross");
    let cross_chunk = op
        .row_chunk_second_cross(0, 1, window.clone())
        .expect("chunk cross");
    assert_eq!(
        cross_chunk,
        cross_full.slice(ndarray::s![window, ..]).to_owned()
    );
}
/// Pins four outcomes of `update_joint_trust_region_radius`. Judging by
/// the noise-floor test next to this one, the last three arguments are
/// (actual reduction, predicted reduction, objective scale); the first
/// two look like (current radius, step norm) — TODO confirm against the
/// function signature.
#[test]
fn joint_trust_region_radius_update_accept_reject_logic() {
    // Perfect agreement (rho = 1): accepted, and the radius doubles.
    let grown = update_joint_trust_region_radius(1.0, 1.0, 2.0, 2.0, 1.0);
    assert!(grown.accepted);
    assert!((grown.rho - 1.0).abs() < 1.0e-12);
    assert!((grown.radius - 2.0).abs() < 1.0e-12);
    assert_eq!(grown.decision.label(), "grow_at_boundary");

    // Negative actual reduction: rejected, radius shrinks to 0.25.
    let shrunk = update_joint_trust_region_radius(1.0, 0.5, -0.1, 2.0, 1.0);
    assert!(!shrunk.accepted);
    assert!(shrunk.rho < 0.0);
    assert!((shrunk.radius - 0.25).abs() < 1.0e-12);
    assert_eq!(shrunk.decision.label(), "shrink_reject");

    // Rejected with a second argument (1e-3) far below the radius: the
    // new radius (5e-4) must fall below that value as well.
    let shrunk_inside = update_joint_trust_region_radius(1.0, 1.0e-3, -0.1, 2.0, 1.0);
    assert!(!shrunk_inside.accepted);
    assert!(
        shrunk_inside.radius < 1.0e-3,
        "a rejected in-radius step must be outside the next trust region"
    );
    assert!((shrunk_inside.radius - 5.0e-4).abs() < 1.0e-12);
    assert_eq!(shrunk_inside.decision.label(), "shrink_reject");

    // Poor but positive agreement (rho = 0.1): accepted, radius shrinks.
    let marginal = update_joint_trust_region_radius(1.0, 0.5, 0.1, 1.0, 1.0);
    assert!(marginal.accepted);
    assert!((marginal.rho - 0.1).abs() < 1.0e-12);
    assert!((marginal.radius - 0.25).abs() < 1.0e-12);
    assert_eq!(marginal.decision.label(), "shrink_marginal_accept");
}
#[test]
fn joint_trust_region_noise_floor_accepts_round_off_negative_actual() {
    // Near convergence at a large objective scale, both the predicted
    // decrease and the realized change sit below the noise floor, so the
    // sign of `actual` is decided by round-off. The update must accept
    // with rho ≈ 1 instead of treating a noise-level sign flip as model
    // failure. Mirrors the noise-floor branch in `src/solver/pirls.rs`.
    let scale = 1.66e5;
    let floor = scale * 1e-14;
    let predicted_decrease = floor * 0.1;
    let realized_decrease = -floor * 0.5;

    let outcome =
        update_joint_trust_region_radius(1.0, 0.05, realized_decrease, predicted_decrease, scale);

    assert!(
        outcome.accepted,
        "sub-noise-floor sign flip must not reject as failure"
    );
    assert!((outcome.rho - 1.0).abs() < 1.0e-12);
}
#[test]
fn joint_trust_region_noise_floor_rejects_genuine_increase() {
    // A sub-floor predicted reduction does not excuse a real objective
    // increase: an increase of 1.0 is far above the noise floor and must
    // be rejected as model failure, not waved through as round-off.
    let scale = 1.66e5;
    let floor = scale * 1e-14;
    let predicted_decrease = floor * 0.1;
    let realized_decrease = -1.0;

    let outcome =
        update_joint_trust_region_radius(1.0, 0.5, realized_decrease, predicted_decrease, scale);

    assert!(
        !outcome.accepted,
        "objective increase beyond noise must reject"
    );
    // The ratio is reported as negative infinity in this branch.
    assert!(outcome.rho.is_infinite() && outcome.rho < 0.0);
}
#[test]
fn joint_objective_roundoff_slack_accepts_large_scale_wobble() {
    // A trial objective that exceeds the old one by ~2e-10 at scale ~1e5
    // must fall within the round-off slack.
    let before = 1.218530e5;
    let after = before + 2.183e-10;
    let ceiling = before + joint_objective_roundoff_slack(before, after);
    assert!(
        after <= ceiling,
        "sub-nanounit objective wobble at large scale should not burn all trust attempts"
    );
}
#[test]
fn joint_objective_floor_only_accepts_sub_tolerance_model_steps() {
    let base_objective = 1.218942e5_f64;
    let tol = 1e-6 * (1.0 + base_objective.abs());
    let tiny_predicted = 9.481e-15;

    // Case 1: noise-level objective increase with a negligible predicted
    // reduction. The floor is reached.
    let wobble_actual = -3.783e-10;
    let wobble_trial = base_objective - wobble_actual;
    let floor_on_wobble = joint_objective_floor_reached(
        base_objective,
        wobble_trial,
        wobble_actual,
        tiny_predicted,
        tol,
    );
    assert!(
        floor_on_wobble,
        "the repeated large-scale roundoff wobble should terminate immediately"
    );

    // Case 2: a genuine increase of 2.0 is not the floor.
    let floor_on_real_increase = joint_objective_floor_reached(
        base_objective,
        base_objective + 2.0,
        -2.0,
        tiny_predicted,
        tol,
    );
    assert!(
        !floor_on_real_increase,
        "real objective increases must still be rejected"
    );

    // Case 3: same wobble, but the model still predicts progress well above
    // tolerance. The floor exit must not mask that.
    let floor_with_large_prediction = joint_objective_floor_reached(
        base_objective,
        wobble_trial,
        wobble_actual,
        10.0 * tol,
        tol,
    );
    assert!(
        !floor_with_large_prediction,
        "non-negligible predicted progress must not be hidden by the floor exit"
    );

    // Case 4: the guard is asymmetric. A positive but noise-level
    // `actual_reduction` must NOT count as the floor. At rank-deficient
    // optima the outer-gradient FD identity
    // (`outer_lamlgradient_matches_finite_differencewhen_joint_exact_path_is_active`,
    // inner_tol=1e-12) depends on the trust-region loop taking the same
    // number of attempts at neighbouring λ probes; exiting early on the
    // probe where round-off happened to land positive decorrelates the
    // null-space drift.
    let lucky_actual = 3.783e-10_f64;
    let lucky_trial = base_objective - lucky_actual;
    let floor_on_positive_noise = joint_objective_floor_reached(
        base_objective,
        lucky_trial,
        lucky_actual,
        tiny_predicted,
        tol,
    );
    assert!(
        !floor_on_positive_noise,
        "positive-noise reductions must NOT trigger the floor; symmetric exit breaks rank-deficient FD identity"
    );
}
#[test]
fn joint_inner_convergence_rejects_objective_flat_non_kkt_stall() {
    // Reproduces the shape of the bad 0.1.79 log line:
    //
    //   obj=4.472714e5 Δobj=5.381e-2 |δ|∞=2.794e-2
    //   residual=5.980e1 tol=4.473e-1
    //
    // Objective change and step are both flat at this scale, yet the KKT
    // residual is 134x its tolerance. Declaring that an inner optimum
    // invalidates the envelope-theorem outer gradient, which showed up as
    // outer BFGS objective stalls with |g|≈1e14-1e16.
    let obj = 4.472714e5_f64;
    let tol_scale = 1.0e-6_f64;
    let delta_obj = 5.381e-2_f64;
    let step_inf = 2.794e-2_f64;
    let kkt_residual = 5.980e1_f64;
    let step_tol = 1.242e-3_f64;

    let kkt_tol = tol_scale * (1.0 + obj);
    let obj_tol = kkt_tol;

    // The legacy predicate looked only at objective and step flatness.
    let objective_is_flat = delta_obj <= obj_tol;
    let step_is_flat = step_inf <= obj_tol.sqrt().max(step_tol);
    let legacy_would_accept = objective_is_flat && step_is_flat;
    assert!(
        legacy_would_accept,
        "the historical objective-flat/step-flat predicate would have accepted this stalled inner solve"
    );

    // The KKT-based predicate must refuse both the logged residual and a
    // near miss at 1.5x tolerance.
    assert!(
        !joint_inner_kkt_converged(kkt_residual, kkt_tol),
        "inner convergence must require KKT residual <= tolerance"
    );
    let near_miss = 1.5 * kkt_tol;
    assert!(
        !joint_inner_kkt_converged(near_miss, kkt_tol),
        "near-miss residual slack would still invalidate the outer envelope gradient"
    );
}
#[test]
fn joint_trust_region_block_metric_does_not_starve_unrelated_blocks() {
    // Three coefficient blocks laid out back to back: time, marginal, log.
    const TIME_W: usize = 12;
    const MARG_W: usize = 11;
    const LOG_W: usize = 10;
    const P: usize = TIME_W + MARG_W + LOG_W;
    let marg0 = TIME_W;
    let log0 = TIME_W + MARG_W;

    // Diagonal Hessian `h` and gradient `g`.
    let mut h = Array2::<f64>::zeros((P, P));
    let mut g = Array1::<f64>::zeros(P);

    // Time block: one very stiff coordinate, the rest ordinary.
    h[[0, 0]] = 2.24e8;
    g[0] = -5.6e8;
    for col in 1..TIME_W {
        let t = col as f64;
        h[[col, col]] = 1.0 + 0.3 * t;
        g[col] = -0.3 - 0.07 * t;
    }

    // Marginal block: ordinary curvature, constant gradient.
    for offset in 0..MARG_W {
        let col = marg0 + offset;
        h[[col, col]] = 1.2 + 0.2 * offset as f64;
        g[col] = -0.9;
    }

    // Log block: one nearly flat coordinate, the rest ordinary.
    h[[log0, log0]] = 1.0e-5;
    g[log0] = -2.173;
    for offset in 1..LOG_W {
        let col = log0 + offset;
        h[[col, col]] = 1.5 + 0.1 * offset as f64;
        g[col] = -0.4;
    }

    // Coordinate-wise Newton step for the diagonal system.
    let newton = Array1::from_iter((0..P).map(|col| -g[col] / h[[col, col]]));

    // Infinity norm, and the linearized KKT residual |g + H·step|∞ relative
    // to 1 + |g|∞.
    let inf_norm = |v: &Array1<f64>| v.iter().map(|x| x.abs()).fold(0.0_f64, f64::max);
    let relative_linearized_kkt =
        |step: &Array1<f64>| inf_norm(&(&g + &h.dot(step))) / (1.0 + inf_norm(&g));

    // Baseline: one global L2 cap of 20 over the concatenated step.
    let mut raw_global = newton.clone();
    let raw_norm = raw_global.iter().map(|v| v * v).sum::<f64>().sqrt();
    if raw_norm.is_finite() && raw_norm > 20.0 {
        let shrink = 20.0 / raw_norm;
        raw_global.mapv_inplace(|v| v * shrink);
    }
    let raw_linearized = relative_linearized_kkt(&raw_global);
    assert!(
        raw_linearized > 0.99,
        "raw concatenated L2 truncation should reproduce the starvation mechanism"
    );

    // Block-metric truncation: the time and marginal blocks receive their
    // full metric norms as radii; only the log block is capped at 20.
    let ranges = vec![(0, marg0), (marg0, log0), (log0, P)];
    let metric_diag = h.diag().to_owned();
    let full_block_norms =
        joint_trust_region_block_metric_norms(&newton, &ranges, &metric_diag);
    let block_radii = vec![full_block_norms[0], full_block_norms[1], 20.0];
    let mut block_metric = newton.clone();
    truncate_joint_step_to_block_metric_radii(
        &mut block_metric,
        &ranges,
        &metric_diag,
        &block_radii,
    );
    let block_linearized = relative_linearized_kkt(&block_metric);
    assert!(
        block_linearized < 1.0e-6,
        "block-local curvature metric must let the time block neutralize its KKT defect; got {block_linearized:.3e}"
    );
}
#[test]
fn shrink_active_joint_block_trust_radii_strictly_decreases_max_radius() {
    // Regression for the joint-Newton fully-rejected stall. Before the fix,
    // with a boundary block already at the 1e-12 radius floor and an
    // interior block holding the max,
    // `shrink_active_joint_block_trust_radii` returned the same
    // `max(block_radii)` on every call. The trust region never shrank, the
    // dogleg recomputed an identical joint δ, and the inner solver burned
    // `inner_loop_hard_ceiling` cycles until the 8-cycle stall guard
    // bailed out. Every call must strictly decrease the joint trust radius
    // until the floor.
    //
    // Block #0 is interior (step well inside its radius); block #1 sits at
    // the radius floor with its step on the boundary. Before the fix only
    // block #1 participated, re-clamped to 1e-12, and the returned max
    // stayed at 1.0.
    let mut radii = vec![1.0, 1.0e-12];
    let step_norms = vec![1.0e-3, 1.0e-12];

    let old_max = radii.iter().copied().fold(0.0_f64, f64::max);
    let new_max = shrink_active_joint_block_trust_radii(&mut radii, &step_norms, 0.25);
    assert!(
        new_max < old_max,
        "joint trust radius must strictly decrease when a step is rejected (was {old_max:.3e}, now {new_max:.3e})"
    );

    // The interior block's radius must end up below its current step norm so
    // the next dogleg step in that block is forced strictly smaller.
    let interior_radius = radii[0];
    let interior_step = step_norms[0];
    assert!(
        interior_radius < interior_step,
        "interior block radius must drop below its step norm to force a strictly smaller next step (radius {:.3e}, step {:.3e})",
        interior_radius,
        interior_step
    );
}
#[test]
fn shrink_active_joint_block_trust_radii_pulls_radius_below_step_norm() {
    // On rejection, `update_joint_trust_region_radius` pulls the new radius
    // below `0.5 * step_norm` so the next step is provably smaller. The
    // reject-path block shrink has to match that: otherwise an interior
    // block with `step_norm << factor * radius` retakes the identical
    // Newton step on the next dogleg attempt and the globalization is
    // degenerate.
    let mut radii = vec![1.0];
    let step_norms = vec![1.0e-3];

    let new_max = shrink_active_joint_block_trust_radii(&mut radii, &step_norms, 0.25);

    let half_step = 0.5 * step_norms[0];
    assert!(
        new_max <= half_step,
        "shrunken radius must be ≤ 0.5 · step_norm to force a strictly smaller next step (was {new_max:.3e}, step {:.3e})",
        step_norms[0]
    );
}
#[test]
fn blockwise_trust_region_uses_penalized_metric_not_raw_coefficient_size() {
    // Single three-column block with no penalties.
    let block_design = DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(
        Array2::<f64>::zeros((1, 3)),
    ));
    let spec = ParameterBlockSpec {
        name: "single_block".to_string(),
        design: block_design,
        offset: Array1::zeros(1),
        penalties: vec![],
        nullspace_dims: vec![],
        initial_log_lambdas: Array1::zeros(0),
        initial_beta: None,
        gauge_priority: 100,
        jacobian_callback: None,
        stacked_design: None,
        stacked_offset: None,
    };

    // Diagonal curvature whose third coordinate is nearly flat (1e-10).
    let curvature: Array2<f64> = array![[1.0, 0.0, 0.0], [0.0, 1.0, 0.0], [0.0, 0.0, 1.0e-10]];
    let work = BlockWorkingSet::ExactNewton {
        gradient: array![0.0, 0.0, 0.0],
        hessian: SymmetricMatrix::Dense(curvature),
    };
    let s_lambda = Array2::<f64>::zeros((3, 3));

    // Proposed step: modest in the first two coordinates, huge along the
    // nearly flat one.
    let proposal: Array1<f64> = array![2.0, -1.0, 2.0e5];
    let radius = 20.0_f64;

    // A raw coefficient-space ∞-norm cap would crush the ordinary coordinates.
    let proposal_inf = proposal.iter().map(|v: &f64| v.abs()).fold(0.0_f64, f64::max);
    let capped_by_raw_inf = &proposal * (radius / proposal_inf);
    assert!(
        capped_by_raw_inf[0].abs() < 1.0e-3,
        "the old raw coefficient cap would starve ordinary coordinates inside the block"
    );

    let (truncated, truncated_norm) = truncate_block_step_to_metric_radius(
        &spec,
        &work,
        &s_lambda,
        proposal,
        radius,
        0.0,
        RidgePolicy::explicit_stabilization_pospart(),
    )
    .expect("block metric truncation should succeed");
    assert!(
        truncated_norm < radius,
        "the near-null coordinate is large in beta-space but small in the block's penalized-Hessian metric"
    );

    // The step must come back unchanged, coordinate by coordinate.
    let expected = [2.0, -1.0, 2.0e5];
    let tolerance = [1.0e-12, 1.0e-12, 1.0e-6];
    let step_untouched = (0..3).all(|i| (truncated[i] - expected[i]).abs() < tolerance[i]);
    assert!(
        step_untouched,
        "blockwise trust regions must size steps in objective curvature units, not raw coefficient units"
    );
}
#[test]
fn blockwise_trust_region_never_reverts_to_raw_beta_norm_on_indefinite_curvature() {
    // Single three-column block with no penalties.
    let block_design = DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(
        Array2::<f64>::zeros((1, 3)),
    ));
    let spec = ParameterBlockSpec {
        name: "single_block".to_string(),
        design: block_design,
        offset: Array1::zeros(1),
        penalties: vec![],
        nullspace_dims: vec![],
        initial_log_lambdas: Array1::zeros(0),
        initial_beta: None,
        gauge_priority: 100,
        jacobian_callback: None,
        stacked_design: None,
        stacked_offset: None,
    };

    // Diagonal curvature with a slightly negative third entry, so the
    // matrix is indefinite.
    let curvature: Array2<f64> = array![[1.0, 0.0, 0.0], [0.0, 1.0, 0.0], [0.0, 0.0, -1.0e-8]];
    let work = BlockWorkingSet::ExactNewton {
        gradient: array![0.0, 0.0, 0.0],
        hessian: SymmetricMatrix::Dense(curvature),
    };
    let s_lambda = Array2::<f64>::zeros((3, 3));
    let proposal: Array1<f64> = array![2.0, -1.0, 2.0e5];
    let radius = 20.0_f64;

    // δᵀ(Hδ) with Hδ = [2, -1, -2e-3] written out: negative, so the fixture
    // lands in the branch that used to be handled as non-SPD.
    let curvature_times_proposal = array![2.0, -1.0, -2.0e-3];
    let old_quadratic = proposal.dot(&curvature_times_proposal);
    assert!(
        old_quadratic < 0.0,
        "fixture must hit the historical non-SPD branch"
    );

    let (truncated, truncated_norm) = truncate_block_step_to_metric_radius(
        &spec,
        &work,
        &s_lambda,
        proposal,
        radius,
        0.0,
        RidgePolicy::explicit_stabilization_pospart(),
    )
    .expect("block metric truncation should succeed");
    assert!(
        truncated_norm < radius,
        "indefinite curvature must still use the positive penalized diagonal metric, not raw beta length"
    );

    // The step must come back unchanged, coordinate by coordinate.
    let expected = [2.0, -1.0, 2.0e5];
    let tolerance = [1.0e-12, 1.0e-12, 1.0e-6];
    let step_untouched = (0..3).all(|i| (truncated[i] - expected[i]).abs() < tolerance[i]);
    assert!(
        step_untouched,
        "non-SPD local curvature must not resurrect coefficient-space trust-region scaling"
    );
}
#[test]
fn joint_trust_region_rosenbrock_like_quadratic_is_armijo_safe() {
    // Quadratic model of Rosenbrock near its valley, in (x, y):
    //   f ≈ 0.5 * [dx, dy]' H [dx, dy],  H = [[802, -400], [-400, 200]],
    // with a small ridge on the (y, y) entry. The gradient is chosen so the
    // full Newton step is [1, 1], which lies outside the 0.25 radius, so the
    // step is truncated before the objective is evaluated.
    let l2 = |v: &Array1<f64>| v.iter().map(|x| x * x).sum::<f64>().sqrt();
    let radius = 0.25;

    let hessian = array![[802.0, -400.0], [-400.0, 200.1]];
    let newton_step = array![1.0, 1.0];
    let gradient = -hessian.dot(&newton_step);
    let rhs = -&gradient;

    // Scale the Newton step back onto the trust-region boundary.
    let newton_len = l2(&newton_step);
    assert!(newton_len > radius);
    let mut step = newton_step.clone();
    step.mapv_inplace(|v| v * (radius / newton_len));
    let step_norm = l2(&step);
    assert!(step_norm <= radius + 1.0e-12);

    // On an exact quadratic the predicted and realized reductions coincide.
    let h_step = hessian.dot(&step);
    let predicted = joint_quadratic_predicted_reduction(&rhs, &h_step, &step);
    let objective_before = 0.0;
    let objective_after = gradient.dot(&step) + 0.5 * step.dot(&h_step);
    let actual = objective_before - objective_after;
    assert!(predicted > 0.0);
    assert!((predicted - actual).abs() < 1.0e-10);

    let outcome =
        update_joint_trust_region_radius(radius, step_norm, actual, predicted, objective_before);
    assert!(outcome.accepted);
    assert!(objective_after < objective_before);
}
// Inline RED REPRO moved to tests/joint_newton_isotropic_tr_starvation.rs
// so it survives in-progress refactors of the surrounding test
// support module (this `mod tests { }` currently does not compile due
// to `crate::test_support::*` / `test_outerobjective_andgradient` WIP).
/// Three-block synthetic fixture whose joint penalized Hessian is
/// rank-deficient inside block 2: H is block-diagonal with two 3x3 identity
/// blocks and a rank-1 third block, and every `s_lambda` is zero so the
/// penalty cannot lift the deficiency. The gradient sits on block 2's null
/// directions, so block 2 dominates the stationarity residual.
///
/// The report must (a) diagnose `RankDeficientHPen`, (b) record a positive
/// nullity, and (c) name block 2 as the block carrying the residual.
#[test]
fn kkt_refusal_report_classifies_rank_deficient_hpen_third_block() {
    let block_widths = [3usize, 3, 3];
    let names = ["block_a", "block_b", "block_c_rank_deficient"];
    let total_p: usize = block_widths.iter().sum();
    let block_count = block_widths.len();

    // Per-block fixture pieces: zero design, zero β/η, zero penalty.
    let mut specs: Vec<ParameterBlockSpec> = Vec::with_capacity(block_count);
    let mut states: Vec<ParameterBlockState> = Vec::with_capacity(block_count);
    let mut s_lambdas: Vec<Array2<f64>> = Vec::with_capacity(block_count);
    let mut ranges: Vec<(usize, usize)> = Vec::with_capacity(block_count);
    let mut cursor = 0usize;
    for (&width, name) in block_widths.iter().zip(names.iter()) {
        let span = (cursor, cursor + width);
        cursor = span.1;
        ranges.push(span);
        specs.push(ParameterBlockSpec {
            name: (*name).to_string(),
            design: DesignMatrix::from(Array2::<f64>::zeros((1, width))),
            offset: Array1::zeros(1),
            penalties: vec![],
            nullspace_dims: vec![],
            initial_log_lambdas: Array1::zeros(0),
            initial_beta: None,
            gauge_priority: 100,
            jacobian_callback: None,
            stacked_design: None,
            stacked_offset: None,
        });
        states.push(ParameterBlockState {
            beta: Array1::zeros(width),
            eta: Array1::zeros(1),
        });
        s_lambdas.push(Array2::<f64>::zeros((width, width)));
    }

    // H = I(3) ⊕ I(3) ⊕ e0 e0ᵀ: ones on diagonal entries 0..=6, zeros on
    // 7 and 8, so the third block has rank 1 and nullity 2.
    let mut h = Array2::<f64>::zeros((total_p, total_p));
    for k in 0..=6usize {
        h[[k, k]] = 1.0;
    }
    let source = JointHessianSource::Dense(h);

    // Put the gradient on block 2's null directions (rows 7, 8), plus a
    // negligible entry in block 0. With zero s_lambdas and β = 0 the
    // stationarity residual is -gradient, so block 2 carries the dominant
    // residual mass.
    let mut joint_grad = Array1::<f64>::zeros(total_p);
    for (row, value) in [(7usize, 5.0), (8, 3.0), (0, 1.0e-6)] {
        joint_grad[row] = value;
    }

    let cached_active_sets: Vec<Option<Vec<usize>>> = vec![None; block_count];
    let block_constraints: Vec<Option<LinearInequalityConstraints>> = vec![None; block_count];
    let math = JointNewtonMathDiagnostic {
        old_kkt_inf: 5.0,
        linearized_next_kkt_inf: 4.9,
        predicted_reduction: 1.0e-4,
        actual_reduction: 1.0e-4,
        trust_ratio: 1.0,
        step_inf: 1.0e-9,
        proposal_inf: 1.0e-3,
    };
    let residual_tol = 1.0e-6;
    let projected_residual_inf = 5.0;
    let report = compute_kkt_refusal_report(
        42,
        &states,
        &specs,
        &s_lambdas,
        &ranges,
        Some(&joint_grad),
        &cached_active_sets,
        &block_constraints,
        Some(&source),
        total_p,
        0.0,
        RidgePolicy::explicit_stabilization_full(),
        1.0e-9,
        1.0e-3,
        1.0,
        residual_tol,
        1.0e-6,
        1.0e-6,
        1.0e-8,
        projected_residual_inf,
        Some(&math),
    );

    // (a) Diagnosis and (b) nullity.
    assert_eq!(
        report.diagnosis,
        KktRefusalDiagnosis::RankDeficientHPen,
        "block-2 rank-1 H_pen with zero s_lambdas must classify as RankDeficientHPen, got {:?}",
        report.diagnosis,
    );
    assert!(
        report.hpen_nullity_at_rank_tol > 0,
        "rank-1 block embedded in 9x9 block-diagonal H must register nullity > 0, got {}",
        report.hpen_nullity_at_rank_tol,
    );

    // (c) Carrying block, by index and by name.
    assert_eq!(
        report.block_carrying_residual,
        Some(2),
        "block 2 must carry the largest |∇L − Sβ|∞ component; got {:?}, residuals={:?}",
        report.block_carrying_residual,
        report.block_residual_inf,
    );
    assert_eq!(report.block_names.len(), block_count);
    assert_eq!(
        report.block_names[2], "block_c_rank_deficient",
        "carrying-block name should be the third block",
    );

    // Rendered output: structured log and bubbled error.
    let structured = report.format_structured_log(residual_tol);
    assert!(
        structured.contains("rank_deficient_H_pen"),
        "structured log must surface the diagnosis label",
    );
    assert!(
        report
            .format_bubbled_error()
            .contains("block_c_rank_deficient"),
        "bubbled error must name the carrying block by spec.name",
    );
    assert!(
        report
            .format_bubbled_error()
            .contains("structural or numerical null direction"),
        "rank-deficient refusals should no longer emit the old polynomial-only guidance",
    );
}
/// Round-trip check between formatter and parser: for each variant, the
/// text from `as_str()`, placed in the `diagnosis: <label>` slot of a
/// bubbled-error string, must be recovered by `parse_from_error`.
///
/// seed-accounting's `InnerStatus` classifier extracts diagnoses from
/// bubbled error strings with that parser. If a label drifts between the
/// two sides, the classifier silently falls back to "unknown" and the
/// early-exit canary degrades to a generic non-converged result.
#[test]
fn kkt_refusal_diagnosis_string_round_trip_through_bubbled_error_parser() {
    let variants = [
        KktRefusalDiagnosis::RankDeficientHPen,
        KktRefusalDiagnosis::PhantomMultiplierWithWellConditionedH,
        KktRefusalDiagnosis::ActiveSetIncomplete,
        KktRefusalDiagnosis::AliasingDetectedAtFit,
    ];
    for variant in variants {
        let label = variant.as_str();
        // Same trailing layout `format_bubbled_error` emits: the label is
        // the last thing in the string, right after `; diagnosis: `.
        let bubbled = format!(
            "coupled exact-joint inner solve exited the joint Newton path before convergence \
             — cycle=7 cert REFUSED: residual=1.0e-2 > tol=1.0e-6; \
             diagnosis: {label}"
        );
        let recovered = KktRefusalDiagnosis::parse_from_error(&bubbled);
        assert_eq!(
            recovered,
            Some(variant),
            "label '{label}' must round-trip through parse_from_error; got {:?}",
            recovered,
        );
    }
}
#[test]
fn kkt_refusal_guidance_distinguishes_marginal_slope_coupling_from_polynomial_nullspace() {
    // Phantom-multiplier guidance: names the coupling mechanism and
    // explicitly contrasts it with the polynomial-nullspace failure.
    let phantom_text = KktRefusalDiagnosis::PhantomMultiplierWithWellConditionedH.guidance();
    for fragment in [
        "marginal/logslope coupling",
        "rather than a",
        "Matérn/Duchon polynomial-nullspace failure",
    ] {
        assert!(phantom_text.contains(fragment));
    }

    // Active-set guidance: a certification failure, explicitly not a
    // polynomial-nullspace diagnosis.
    let active_text = KktRefusalDiagnosis::ActiveSetIncomplete.guidance();
    for fragment in [
        "active-set certification failure",
        "not a polynomial-nullspace diagnosis",
    ] {
        assert!(active_text.contains(fragment));
    }

    // Aliasing guidance: tells the user what to do about it.
    let alias_text = KktRefusalDiagnosis::AliasingDetectedAtFit.guidance();
    assert!(alias_text.contains("drop or reparameterize"));
}
/// Regression canary for the large-scale rank-deficient-H_pen failure mode.
///
/// The fixture has three blocks, a block-diagonal H whose third block is
/// entirely zero, and zero `s_lambdas`. It must be diagnosed as
/// `RankDeficientHPen` with a nullity that reflects the structural rank
/// deficiency. This is the failure mode that `nullspace-lead`'s
/// smooth-construction reparameterization (absorbing polynomial null spaces
/// into the parametric block) is meant to remove.
///
/// The post-rework counterpart, the same fixture with a full-rank H, is
/// `rank_deficient_hpen_canary_disappears_after_nullspace_absorption`.
#[test]
fn rank_deficient_hpen_canary_fires_on_large_scale_shaped_failure() {
    let block_widths = [4usize, 4, 4];
    let names = ["location_block", "scale_block", "marginal_slope_block"];
    let total_p: usize = block_widths.iter().sum();
    let block_count = block_widths.len();

    // Per-block fixture pieces: zero design, zero β/η, zero penalty.
    let mut specs: Vec<ParameterBlockSpec> = Vec::with_capacity(block_count);
    let mut states: Vec<ParameterBlockState> = Vec::with_capacity(block_count);
    let mut s_lambdas: Vec<Array2<f64>> = Vec::with_capacity(block_count);
    let mut ranges: Vec<(usize, usize)> = Vec::with_capacity(block_count);
    let mut cursor = 0usize;
    for (&width, name) in block_widths.iter().zip(names.iter()) {
        let span = (cursor, cursor + width);
        cursor = span.1;
        ranges.push(span);
        specs.push(ParameterBlockSpec {
            name: (*name).to_string(),
            design: DesignMatrix::from(Array2::<f64>::zeros((1, width))),
            offset: Array1::zeros(1),
            penalties: vec![],
            nullspace_dims: vec![],
            initial_log_lambdas: Array1::zeros(0),
            initial_beta: None,
            gauge_priority: 100,
            jacobian_callback: None,
            stacked_design: None,
            stacked_offset: None,
        });
        states.push(ParameterBlockState {
            beta: Array1::zeros(width),
            eta: Array1::zeros(1),
        });
        s_lambdas.push(Array2::<f64>::zeros((width, width)));
    }

    // H = I(4) ⊕ I(4) ⊕ 0. The marginal-slope block is left as the zero
    // matrix: a 4-D null space with no Hessian curvature that the (zero)
    // penalty does not constrain, giving nullity 4.
    let mut h = Array2::<f64>::zeros((total_p, total_p));
    for k in 0..8usize {
        h[[k, k]] = 1.0;
    }
    let source = JointHessianSource::Dense(h);

    // All gradient mass on the marginal-slope block. With β = 0 and S = 0
    // the stationarity residual there is −gradient, so the carrying block
    // is unambiguous.
    let mut joint_grad = Array1::<f64>::zeros(total_p);
    for (row, value) in [(8usize, 4.2), (9, 1.7), (10, -2.5), (11, 0.9)] {
        joint_grad[row] = value;
    }

    let cached_active_sets: Vec<Option<Vec<usize>>> = vec![None; block_count];
    let block_constraints: Vec<Option<LinearInequalityConstraints>> = vec![None; block_count];
    let math = JointNewtonMathDiagnostic {
        old_kkt_inf: 4.2,
        linearized_next_kkt_inf: 4.2,
        predicted_reduction: 0.0,
        actual_reduction: 0.0,
        trust_ratio: 0.0,
        step_inf: 0.0,
        proposal_inf: 1.0e-3,
    };
    let report = compute_kkt_refusal_report(
        123,
        &states,
        &specs,
        &s_lambdas,
        &ranges,
        Some(&joint_grad),
        &cached_active_sets,
        &block_constraints,
        Some(&source),
        total_p,
        0.0,
        RidgePolicy::explicit_stabilization_full(),
        0.0,
        1.0e-3,
        1.0,
        1.0e-6,
        1.0e-6,
        1.0e-6,
        0.0,
        4.2,
        Some(&math),
    );

    assert_eq!(
        report.diagnosis,
        KktRefusalDiagnosis::RankDeficientHPen,
        "large-scale-shaped marginal-slope failure must classify as RankDeficientHPen \
         (this is the canary nullspace-lead's smooth-construction rework targets)",
    );
    assert!(
        report.hpen_nullity_at_rank_tol >= 4,
        "fully degenerate marginal-slope block (4 zero eigenvalues) must contribute \
         nullity >= 4; got {}",
        report.hpen_nullity_at_rank_tol,
    );
    assert_eq!(
        report.block_carrying_residual,
        Some(2),
        "marginal_slope_block (idx 2) must carry the residual; got {:?}, residuals={:?}",
        report.block_carrying_residual,
        report.block_residual_inf,
    );

    // The rendered error must parse back to the same diagnosis and carry
    // the marginal-slope-specific hint.
    let bubbled = report.format_bubbled_error();
    let reparsed = KktRefusalDiagnosis::parse_from_error(&bubbled);
    assert_eq!(
        reparsed,
        Some(KktRefusalDiagnosis::RankDeficientHPen),
        "canary's bubbled-error string must parse back via the classifier's parser",
    );
    assert!(
        bubbled.contains("marginal-slope fits can also expose callback-owned weak directions"),
        "BMS-shaped refusal should mention the callback-owned weak-direction mechanism"
    );
}
/// Post-fix half of the canary. Once `nullspace-lead`'s smooth
/// reparameterization absorbs polynomial null spaces into the parametric
/// block, the marginal-slope synthetic (rewritten on a full-rank
/// reparameterized basis, with the absorbed null columns moved to a separate
/// identifiable block) should fit without any cert refusal.
///
/// Here that post-absorption shape is modelled by an identity H with a zero
/// gradient; the test checks that the report records nullity 0 and does not
/// diagnose `RankDeficientHPen`.
#[test]
fn rank_deficient_hpen_canary_disappears_after_nullspace_absorption() {
    let block_widths = [4usize, 4, 4];
    let names = ["location_block", "scale_block", "marginal_slope_block"];
    let total_p: usize = block_widths.iter().sum();
    let block_count = block_widths.len();

    // Per-block fixture pieces: zero design, zero β/η, zero penalty.
    let mut specs: Vec<ParameterBlockSpec> = Vec::with_capacity(block_count);
    let mut states: Vec<ParameterBlockState> = Vec::with_capacity(block_count);
    let mut s_lambdas: Vec<Array2<f64>> = Vec::with_capacity(block_count);
    let mut ranges: Vec<(usize, usize)> = Vec::with_capacity(block_count);
    let mut cursor = 0usize;
    for (&width, name) in block_widths.iter().zip(names.iter()) {
        let span = (cursor, cursor + width);
        cursor = span.1;
        ranges.push(span);
        specs.push(ParameterBlockSpec {
            name: (*name).to_string(),
            design: DesignMatrix::from(Array2::<f64>::zeros((1, width))),
            offset: Array1::zeros(1),
            penalties: vec![],
            nullspace_dims: vec![],
            initial_log_lambdas: Array1::zeros(0),
            initial_beta: None,
            gauge_priority: 100,
            jacobian_callback: None,
            stacked_design: None,
            stacked_offset: None,
        });
        states.push(ParameterBlockState {
            beta: Array1::zeros(width),
            eta: Array1::zeros(1),
        });
        s_lambdas.push(Array2::<f64>::zeros((width, width)));
    }

    // Full-rank H across all three blocks: the post-absorption shape, where
    // the polynomial null space has left the smooth and the remaining basis
    // is fully identified by the likelihood Hessian.
    let source = JointHessianSource::Dense(Array2::<f64>::eye(total_p));
    let joint_grad = Array1::<f64>::zeros(total_p);

    let cached_active_sets: Vec<Option<Vec<usize>>> = vec![None; block_count];
    let block_constraints: Vec<Option<LinearInequalityConstraints>> = vec![None; block_count];
    let math = JointNewtonMathDiagnostic {
        old_kkt_inf: 0.0,
        linearized_next_kkt_inf: 0.0,
        predicted_reduction: 0.0,
        actual_reduction: 0.0,
        trust_ratio: 1.0,
        step_inf: 0.0,
        proposal_inf: 0.0,
    };
    let report = compute_kkt_refusal_report(
        0,
        &states,
        &specs,
        &s_lambdas,
        &ranges,
        Some(&joint_grad),
        &cached_active_sets,
        &block_constraints,
        Some(&source),
        total_p,
        0.0,
        RidgePolicy::explicit_stabilization_full(),
        0.0,
        0.0,
        1.0,
        1.0e-6,
        1.0e-6,
        1.0e-6,
        0.0,
        0.0,
        Some(&math),
    );

    assert_eq!(
        report.hpen_nullity_at_rank_tol, 0,
        "post-absorption: full-rank H_pen must register nullity 0",
    );
    assert_ne!(
        report.diagnosis,
        KktRefusalDiagnosis::RankDeficientHPen,
        "post-absorption: the rank-deficiency diagnosis must no longer fire",
    );
}
/// Checks the structural effective-df computation against the exact trace
/// identity
///
/// ```text
/// Σ_j γ_j/(γ_j + λ) = tr{ G (G + λ S)⁻¹ }
/// ```
///
/// for a Gram/penalty pair that does NOT commute. The historical
/// Rayleigh-quotient implementation (diagonal of B only) is wrong there.
/// With `S = diag(1, 4)` and `G = [[1, 0.8], [0.8, 1]]` the true generalized
/// eigenvalues are eig(D^{-1/2} Uᵀ G U D^{-1/2}) ≈ [0.0767072, 1.1732928],
/// while the Rayleigh quotients are [1, 0.25]. Only the former satisfy the
/// trace identity; at λ = 1 the two give ≈0.6111 and the buggy 0.7000.
#[test]
fn structural_edf_matches_trace_identity_noncommuting_pair() {
    // Penalty S = diag(1, 4).
    let s = array![[1.0, 0.0], [0.0, 4.0]];

    // Design X = G^{1/2} (symmetric square root), so XᵀX = G exactly:
    //   G = 1.8·v1v1ᵀ + 0.2·v2v2ᵀ, v1=[1,1]/√2, v2=[1,-1]/√2.
    let root_big = 1.8_f64.sqrt();
    let root_small = 0.2_f64.sqrt();
    let x_off = 0.5 * (root_big - root_small);
    let x_diag = 0.5 * (root_big + root_small);
    let design = DesignMatrix::from(array![[x_diag, x_off], [x_off, x_diag]]);
    let penalty = PenaltyMatrix::Dense(s.clone());

    let gammas = design_penalty_range_gammas(&design, &penalty)
        .expect("2x2 full-rank p×p pair must yield generalized eigenvalues");
    assert_eq!(gammas.len(), 2, "range(S) is full rank ⇒ two γ_j");

    // Independent reference: tr(G (G+λS)⁻¹) from the closed-form 2×2 inverse
    // of M = G + λS (determinant and adjugate), without using the helper.
    let g = array![[1.0, 0.8], [0.8, 1.0]];
    let trace_g_minv = |lam: f64| -> f64 {
        let m = [
            [g[(0, 0)] + lam * s[(0, 0)], g[(0, 1)] + lam * s[(0, 1)]],
            [g[(1, 0)] + lam * s[(1, 0)], g[(1, 1)] + lam * s[(1, 1)]],
        ];
        let det = m[0][0] * m[1][1] - m[0][1] * m[1][0];
        // M⁻¹ = (1/det) [[m11, -m01], [-m10, m00]], hence
        // tr(G M⁻¹) = (1/det) · [ G00·m11 - G01·m10 - G10·m01 + G11·m00 ].
        let numerator = g[(0, 0)] * m[1][1] - g[(0, 1)] * m[1][0] - g[(1, 0)] * m[0][1]
            + g[(1, 1)] * m[0][0];
        numerator / det
    };

    for lambda in [1.0_f64, 0.3] {
        let edf = unit_weight_term_edf(&gammas, lambda.ln());
        let trace = trace_g_minv(lambda);
        assert!(
            (edf - trace).abs() < 1e-9,
            "structural edf {edf} must equal tr(G(G+λS)⁻¹) {trace} at λ={lambda}",
        );
    }

    // Regression guard against the diagonal-only computation: the Rayleigh
    // quotients [1, 0.25] would produce 0.7 at λ=1 (ρ=0), whereas the trace
    // identity gives ≈0.6111.
    let edf_at_one = unit_weight_term_edf(&gammas, 0.0_f64);
    assert!(
        (edf_at_one - 0.611111_f64).abs() < 1e-5,
        "edf at λ=1 must be ≈0.6111 (true), not 0.7000 (Rayleigh-quotient bug): got {edf_at_one}",
    );
}
}