// gam 0.3.109 - Docs.rs

use crate::basis::{BasisOptions, PenaltyInfo, PenaltySource};
use crate::custom_family::{
    AdditiveBlockJacobian, BatchedOuterGradientTerms, BlockEffectiveJacobian, BlockWorkingSet,
    BlockwiseFitOptions, CustomFamily, CustomFamilyBlockPsiDerivative,
    CustomFamilyJointDesignChannel, CustomFamilyJointDesignPairContribution,
    CustomFamilyJointPsiOperator, CustomFamilyPsiDesignAction, CustomFamilyPsiLinearMapRef,
    CustomFamilyPsiSecondDesignAction, CustomFamilyWarmStart, ExactNewtonJointGradientEvaluation,
    ExactNewtonJointHessianWorkspace, ExactNewtonJointPsiDirectCache,
    ExactNewtonJointPsiSecondOrderTerms, ExactNewtonJointPsiWorkspace, FamilyChannelHessian,
    FamilyEvaluation, ParameterBlockSpec, ParameterBlockState, PenaltyMatrix, PsiDesignMap,
    evaluate_custom_family_joint_hyper, evaluate_custom_family_joint_hyper_efs, fit_custom_family,
    fit_custom_family_fixed_log_lambdas, resolve_custom_family_x_psi_map,
    resolve_custom_family_x_psi_psi_map, second_psi_linear_map, shared_dense_arc,
    weighted_crossprod_psi_maps,
};
use crate::estimate::UnifiedFitResult;
use crate::faer_ndarray::{fast_ab, fast_atv, fast_av, fast_joint_hessian_2x2};
use crate::families::location_scale_engine::build_location_scale_exact_joint_setup;
use crate::families::parameter_block::ParameterBlockInput;
use crate::families::scale_design::{
    build_scale_deviation_operator, build_scale_deviation_transform_design,
};
use crate::families::sigma_link::{
    LOGB_SIGMA_FLOOR, SigmaJet1, exp_sigma_derivs_up_to_fourth_scalar,
    exp_sigma_derivs_up_to_third, exp_sigma_from_eta_scalar, exp_sigma_jet1_scalar,
    logb_sigma_from_eta_scalar, logb_sigma_jet1_scalar, safe_exp,
};
use crate::families::spatial_psi_bridge::build_block_spatial_psi_derivatives;
// The monotone-wiggle helpers live in the neutral `families::wiggle` module
// (decoupling refactor); this block imports only the ones gamlss's own non-test
// code uses. Symbols used solely by this module's `#[cfg(test)]` block
// (e.g. `monotone_wiggle_internal_degree`, `split_wiggle_penalty_orders`) are
// imported inside that block instead, so they are not flagged unused in a
// non-test `--lib` build; downstream consumers import from `families::wiggle`
// directly.
// NOTE(review): `initializewiggle_knots_from_seed` used to be listed here as a
// test-only symbol, yet it is imported below — confirm whether non-test code
// uses it, and drop it from whichever side is stale.
use crate::families::wiggle::{
    SelectedWiggleBasis, WiggleBlockConfig, buildwiggle_block_input_from_knots,
    initializewiggle_knots_from_seed, monotone_wiggle_basis_with_derivative_order,
    monotone_wiggle_nonnegative_constraints, select_wiggle_basis_from_seed,
    validate_monotone_wiggle_beta_nonnegative,
};
use crate::generative::{CustomFamilyGenerative, GenerativeSpec, NoiseModel};
use crate::matrix::SymmetricMatrix;
use crate::matrix::{DenseDesignMatrix, DenseDesignOperator, DesignMatrix};
use crate::mixture_link::{inverse_link_jet_for_inverse_link, inverse_link_mu_d1_for_inverse_link};
use crate::pirls::LinearInequalityConstraints;
use crate::probability::{normal_logcdf, normal_logsf, standard_normal_quantile};
use crate::smooth::{
    BlockwisePenalty, ExactJointHyperSetup, PenaltyBlockInfo,
    SpatialLengthScaleOptimizationOptions, SpatialLogKappaCoords, TermCollectionDesign,
    TermCollectionSpec, build_term_collection_design, freeze_term_collection_from_design,
    optimize_spatial_length_scale_exact_joint, spatial_dims_per_term,
    spatial_length_scale_term_indices,
};
use crate::solver::estimate::validate_all_finite_estimation;
use crate::types::{InverseLink, RidgePolicy, StandardLink};
use ndarray::{Array1, Array2, ArrayView1, ArrayView2, Axis, s};
use rayon::prelude::*;
use std::borrow::Cow;
use std::collections::{HashMap, hash_map::DefaultHasher};
use std::hash::{Hash, Hasher};
use std::sync::atomic::AtomicUsize;
use std::sync::{Arc, Mutex};

mod binomial_q_derivs;
use binomial_q_derivs::{
    binomial_neglog_q_derivatives_dispatch, binomial_neglog_q_fourth_derivative_dispatch,
};

mod validation;
use validation::{
    minimum_monotone_wiggle_knot_count, validate_binomial_location_scale_termspec,
    validate_binomial_location_scalewiggle_termspec, validate_binomial_response,
    validate_blockrows, validate_gaussian_location_scale_termspec,
    validate_gaussian_location_scalewiggle_termspec, validate_len_match, validate_term_weights,
    validateweights,
};

mod weighted_design_products;
use weighted_design_products::{
    mirror_upper_to_lower, scaled_outer_add, signedwith_floor, xt_diag_x_dense, xt_diag_x_design,
    xt_diag_y_dense, xt_diag_y_design,
};

/// Categorised failures raised by the helpers and family implementations in
/// this module. Every variant carries a free-form `reason`; `Display` emits
/// that text unchanged, and `From<GamlssError> for String` yields the same
/// text, so call sites that still return `Result<_, String>` keep their
/// user-visible messages byte-for-byte.
#[derive(Debug)]
pub enum GamlssError {
    /// Shape, length, row, or column mismatches between matrices,
    /// vectors, specs, or block configurations.
    DimensionMismatch { reason: String },
    /// Generic input validation that doesn't fit a more specific
    /// variant (e.g. positivity-of-response checks, shape parameter
    /// must be finite > 0).
    InvalidInput { reason: String },
    /// Non-finite values discovered in inputs, coefficients, seeds,
    /// or intermediate quantities required to remain finite.
    NonFinite { reason: String },
    /// A model configuration or feature combination is not supported
    /// by the requested family / link / engine (e.g. identity link on
    /// a binomial mean-wiggle family, unexpected design-map variant).
    UnsupportedConfiguration { reason: String },
    /// Bound, range, monotonicity, or sign constraints violated by
    /// supplied parameters or coefficients.
    ConstraintViolation { reason: String },
    /// Numerical failures during inner solves, integration, or
    /// optimization (invalid probabilities, non-finite log-likelihood,
    /// invalid λ, divergence).
    NumericalFailure { reason: String },
}

impl std::fmt::Display for GamlssError {
    /// Writes the carried `reason` verbatim, without any variant prefix.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Every variant has the same payload shape, so one irrefutable
        // or-pattern extracts it; adding a variant breaks this at compile time.
        let (GamlssError::DimensionMismatch { reason }
        | GamlssError::InvalidInput { reason }
        | GamlssError::NonFinite { reason }
        | GamlssError::UnsupportedConfiguration { reason }
        | GamlssError::ConstraintViolation { reason }
        | GamlssError::NumericalFailure { reason }) = self;
        f.write_str(reason)
    }
}

impl std::error::Error for GamlssError {}

impl From<GamlssError> for String {
    /// Converts the error into its message text (identical to `Display`).
    fn from(err: GamlssError) -> Self {
        // Move the owned reason out instead of re-formatting it.
        match err {
            GamlssError::DimensionMismatch { reason }
            | GamlssError::InvalidInput { reason }
            | GamlssError::NonFinite { reason }
            | GamlssError::UnsupportedConfiguration { reason }
            | GamlssError::ConstraintViolation { reason }
            | GamlssError::NumericalFailure { reason } => reason,
        }
    }
}

/// Numerical floor on μ ∈ (0, 1) used only for downstream `1/μ` and
/// `1/(1-μ)` divisions and for `μ.ln()` / `(1-μ).ln()` in the generic
/// composed-link binomial log-likelihood (where the logit-stable
/// `log_expit` form is unavailable because `q` is the composed link
/// argument, not the raw logit η). Pure numerical safety, NOT a model
/// assumption — when the optimizer pushes μ to the floor it indicates a
/// separated/saturated fit which is detected and surfaced upstream
/// (`detect_logit_instability`, `Unstable` PIRLS status). For
/// composed-link μ, derivatives `dμ/dq` etc. are NOT zeroed when the
/// floor is hit; they carry the legitimate gradient signal of the
/// outer link and zeroing them would create a phantom flat region that
/// the optimizer would converge to as a stationary point.
const MIN_PROB: f64 = 1e-10;
/// NOTE(review): no use of this constant is visible in this part of the file;
/// presumably a lower bound on derivative magnitudes — confirm against its
/// call sites before relying on that reading.
const MIN_DERIV: f64 = 1e-8;
/// Lower clamp on POSITIVE working weights `w_i = (dμ/dη)² / V(μ_i)`
/// to keep `Xᵀ W X` numerically representable. Strictly numerical:
/// `w` enters subsequent dense matrix products and a true zero (which
/// happens when `dμ/dη = 0` at saturation, e.g. logistic μ → 0 with
/// `dμ/dη = μ(1-μ)`) is harmless but a denormal `w` propagates as
/// inf/NaN through `XᵀWX` because `w * (x_i x_j)` underflows
/// non-uniformly. `floor_positiveweight` returns 0 for non-finite or
/// non-positive inputs (so saturation correctly drops the row from
/// the inner Newton system); the floor only fires for *strictly
/// positive* tiny weights. The 1e-12 magnitude is chosen so that
/// `1e-12 · max|x|² · n` stays comfortably above `f64::MIN_POSITIVE`
/// at biobank scale.
///
/// This is the canonical PIRLS positive-weight floor (`1e-12`); the value is
/// owned by [`crate::solver::pirls::MIN_WEIGHT`] so every floored family shares
/// one definition rather than re-declaring it per module.
use crate::solver::pirls::MIN_WEIGHT;
/// Symmetric hard bound on η shared by the Poisson / Gaussian / Gamma
/// working-model log-likelihood loops so that `exp(η)` and `log(σ)` stay
/// finite under the IRLS step. Kept as a single constant so all three
/// families operate in the same numerical regime.
const ETA_HARD_CLAMP: f64 = 30.0;

/// Saturated `exp(η)` for log-link mean reconstruction.
///
/// η is first clamped into `[−ETA_HARD_CLAMP, ETA_HARD_CLAMP]` so the
/// exponential cannot overflow, and the result is then floored at
/// `MIN_WEIGHT` so later divisions never see an exact zero. Because the floor
/// is applied with `f64::max`, a NaN η also comes back as `MIN_WEIGHT`.
/// Keeping the formula in one place means a tolerance change reaches every
/// family (Poisson / Gaussian / Gamma) at once.
#[inline]
fn saturated_exp_eta(eta: f64) -> f64 {
    let bounded_eta = eta.clamp(-ETA_HARD_CLAMP, ETA_HARD_CLAMP);
    f64::max(bounded_eta.exp(), MIN_WEIGHT)
}

/// Floor applied to a fitted smoothing parameter λ before `ln(λ)` is taken to
/// seed an outer-loop `initial_log_lambdas` warm start. A pilot fit can return
/// λ underflowed to exactly 0 for a deselected (effectively unpenalized) term;
/// `ln(0) = -inf` would poison the seed, so we floor at the smallest λ that is
/// still numerically distinguishable from zero in the log-domain rather than a
/// modelling-meaningful value. `ln(1e-12) ≈ -27.6` sits well below any λ the
/// outer optimizer would select, so a genuinely tiny pilot λ still seeds the
/// search near its lower edge.
///
/// Despite the `LOG_LAMBDA` in the name, the value is expressed in λ units
/// (it is the argument of `ln`, not its result).
const WARMSTART_LOG_LAMBDA_FLOOR: f64 = 1e-12;

/// Largest single block (in bytes) that `dense_blocks_planned_budget` will
/// plan to materialize densely.
const EXACT_DENSE_BLOCK_BUDGET_BYTES: usize = 512 * 1024 * 1024;
/// Cap on the summed size (in bytes) of all blocks planned for dense
/// materialization by `dense_blocks_planned_budget`.
const EXACT_DENSE_TOTAL_BUDGET_BYTES: usize = 2 * 1024 * 1024 * 1024;
/// Row count at or above which the row-wise map helpers switch to rayon.
const GAMLSS_ROWWISE_PAR_MIN_N: usize = 4096;
/// Target working-set size (in bytes) for one projected-trace row chunk.
const GAMLSS_PROJECTED_TRACE_TARGET_BYTES: usize = 32 * 1024 * 1024;
/// Smallest row chunk the projected-trace sizing will return.
const GAMLSS_PROJECTED_TRACE_MIN_CHUNK_ROWS: usize = 64;
/// Largest row chunk the projected-trace sizing will return.
const GAMLSS_PROJECTED_TRACE_MAX_CHUNK_ROWS: usize = 8192;

/// Pick a row-chunk length for projected-trace work so one chunk holds about
/// `GAMLSS_PROJECTED_TRACE_TARGET_BYTES` of `f64` values.
///
/// Each row is costed at `rank * max(projected_channel_count, 1) +
/// max(gram_column_count, 1)` values. All arithmetic saturates, so absurdly
/// large inputs degrade to the minimum chunk size rather than overflowing.
/// The result always lies in
/// `[GAMLSS_PROJECTED_TRACE_MIN_CHUNK_ROWS, GAMLSS_PROJECTED_TRACE_MAX_CHUNK_ROWS]`.
fn gamlss_projected_trace_chunk_rows(
    rank: usize,
    projected_channel_count: usize,
    gram_column_count: usize,
) -> usize {
    let projected_values = rank.saturating_mul(projected_channel_count.max(1));
    let gram_values = gram_column_count.max(1);
    let values_per_row = projected_values.saturating_add(gram_values).max(1);
    let bytes_per_row = values_per_row
        .saturating_mul(std::mem::size_of::<f64>())
        .max(1);
    let unclamped_rows = GAMLSS_PROJECTED_TRACE_TARGET_BYTES / bytes_per_row;
    unclamped_rows
        .max(GAMLSS_PROJECTED_TRACE_MIN_CHUNK_ROWS)
        .min(GAMLSS_PROJECTED_TRACE_MAX_CHUNK_ROWS)
}

/// Evaluate `f(i)` for every row index `i` in `0..n` and collect the results,
/// in index order, into a vector.
///
/// Inputs shorter than `GAMLSS_ROWWISE_PAR_MIN_N` are mapped sequentially;
/// longer ones are mapped through rayon's parallel iterator, which is why `f`
/// must be `Sync`.
fn gamlss_rowwise_map<F>(n: usize, f: F) -> Array1<f64>
where
    F: Fn(usize) -> f64 + Sync,
{
    if n < GAMLSS_ROWWISE_PAR_MIN_N {
        return (0..n).map(f).collect();
    }
    let values: Vec<f64> = (0..n).into_par_iter().map(&f).collect();
    Array1::from(values)
}

/// Fallible counterpart of `gamlss_rowwise_map`: evaluates `f(i)` for every
/// `i` in `0..n` and returns the values in index order, or an `Err` produced
/// by `f`.
///
/// Below `GAMLSS_ROWWISE_PAR_MIN_N` rows the indices are visited in order and
/// evaluation stops at the first failure; at or above it the work is handed
/// to rayon's parallel iterator.
fn gamlss_rowwise_map_result<F>(n: usize, f: F) -> Result<Array1<f64>, String>
where
    F: Fn(usize) -> Result<f64, String> + Sync,
{
    let collected: Result<Vec<f64>, String> = if n >= GAMLSS_ROWWISE_PAR_MIN_N {
        (0..n).into_par_iter().map(&f).collect()
    } else {
        (0..n).map(&f).collect()
    };
    collected.map(Array1::from)
}

/// A block design held either as a dense matrix (borrowed or owned) or as a
/// `DesignMatrix` that is only read through row chunks.
enum DenseOrOperator<'a> {
    /// Dense matrix borrowed for `'a`.
    Borrowed(&'a Array2<f64>),
    /// Dense matrix owned by this value.
    Owned(Array2<f64>),
    /// Design kept as a `DesignMatrix`; rows are fetched via `try_row_chunk`.
    Operator(DesignMatrix),
}

impl DenseOrOperator<'_> {
    /// Number of rows of the underlying design.
    fn nrows(&self) -> usize {
        match self {
            Self::Operator(op) => op.nrows(),
            Self::Owned(matrix) => matrix.nrows(),
            Self::Borrowed(matrix) => matrix.nrows(),
        }
    }

    /// Number of columns of the underlying design.
    fn ncols(&self) -> usize {
        match self {
            Self::Operator(op) => op.ncols(),
            Self::Owned(matrix) => matrix.ncols(),
            Self::Borrowed(matrix) => matrix.ncols(),
        }
    }

    /// Return an owned dense copy of the requested row range.
    ///
    /// Dense variants copy the slice; the operator variant delegates to
    /// `try_row_chunk` and stringifies any error it reports.
    fn row_chunk(&self, rows: std::ops::Range<usize>) -> Result<Array2<f64>, String> {
        let dense: &Array2<f64> = match self {
            Self::Operator(op) => {
                return op.try_row_chunk(rows).map_err(|err| err.to_string());
            }
            Self::Owned(matrix) => matrix,
            Self::Borrowed(matrix) => *matrix,
        };
        Ok(dense.slice(s![rows, ..]).to_owned())
    }

    /// Compute the matrix–vector product of the design with `beta`.
    ///
    /// # Panics
    ///
    /// Panics when `beta.len()` differs from `ncols()`, or, for the operator
    /// variant, when a row chunk cannot be materialized.
    fn dot(&self, beta: ArrayView1<'_, f64>) -> Array1<f64> {
        let n = self.nrows();
        let p = self.ncols();
        assert_eq!(beta.len(), p);
        let op = match self {
            Self::Borrowed(matrix) => return fast_av(*matrix, &beta),
            Self::Owned(matrix) => return fast_av(matrix, &beta),
            Self::Operator(op) => op,
        };
        // Operator path: stream the design in row chunks and fill the output
        // slice that corresponds to each chunk.
        let mut product = Array1::<f64>::zeros(n);
        for span in exact_design_row_chunks(n, p) {
            let block = op
                .try_row_chunk(span.clone())
                .expect("gamlss DesignSlot::dot: design row chunk materialization failed");
            product.slice_mut(s![span]).assign(&fast_av(&block, &beta));
        }
        product
    }
}

/// Produce a dense view of one `ParameterBlockSpec` design.
///
/// When the spec already holds a dense array it is returned as
/// `Cow::Borrowed`. Otherwise the design is materialized through
/// `material_policy` and an owned copy is returned as `Cow::Owned`.
///
/// # Errors
///
/// A materialization failure is returned as
/// `"{materialization_label}: {error}"`, so callers can tell which block
/// failed.
fn dense_block_from_spec<'a>(
    spec: &'a ParameterBlockSpec,
    material_policy: &crate::resource::MaterializationPolicy,
    materialization_label: &str,
) -> Result<Cow<'a, Array2<f64>>, String> {
    if let Some(already_dense) = spec.design.as_dense_ref() {
        return Ok(Cow::Borrowed(already_dense));
    }
    let materialized = spec
        .design
        .try_to_dense_with_policy(material_policy, "gamlss dense_block_from_spec")
        .map_err(|e| format!("{materialization_label}: {e}"))?;
    Ok(Cow::Owned(materialized.as_ref().clone()))
}

/// Resolve the dense (primary, log-σ) design pair from a slice of block specs.
///
/// The primary block is the family-specific "mean" axis (μ for Gaussian,
/// latent t for Binomial). `family_name` appears in the spec-count error;
/// `short_family_name` and `primary_label` ("mu" / "threshold") are woven into
/// the per-block materialization labels used for diagnostics.
///
/// # Errors
///
/// Returns a dimension-mismatch message when `specs.len()` differs from
/// `expected_count`, or the labelled materialization error of either block.
fn dense_locscale_block_designs_fromspecs<'a>(
    specs: &'a [ParameterBlockSpec],
    expected_count: usize,
    family_name: &str,
    short_family_name: &str,
    primary_block_idx: usize,
    log_sigma_block_idx: usize,
    primary_label: &str,
    material_policy: &crate::resource::MaterializationPolicy,
) -> Result<(Cow<'a, Array2<f64>>, Cow<'a, Array2<f64>>), String> {
    let spec_count = specs.len();
    if spec_count != expected_count {
        return Err(String::from(GamlssError::DimensionMismatch {
            reason: format!("{family_name} expects {expected_count} specs, got {spec_count}"),
        }));
    }

    // Primary axis first, then log-σ, so the first failing block is reported.
    let primary_tag =
        format!("{short_family_name} dense_block_designs_fromspecs {primary_label}");
    let primary = dense_block_from_spec(&specs[primary_block_idx], material_policy, &primary_tag)?;

    let log_sigma_tag = format!("{short_family_name} dense_block_designs_fromspecs log_sigma");
    let log_sigma =
        dense_block_from_spec(&specs[log_sigma_block_idx], material_policy, &log_sigma_tag)?;

    Ok((primary, log_sigma))
}

/// Turn a location-scale family's two cached block designs (`primary` =
/// mu/threshold, plus `log_sigma`) into dense matrices.
///
/// A design that is already dense is borrowed; any other design is
/// materialized through `material_policy` and returned as an owned copy. The
/// accessed designs and the diagnostic labels are the only per-family
/// differences, so both are passed in.
///
/// # Errors
///
/// Reports a missing primary design first, then a missing log-σ design, then
/// any materialization failure (primary before log-σ), each prefixed with the
/// supplied family labels.
fn dense_locscale_block_designs_cached<'a>(
    primary_design: Option<&'a DesignMatrix>,
    log_sigma_design: Option<&'a DesignMatrix>,
    family_name: &str,
    short_family_name: &str,
    primary_label: &str,
    material_policy: &crate::resource::MaterializationPolicy,
) -> Result<(Cow<'a, Array2<f64>>, Cow<'a, Array2<f64>>), String> {
    /// Borrow `design` when dense, otherwise materialize an owned dense copy.
    fn borrow_or_materialize<'d>(
        design: &'d DesignMatrix,
        material_policy: &crate::resource::MaterializationPolicy,
        short_family_name: &str,
        axis_label: &str,
    ) -> Result<Cow<'d, Array2<f64>>, String> {
        if let Some(dense) = design.as_dense_ref() {
            return Ok(Cow::Borrowed(dense));
        }
        let materialized = design
            .try_to_dense_with_policy(material_policy, "gamlss dense_locscale_block_designs")
            .map_err(|e| format!("{short_family_name} dense_block_designs {axis_label}: {e}"))?;
        Ok(Cow::Owned(materialized.as_ref().clone()))
    }

    let Some(primary_design) = primary_design else {
        return Err(format!(
            "{family_name} exact path is missing {primary_label} design"
        ));
    };
    let Some(log_sigma_design) = log_sigma_design else {
        return Err(format!(
            "{family_name} exact path is missing log-sigma design"
        ));
    };

    let primary = borrow_or_materialize(
        primary_design,
        material_policy,
        short_family_name,
        primary_label,
    )?;
    let log_sigma = borrow_or_materialize(
        log_sigma_design,
        material_policy,
        short_family_name,
        "log_sigma",
    )?;
    Ok((primary, log_sigma))
}

/// One resolved ψ-direction for a two-axis (primary + log-σ) location-scale
/// family. Holds the neutral pieces shared by every such family's
/// `exact_newton_joint_psi_direction`; each family wraps these into its own
/// named struct (mu/threshold field renames only).
struct LocScalePsiDirectionParts {
    /// Index of the derivative block list that contains the selected ψ.
    block_idx: usize,
    /// Position of the selected ψ inside that block's derivative list.
    local_idx: usize,
    /// ψ-design map of the primary block; `PsiDesignMap::Zero` when the
    /// selected ψ belongs to the log-σ block.
    primary_psi: PsiDesignMap,
    /// ψ-design map of the log-σ block; `PsiDesignMap::Zero` when the
    /// selected ψ belongs to the primary block.
    log_sigma_psi: PsiDesignMap,
    /// `primary_psi` applied to the primary β (all zeros off-axis).
    primary_z: Array1<f64>,
    /// `log_sigma_psi` applied to the log-σ β (all zeros off-axis).
    log_sigma_z: Array1<f64>,
}

/// Common implementation behind every two-axis location-scale family's
/// `exact_newton_joint_psi_direction`.
///
/// The per-block ψ-derivative lists are treated as one flat sequence and the
/// entry at position `psi_index` is selected. If it belongs to the primary
/// block, its ψ-design map is resolved and applied to the primary β via
/// `forward_mul`, while the log-σ side gets a `PsiDesignMap::Zero` map and a
/// zero vector; the log-σ block is handled symmetrically. Any other block
/// (e.g. a wiggle block), or a `psi_index` past the end of the flat list,
/// yields `Ok(None)`.
///
/// Only the column counts, block indices, expected block count and diagnostic
/// labels vary per family; they are passed in.
///
/// # Errors
///
/// Returns a dimension-mismatch message when `block_states` or
/// `derivative_blocks` does not have `expected_blocks` entries, and forwards
/// failures from ψ-map resolution or `forward_mul` (the latter prefixed with
/// the family and axis label).
#[allow(clippy::too_many_arguments)]
fn locscale_joint_psi_direction_parts(
    block_states: &[ParameterBlockState],
    derivative_blocks: &[Vec<crate::custom_family::CustomFamilyBlockPsiDerivative>],
    psi_index: usize,
    n: usize,
    p_primary: usize,
    p_log_sigma: usize,
    primary_block_idx: usize,
    log_sigma_block_idx: usize,
    expected_blocks: usize,
    family_name: &str,
    primary_label: &str,
    policy: &crate::resource::ResourcePolicy,
) -> Result<Option<LocScalePsiDirectionParts>, String> {
    let state_count = block_states.len();
    let deriv_list_count = derivative_blocks.len();
    if state_count != expected_blocks || deriv_list_count != expected_blocks {
        return Err(String::from(GamlssError::DimensionMismatch {
            reason: format!(
                "{family_name} joint psi direction expects {expected_blocks} blocks and {expected_blocks} derivative block lists, got {state_count} and {deriv_list_count}"
            ),
        }));
    }
    let beta_primary = &block_states[primary_block_idx].beta;
    let beta_log_sigma = &block_states[log_sigma_block_idx].beta;

    // Flatten (block, local) positions in block order and pick the requested one.
    let selected = derivative_blocks
        .iter()
        .enumerate()
        .flat_map(|(block_idx, derivs)| {
            derivs
                .iter()
                .enumerate()
                .map(move |(local_idx, deriv)| (block_idx, local_idx, deriv))
        })
        .nth(psi_index);
    let Some((block_idx, local_idx, deriv)) = selected else {
        return Ok(None);
    };

    let (primary_psi, primary_z, log_sigma_psi, log_sigma_z) = if block_idx == primary_block_idx {
        let moving_map = resolve_custom_family_x_psi_map(
            deriv,
            n,
            p_primary,
            0..n,
            &format!("{family_name} {primary_label}"),
            policy,
        )?;
        let moving_z = moving_map
            .forward_mul(beta_primary.view())
            .map_err(|e| format!("{family_name} {primary_label} forward_mul: {e}"))?;
        let idle_map = PsiDesignMap::Zero {
            nrows: n,
            ncols: p_log_sigma,
        };
        (moving_map, moving_z, idle_map, Array1::<f64>::zeros(n))
    } else if block_idx == log_sigma_block_idx {
        let moving_map = resolve_custom_family_x_psi_map(
            deriv,
            n,
            p_log_sigma,
            0..n,
            &format!("{family_name} log-sigma"),
            policy,
        )?;
        let moving_z = moving_map
            .forward_mul(beta_log_sigma.view())
            .map_err(|e| format!("{family_name} log-sigma forward_mul: {e}"))?;
        let idle_map = PsiDesignMap::Zero {
            nrows: n,
            ncols: p_primary,
        };
        (idle_map, Array1::<f64>::zeros(n), moving_map, moving_z)
    } else {
        // ψ lives in a block that has no primary/log-σ design drift.
        return Ok(None);
    };

    Ok(Some(LocScalePsiDirectionParts {
        block_idx,
        local_idx,
        primary_psi,
        log_sigma_psi,
        primary_z,
        log_sigma_z,
    }))
}

/// Shared second-derivative design drift assembly for two-axis location-scale
/// joint-ψ paths. The family-specific methods differ only by block constants,
/// labels, and field names; the ψψ map lookup and `X_{ab} β` action are the
/// same for Gaussian/Binomial and wiggle/non-wiggle variants.
struct LocScalePsiDriftConfig<'a> {
    /// Row count handed to the ψψ map resolution and the second-ψ linear maps.
    n: usize,
    /// Column count used for the primary block's maps.
    p_primary: usize,
    /// Column count used for the log-σ block's maps.
    p_log_sigma: usize,
    /// Index of the primary block in `block_states` / `derivative_blocks`.
    primary_block_idx: usize,
    /// Index of the log-σ block in `block_states` / `derivative_blocks`.
    log_sigma_block_idx: usize,
    /// Family name used as the prefix of diagnostic labels.
    family_name: &'a str,
    /// Primary-axis name appended to `family_name` in primary-block labels.
    primary_label: &'a str,
    /// Resource policy forwarded to the ψψ map resolution.
    policy: &'a crate::resource::ResourcePolicy,
}

/// Assemble the second-order ψ design drifts for a pair of ψ directions of a
/// two-axis location-scale family.
///
/// When `psi_a` and `psi_b` move the same block, and that block is the primary
/// or the log-σ block, the ψψ design map is resolved into an action and/or a
/// dense matrix for that axis. Every other combination leaves both slots
/// empty. The per-axis drift vectors are then obtained by applying the
/// second-ψ linear map built from those slots to the corresponding block β.
///
/// # Errors
///
/// Forwards any failure from `psi_psi_map_to_drift_slots`.
fn locscale_joint_psisecond_design_drifts(
    block_states: &[ParameterBlockState],
    derivative_blocks: &[Vec<crate::custom_family::CustomFamilyBlockPsiDerivative>],
    psi_a: &LocationScaleJointPsiDirection,
    psi_b: &LocationScaleJointPsiDirection,
    cfg: LocScalePsiDriftConfig<'_>,
) -> Result<LocationScaleJointPsiSecondDrifts, String> {
    let beta_primary = &block_states[cfg.primary_block_idx].beta;
    let beta_log_sigma = &block_states[cfg.log_sigma_block_idx].beta;

    // (action, dense matrix) slot pair for each axis; empty unless filled below.
    let mut primary_slots = (None, None);
    let mut log_sigma_slots = (None, None);

    // Smooth ψ second derivatives are block-local, so a cross-block ψ_a/ψ_b
    // pair contributes no design second derivative; only a shared moving
    // block can supply one through its derivative payload.
    if psi_a.block_idx == psi_b.block_idx {
        let moving_block = psi_a.block_idx;
        let deriv_a = &derivative_blocks[moving_block][psi_a.local_idx];
        let deriv_b = &derivative_blocks[psi_b.block_idx][psi_b.local_idx];
        if moving_block == cfg.primary_block_idx {
            primary_slots = psi_psi_map_to_drift_slots(
                deriv_a,
                deriv_b,
                psi_b.local_idx,
                cfg.n,
                cfg.p_primary,
                &format!("{} {}", cfg.family_name, cfg.primary_label),
                cfg.policy,
            )?;
        } else if moving_block == cfg.log_sigma_block_idx {
            log_sigma_slots = psi_psi_map_to_drift_slots(
                deriv_a,
                deriv_b,
                psi_b.local_idx,
                cfg.n,
                cfg.p_log_sigma,
                &format!("{} log-sigma", cfg.family_name),
                cfg.policy,
            )?;
        }
    }

    let (x_primary_ab_action, x_primary_ab) = primary_slots;
    let (x_ls_ab_action, x_ls_ab) = log_sigma_slots;

    let primary_map = second_psi_linear_map(
        x_primary_ab_action.as_ref(),
        x_primary_ab.as_ref(),
        cfg.n,
        cfg.p_primary,
    );
    let z_primary_ab = primary_map.forward_mul(beta_primary.view());

    let log_sigma_map = second_psi_linear_map(
        x_ls_ab_action.as_ref(),
        x_ls_ab.as_ref(),
        cfg.n,
        cfg.p_log_sigma,
    );
    let z_ls_ab = log_sigma_map.forward_mul(beta_log_sigma.view());

    Ok(LocationScaleJointPsiSecondDrifts {
        x_primary_ab_action,
        x_ls_ab_action,
        x_primary_ab,
        x_ls_ab,
        z_primary_ab,
        z_ls_ab,
    })
}

fn psi_psi_map_to_drift_slots(
    deriv: &crate::custom_family::CustomFamilyBlockPsiDerivative,
    deriv_b: &crate::custom_family::CustomFamilyBlockPsiDerivative,
    local_idx_b: usize,
    n: usize,
    p: usize,
    label: &str,
    policy: &crate::resource::ResourcePolicy,
) -> Result<
    (
        Option<crate::custom_family::CustomFamilyPsiSecondDesignAction>,
        Option<Array2<f64>>,
    ),
    String,
> {
    match resolve_custom_family_x_psi_psi_map(
        deriv,
        deriv_b,
        local_idx_b,
        n,
        p,
        0..n,
        label,
        policy,
    )? {
        crate::custom_family::PsiDesignMap::Second { action } => Ok((Some(action), None)),
        crate::custom_family::PsiDesignMap::Dense { matrix } => Ok((None, Some((*matrix).clone()))),
        crate::custom_family::PsiDesignMap::Zero { .. } => Ok((None, None)),
        crate::custom_family::PsiDesignMap::First { .. } => {
            Err(GamlssError::UnsupportedConfiguration {
                reason: format!("{label}: unexpected First variant from _psi_psi_map"),
            }
            .into())
        }
    }
}

/// Choose how to hold `design` for exact computations.
///
/// * An already-dense design is borrowed.
/// * Otherwise, if `8 * n * p` bytes (saturating) fits within `budget_bytes`
///   and materialization through the policy succeeds, an owned dense copy is
///   returned.
/// * In every other case, including a failed materialization, a clone of the
///   design is kept in operator form.
fn dense_block_or_operator<'a>(
    design: &'a DesignMatrix,
    n: usize,
    p: usize,
    budget_bytes: usize,
    policy: &crate::resource::ResourcePolicy,
) -> DenseOrOperator<'a> {
    if let Some(dense) = design.as_dense_ref() {
        return DenseOrOperator::Borrowed(dense);
    }

    let dense_bytes = 8usize.saturating_mul(n).saturating_mul(p);
    if dense_bytes > budget_bytes {
        return DenseOrOperator::Operator(design.clone());
    }

    // Best effort: a materialization failure silently falls back to the
    // operator form rather than surfacing an error.
    match design
        .try_to_dense_with_policy(&policy.material_policy(), "gamlss dense_block_or_operator")
    {
        Ok(materialized) => DenseOrOperator::Owned(materialized.as_ref().clone()),
        Err(_) => DenseOrOperator::Operator(design.clone()),
    }
}

/// Plan how many bytes each block may spend on dense materialization.
///
/// Returns one entry per block, in order. An entry is `0` when the block is
/// already dense, when its dense size (`8 * nrows * ncols`, saturating)
/// exceeds `EXACT_DENSE_BLOCK_BUDGET_BYTES`, or when admitting it would push
/// the running total past `EXACT_DENSE_TOTAL_BUDGET_BYTES`. Otherwise the
/// entry is the block's dense size. Blocks are admitted greedily in slice
/// order.
fn dense_blocks_planned_budget(blocks: &[&DesignMatrix]) -> Vec<usize> {
    let mut committed = 0usize;
    blocks
        .iter()
        .map(|design| {
            if design.as_dense_ref().is_some() {
                return 0;
            }
            let dense_bytes = 8usize
                .saturating_mul(design.nrows())
                .saturating_mul(design.ncols());
            let within_block_cap = dense_bytes <= EXACT_DENSE_BLOCK_BUDGET_BYTES;
            let within_total_cap =
                committed.saturating_add(dense_bytes) <= EXACT_DENSE_TOTAL_BUDGET_BYTES;
            if within_block_cap && within_total_cap {
                committed += dense_bytes;
                dense_bytes
            } else {
                0
            }
        })
        .collect()
}

/// Split the row range `0..n` into consecutive, non-overlapping chunks sized
/// so that one chunk of a `p`-column `f64` design occupies roughly 8 MiB.
///
/// The chunk length is `8 MiB / (max(p, 1) * 8 bytes)`, clamped to
/// `[512, 131_072]` rows and then capped at `max(n, 1)`. The last chunk may be
/// shorter. For `n == 0` the iterator is empty.
///
/// The per-row byte count and the chunk end use saturating arithmetic, so an
/// extreme `p` or `n` can neither overflow nor produce a zero divisor.
pub(super) fn exact_design_row_chunks(
    n: usize,
    p: usize,
) -> impl Iterator<Item = std::ops::Range<usize>> {
    const TARGET_BYTES: usize = 8 * 1024 * 1024;
    const MIN_ROWS: usize = 512;
    const MAX_ROWS: usize = 131_072;
    // Always >= 8, so the division below cannot divide by zero.
    let row_bytes = p.max(1).saturating_mul(std::mem::size_of::<f64>());
    let rows = (TARGET_BYTES / row_bytes)
        .clamp(MIN_ROWS, MAX_ROWS)
        // `n.max(1)` keeps the step non-zero, which `step_by` requires.
        .min(n.max(1));
    (0..n)
        .step_by(rows)
        .map(move |start| start..start.saturating_add(rows).min(n))
}

/// Compute, for every column `j` of `design`, the weighted sum of squares
/// `Σ_i weights[i] · x_ij²`.
///
/// The design is read in row chunks from `exact_design_row_chunks`. Rows whose
/// weight is exactly zero are skipped.
///
/// # Errors
///
/// Returns a dimension-mismatch message when `weights.len()` differs from the
/// design's row count, or a labelled error when a row chunk cannot be
/// materialized.
fn design_weighted_column_squares(
    design: &DesignMatrix,
    weights: &Array1<f64>,
) -> Result<Array1<f64>, String> {
    let (n, p) = (design.nrows(), design.ncols());
    let weight_count = weights.len();
    if weight_count != n {
        return Err(String::from(GamlssError::DimensionMismatch {
            reason: format!(
                "design weighted column squares dimension mismatch: weights={weight_count}, rows={n}"
            ),
        }));
    }

    let mut column_sums = Array1::<f64>::zeros(p);
    for span in exact_design_row_chunks(n, p) {
        let first_row = span.start;
        let chunk = design.try_row_chunk(span).map_err(|e| {
            format!("design weighted column squares row chunk materialization failed: {e}")
        })?;
        for (local_row, row) in chunk.outer_iter().enumerate() {
            let w = weights[first_row + local_row];
            if w == 0.0 {
                continue;
            }
            for (j, acc) in column_sums.iter_mut().enumerate() {
                let x = row[j];
                *acc += w * x * x;
            }
        }
    }
    Ok(column_sums)
}

/// Floor a strictly positive, finite weight at `minweight`.
///
/// Non-finite (NaN, ±∞) and non-positive weights map to exactly `0.0`; the
/// floor only applies to weights that are finite and greater than zero.
#[inline]
fn floor_positiveweight(rawweight: f64, minweight: f64) -> f64 {
    let usable = rawweight.is_finite() && rawweight > 0.0;
    if usable {
        f64::max(rawweight, minweight)
    } else {
        0.0
    }
}

/// Evaluate `d log σ / dη = (dσ/dη) / σ`, clamped into `[0, 1]`.
///
/// * An infinite `d_sigma_deta` returns `1.0`.
/// * A finite quotient is clamped to `[0, 1]`.
/// * Any other non-finite quotient (NaN or ±∞, e.g. from `σ = 0`) returns
///   `0.0`.
#[inline]
fn logb_dlog_sigma_deta(sigma: f64, d_sigma_deta: f64) -> f64 {
    if d_sigma_deta.is_infinite() {
        return 1.0;
    }
    let ratio = d_sigma_deta / sigma;
    if !ratio.is_finite() {
        return 0.0;
    }
    ratio.clamp(0.0, 1.0)
}

/// Directional derivative of the log-σ working information
/// `2 · weight · g²`, with `g = d log σ / dη`, along a change `d_eta` in η.
///
/// Returns `4 · weight · g · g(1 − g) · d_eta`, or `0.0` whenever any of the
/// following holds:
/// * `weight` or `d_eta` is exactly zero, or `sigma` is non-finite or not
///   strictly positive;
/// * `g` is non-finite or falls outside `[0, 1)`;
/// * the raw information `2 · weight · g²` is non-finite or `<= MIN_WEIGHT`;
/// * the resulting derivative is non-finite.
#[inline]
fn gaussian_log_sigma_irlsinfo_directional_derivative(
    weight: f64,
    sigma: f64,
    d_sigma_deta: f64,
    d_eta: f64,
) -> f64 {
    let sigma_usable = sigma.is_finite() && sigma > 0.0;
    if weight == 0.0 || d_eta == 0.0 || !sigma_usable {
        return 0.0;
    }
    // Logb form mirrors gaussian_jointrow_scalars: κ = exp(η)/(b + exp(η)) ∈ [0, 1)
    // with dκ/dη = κ(1−κ). Forming dσ/dη over σ directly keeps subnormal
    // information in the η → −∞ tail instead of cancelling in `1 − b/σ`; the
    // helper returns the analytic limit 1 for the η → +∞ inf/inf case.
    let g = logb_dlog_sigma_deta(sigma, d_sigma_deta);
    let g_in_range = g.is_finite() && g >= 0.0 && g < 1.0;
    if !g_in_range {
        return 0.0;
    }
    let rawinfo = 2.0 * weight * g * g;
    let info_usable = rawinfo.is_finite() && rawinfo > MIN_WEIGHT;
    if !info_usable {
        return 0.0;
    }
    let dg_deta = g * (1.0 - g);
    let dw = 4.0 * weight * g * dg_deta * d_eta;
    if dw.is_finite() {
        dw
    } else {
        0.0
    }
}

/// Per-row scalars produced by `gaussian_diagonal_row_kernel`.
///
/// Every field is zero for a row with zero observation weight, except
/// `log_sigma_working_response`, which is then the row's log-σ linear
/// predictor.
#[derive(Clone, Copy)]
struct GaussianDiagonalRowKernel {
    /// Row log-likelihood contribution.
    log_likelihood: f64,
    /// Working weight of the location block.
    location_working_weight: f64,
    /// NOTE(review): presumably the working-response offset for the location
    /// block — its non-zero formula is not visible here; confirm in the kernel.
    location_working_shift: f64,
    /// Working weight of the log-σ block.
    log_sigma_working_weight: f64,
    /// Working response of the log-σ block.
    log_sigma_working_response: f64,
}

/// Evaluates one row of the Gaussian location-scale model under the logb
/// noise link: log-likelihood plus the diagonal working weights/responses for
/// the location and log-σ blocks.
///
/// A row with `obs_weight == 0.0` contributes nothing: all weights and the
/// log-likelihood are zero and the log-σ working response is `eta_log_sigma`.
#[inline]
fn gaussian_diagonal_row_kernel(
    y: f64,
    location_eta: f64,
    eta_log_sigma: f64,
    obs_weight: f64,
    ln2pi: f64,
) -> GaussianDiagonalRowKernel {
    if obs_weight == 0.0 {
        return GaussianDiagonalRowKernel {
            log_likelihood: 0.0,
            location_working_weight: 0.0,
            location_working_shift: 0.0,
            log_sigma_working_weight: 0.0,
            log_sigma_working_response: eta_log_sigma,
        };
    }

    // The logb link σ = b + exp(η) keeps σ ≥ b > 0 by construction, so the
    // objective ½Σ(y−μ)²/σ² + Σlog σ is bounded below for finite data and the
    // location working weight 1/σ² is bounded by 1/b². The earlier
    // (1e-12, 1e24) clamp that guarded the pure-exp link's σ→0 singularity is
    // therefore not needed here.
    // ApproxKind: Exact — working weight analytically bounded in (0, 1/b²].
    let jet = logb_sigma_jet1_scalar(eta_log_sigma);
    let (sigma, dsigma_deta) = (jet.sigma, jet.d1);
    let inv_var = (sigma * sigma).recip();
    let resid = y - location_eta;
    let scaled_sq_resid = resid * resid * inv_var;

    let location_working_weight = floor_positiveweight(obs_weight * inv_var, MIN_WEIGHT);

    // dlog σ/dη = (∂σ/∂η)/σ = exp(η)/(b + exp(η)) ∈ [0, 1). Dividing dσ/dη by
    // σ directly preserves subnormal derivative information as η→−∞ (no
    // cancellation in `1 − b/σ`); the helper returns the limit 1 for the
    // η→+∞ inf/inf case.
    // Fisher info per observation = 2·(dσ/dη)²/σ² = 2·(dlog σ/dη)², which
    // matches the pure-exp link where dlog σ/dη ≡ 1.
    let kappa = logb_dlog_sigma_deta(sigma, dsigma_deta);
    let log_sigma_working_weight =
        floor_positiveweight(2.0 * obs_weight * kappa * kappa, MIN_WEIGHT);
    let log_sigma_score = obs_weight * (scaled_sq_resid - 1.0) * kappa;
    let log_sigma_working_response = if log_sigma_working_weight != 0.0 {
        eta_log_sigma + log_sigma_score / log_sigma_working_weight
    } else {
        eta_log_sigma
    };

    let log_likelihood = obs_weight * (-0.5 * (scaled_sq_resid + ln2pi + 2.0 * sigma.ln()));

    GaussianDiagonalRowKernel {
        log_likelihood,
        location_working_weight,
        location_working_shift: resid,
        log_sigma_working_weight,
        log_sigma_working_response,
    }
}

/// Sizes of the three consecutive segments `[mean | noise | wiggle]` inside a
/// flat theta vector, with helpers to locate and copy out each segment.
#[derive(Clone, Copy)]
struct GamlssLambdaLayout {
    k_mean: usize,
    k_noise: usize,
    kwiggle: usize,
}

impl GamlssLambdaLayout {
    /// Layout with mean and noise segments only (empty wiggle segment).
    fn two_block(k_mean: usize, k_noise: usize) -> Self {
        Self::withwiggle(k_mean, k_noise, 0)
    }

    /// Layout with all three segments.
    fn withwiggle(k_mean: usize, k_noise: usize, kwiggle: usize) -> Self {
        Self {
            k_mean,
            k_noise,
            kwiggle,
        }
    }

    /// Combined length of all segments.
    fn total(self) -> usize {
        self.wiggle_end()
    }

    /// Exclusive end of the mean segment (which starts at 0).
    fn mean_end(self) -> usize {
        self.k_mean
    }

    /// Start of the noise segment; it follows the mean segment directly.
    fn noise_start(self) -> usize {
        self.mean_end()
    }

    /// Exclusive end of the noise segment.
    fn noise_end(self) -> usize {
        self.noise_start() + self.k_noise
    }

    /// Start of the wiggle segment; it follows the noise segment directly.
    fn wiggle_start(self) -> usize {
        self.noise_end()
    }

    /// Exclusive end of the wiggle segment.
    fn wiggle_end(self) -> usize {
        self.wiggle_start() + self.kwiggle
    }

    /// Fails with a dimension-mismatch error when `theta_len` is shorter than
    /// `total()`; longer vectors are accepted.
    fn validate_theta_len(self, theta_len: usize, context: &str) -> Result<(), String> {
        let needed = self.total();
        if theta_len >= needed {
            return Ok(());
        }
        Err(GamlssError::DimensionMismatch {
            reason: format!(
                "{context} theta too short: got {}, need at least {}",
                theta_len, needed
            ),
        }
        .into())
    }

    /// Owned copy of the mean segment of `theta`.
    fn mean_from(self, theta: &Array1<f64>) -> Array1<f64> {
        let hi = self.mean_end();
        theta.slice(s![..hi]).to_owned()
    }

    /// Owned copy of the noise segment of `theta`.
    fn noise_from(self, theta: &Array1<f64>) -> Array1<f64> {
        let (lo, hi) = (self.noise_start(), self.noise_end());
        theta.slice(s![lo..hi]).to_owned()
    }

    /// Owned copy of the wiggle segment of `theta`.
    fn wiggle_from(self, theta: &Array1<f64>) -> Array1<f64> {
        let (lo, hi) = (self.wiggle_start(), self.wiggle_end());
        theta.slice(s![lo..hi]).to_owned()
    }
}

/// Sizes of three consecutive coefficient segments (`pt`, `pls`, `pw`) inside
/// a flat coefficient vector.
#[derive(Clone, Copy)]
struct GamlssBetaLayout {
    pt: usize,
    pls: usize,
    pw: usize,
}

impl GamlssBetaLayout {
    /// Layout from the three segment sizes.
    fn withwiggle(pt: usize, pls: usize, pw: usize) -> Self {
        Self { pt, pls, pw }
    }

    /// Combined length of the three segments.
    fn total(self) -> usize {
        self.pt + self.pls + self.pw
    }

    /// Splits `flat` into owned `(pt, pls, pw)` segments.
    ///
    /// Fails with a dimension-mismatch error unless `flat.len()` equals
    /// `total()` exactly.
    fn split_three(
        self,
        flat: &Array1<f64>,
        context: &str,
    ) -> Result<(Array1<f64>, Array1<f64>, Array1<f64>), String> {
        let expected = self.total();
        let got = flat.len();
        if got != expected {
            return Err(GamlssError::DimensionMismatch {
                reason: format!(
                    "{context} length mismatch: got {}, expected {}",
                    got, expected
                ),
            }
            .into());
        }
        let second_start = self.pt;
        let third_start = self.pt + self.pls;
        let segment = |lo: usize, hi: usize| flat.slice(s![lo..hi]).to_owned();
        Ok((
            segment(0, second_start),
            segment(second_start, third_start),
            segment(third_start, expected),
        ))
    }
}

/// Static, compile-time descriptive metadata for a family.
#[derive(Clone, Debug)]
pub struct FamilyMetadata {
    /// Family name.
    pub name: &'static str,
    /// Names of the family's parameters.
    pub parameternames: &'static [&'static str],
    /// Links of the family's parameters.
    // NOTE(review): presumably index-aligned with `parameternames` — confirm.
    pub parameter_links: &'static [ParameterLink],
}

// Named gauge-priority levels. Their consumers are outside this part of the
// file. NOTE(review): presumably these feed `ParameterBlockSpec::gauge_priority`
// — confirm.
const DEFAULT_GAUGE_PRIORITY: u8 = 100;
// Link-wiggle level; numerically lower than the default (80 < 100).
const LINK_WIGGLE_GAUGE_PRIORITY: u8 = 80;

/// Returns the block's initial log-λ vector, or a zero vector with one entry
/// per penalty when none was supplied.
///
/// Fails with a dimension-mismatch error when a supplied vector's length
/// differs from the number of penalties.
fn initial_log_lambdas_orzeros(block: &ParameterBlockInput) -> Result<Array1<f64>, String> {
    let expected = block.penalties.len();
    let lambdas = match &block.initial_log_lambdas {
        Some(seed) => seed.clone(),
        None => Array1::<f64>::zeros(expected),
    };
    if lambdas.len() == expected {
        return Ok(lambdas);
    }
    Err(GamlssError::DimensionMismatch {
        reason: format!(
            "initial_log_lambdas length mismatch: got {}, expected {}",
            lambdas.len(),
            expected
        ),
    }
    .into())
}

/// Assembles the exact-joint hyper setup for a (mean, noise) two-block model.
///
/// The rho seed is laid out as `[mean | noise | extra]`. A caller override is
/// used verbatim only when its length matches that layout; otherwise the mean
/// and noise entries stay at zero and the tail is filled from `extra_rho0`.
fn build_two_block_exact_joint_setup(
    data: ArrayView2<'_, f64>,
    meanspec: &TermCollectionSpec,
    noisespec: &TermCollectionSpec,
    mean_penalties: usize,
    noise_penalties: usize,
    extra_rho0: &[f64],
    rho0_override: Option<&Array1<f64>>,
    kappa_options: &SpatialLengthScaleOptimizationOptions,
) -> ExactJointHyperSetup {
    // GAMLSS-specific part: build the rho seed in penalty order.
    let extra_start = mean_penalties + noise_penalties;
    let rho_dim = extra_start + extra_rho0.len();
    let mut rho0vec = Array1::<f64>::zeros(rho_dim);
    match rho0_override {
        Some(rho0) if rho0.len() == rho_dim => rho0vec.assign(rho0),
        _ => {
            let tail = rho0vec.iter_mut().skip(extra_start);
            for (slot, &rho_init) in tail.zip(extra_rho0.iter()) {
                *slot = rho_init;
            }
        }
    }

    // Generic part: per-block log(kappa) seed/bounds and exact-joint assembly,
    // with the two linear predictors (mean, noise) in theta order.
    let specs = [meanspec, noisespec];
    build_location_scale_exact_joint_setup(data, &specs, rho0vec, kappa_options)
}

/// Solves a penalized weighted projection of `target_eta − offset` onto the
/// columns of `design`.
///
/// Each penalty is scaled by `exp(log_lambdas[k])` and accumulated into one
/// penalty system, which is handed to `design.solve_systemwith_policy` with a
/// ridge of at least `1e-12`.
///
/// # Errors
/// - `offset`, `target_eta` or `weights` do not have `design.nrows()` entries;
/// - `penalties` and `log_lambdas` differ in length;
/// - a λ is non-finite or negative;
/// - a penalty is not `p × p` for `p = design.ncols()`;
/// - the design operations fail, or the solution has non-finite entries.
pub(crate) fn solve_penalizedweighted_projection(
    design: &DesignMatrix,
    offset: &Array1<f64>,
    target_eta: &Array1<f64>,
    weights: &Array1<f64>,
    penalties: &[PenaltyMatrix],
    log_lambdas: &Array1<f64>,
    ridge_floor: f64,
) -> Result<Array1<f64>, String> {
    let n = design.nrows();
    let p = design.ncols();
    let rows_agree = offset.len() == n && target_eta.len() == n && weights.len() == n;
    if !rows_agree {
        return Err(GamlssError::DimensionMismatch {
            reason: "solve_penalizedweighted_projection dimension mismatch".to_string(),
        }
        .into());
    }
    if penalties.len() != log_lambdas.len() {
        return Err(GamlssError::DimensionMismatch {
            reason: format!(
                "solve_penalizedweighted_projection lambda mismatch: penalties={}, log_lambdas={}",
                penalties.len(),
                log_lambdas.len()
            ),
        }
        .into());
    }

    let y_star = target_eta - offset;
    let xtwy = design.compute_xtwy(weights, &y_star)?;

    // No penalties means no penalty system is passed to the solver at all.
    let mut penalty_system = (!penalties.is_empty()).then(|| Array2::<f64>::zeros((p, p)));
    // Lengths were checked above, so zipping visits every (penalty, log λ) pair.
    for (k, (s, &log_lambda)) in penalties.iter().zip(log_lambdas.iter()).enumerate() {
        let lambda = log_lambda.exp();
        if !lambda.is_finite() || lambda < 0.0 {
            return Err(GamlssError::NumericalFailure { reason: format!(
                "solve_penalizedweighted_projection encountered invalid lambda at index {k}: {}",
                log_lambda
            ) }.into());
        }
        let (s_rows, s_cols) = (s.nrows(), s.ncols());
        if s_rows != p || s_cols != p {
            return Err(GamlssError::DimensionMismatch {
                reason: format!(
                    "solve_penalizedweighted_projection penalty shape mismatch at index {k}: \
                 penalty is {}x{} but design has {} columns",
                    s_rows, s_cols, p
                ),
            }
            .into());
        }
        if let Some(system) = penalty_system.as_mut() {
            s.add_scaled_to(lambda, system);
        }
    }

    let beta = design.solve_systemwith_policy(
        weights,
        &xtwy,
        penalty_system.as_ref(),
        ridge_floor.max(1e-12),
        RidgePolicy::explicit_stabilization_pospart(),
    )?;
    if !beta.iter().all(|v| v.is_finite()) {
        return Err(
            "solve_penalizedweighted_projection produced non-finite coefficients".to_string(),
        );
    }
    Ok(beta)
}

/// Deterministic warm start for the Gaussian location-scale model.
///
/// Returns `(beta_mu, beta_log_sigma, sigma_hat)`. A supplied hint is cloned
/// as-is. Otherwise the mean coefficients come from projecting `y` into the
/// mean block, and the log-σ coefficients from projecting the constant
/// `ln(sigma_hat − LOGB_SIGMA_FLOOR)` into the log-σ block.
///
/// # Errors
/// Fails when a projection fails or when the weighted residual scale cannot be
/// estimated (non-finite sums or non-positive total weight).
fn gaussian_location_scalewarm_start(
    y: &Array1<f64>,
    weights: &Array1<f64>,
    mu_block: &ParameterBlockSpec,
    log_sigma_block: &ParameterBlockSpec,
    ridge_floor: f64,
    mean_beta_hint: Option<&Array1<f64>>,
    noise_beta_hint: Option<&Array1<f64>>,
) -> Result<(Array1<f64>, Array1<f64>, f64), String> {
    let betamu = match mean_beta_hint {
        Some(hint) => hint.clone(),
        None => solve_penalizedweighted_projection(
            &mu_block.design,
            &mu_block.offset,
            y,
            weights,
            &mu_block.penalties,
            &mu_block.initial_log_lambdas,
            ridge_floor,
        )?,
    };

    let mut mu_hat = mu_block.solver_design().matrixvectormultiply(&betamu);
    mu_hat += mu_block.solver_offset();

    // Weighted residual sum of squares; negative weights count as zero.
    let mut weighted_ss = 0.0_f64;
    let mut weight_sum = 0.0_f64;
    for row in 0..y.len() {
        let w = weights[row].max(0.0);
        let r = y[row] - mu_hat[row];
        weighted_ss += w * r * r;
        weight_sum += w;
    }
    let scale_estimable = weighted_ss.is_finite() && weight_sum.is_finite() && weight_sum > 0.0;
    if !scale_estimable {
        return Err(
            "gaussian location-scale warm start could not estimate residual scale".to_string(),
        );
    }

    // σ̂ must clear the logb floor b so the inverse link η = log(σ − b) is
    // finite; a relative cushion above b keeps the warm start in the smooth
    // interior of the link domain.
    let sigma_cushion = LOGB_SIGMA_FLOOR * 1.5;
    let sigma_hat = f64::max((weighted_ss / weight_sum).sqrt(), sigma_cushion);

    let beta_log_sigma = match noise_beta_hint {
        Some(hint) => hint.clone(),
        None => {
            let eta_sigma = (sigma_hat - LOGB_SIGMA_FLOOR).ln();
            let sigma_target = Array1::from_elem(y.len(), eta_sigma);
            solve_penalizedweighted_projection(
                &log_sigma_block.design,
                &log_sigma_block.offset,
                &sigma_target,
                weights,
                &log_sigma_block.penalties,
                &log_sigma_block.initial_log_lambdas,
                ridge_floor,
            )?
        }
    };
    Ok((betamu, beta_log_sigma, sigma_hat))
}

/// Total output-channel count for every two-block location-scale family in
/// this module (mu/log_sigma or threshold/log_sigma). The wiggle variants add
/// a third, zero-channel block but still drive only these two output channels.
const LOCATION_SCALE_N_OUTPUTS: usize = 2;

/// Builds a location-scale `ParameterBlockSpec` with its
/// `AdditiveBlockJacobian` callback already installed.
///
/// This is the **only** way to build a LocationScale `ParameterBlockSpec` in
/// this module. Because the callback is attached here, a future `build_blocks`
/// impl cannot silently bypass the channel-aware identifiability audit by
/// forgetting to wire it at the tail (which would re-introduce #319).
///
/// `own_output` is the zero-based output channel this block drives (e.g. 0
/// for `mu`/`threshold`, 1 for `log_sigma`). `n_family_outputs` is fixed at
/// [`LOCATION_SCALE_N_OUTPUTS`] for every two-block family here but is
/// exposed so the helper composes cleanly with any future k-block extension.
///
/// # Errors
/// Fails when `own_output >= n_family_outputs`, or when the block's effective
/// design cannot be produced.
#[allow(clippy::too_many_arguments)]
fn build_location_scale_block(
    name: impl Into<String>,
    design: DesignMatrix,
    offset: Array1<f64>,
    penalties: Vec<PenaltyMatrix>,
    nullspace_dims: Vec<usize>,
    initial_log_lambdas: Array1<f64>,
    initial_beta: Option<Array1<f64>>,
    own_output: usize,
    n_family_outputs: usize,
    caller: &str,
) -> Result<ParameterBlockSpec, String> {
    let channel_in_range = own_output < n_family_outputs;
    if !channel_in_range {
        return Err(format!(
            "{caller}: own_output={own_output} >= n_family_outputs={n_family_outputs}"
        ));
    }

    let block_name: String = name.into();
    // The spec is created without a callback first because the effective
    // design handed to the callback is derived from the spec itself.
    let mut spec = ParameterBlockSpec {
        name: block_name,
        design,
        offset,
        penalties,
        nullspace_dims,
        initial_log_lambdas,
        initial_beta,
        // NOTE(review): same value as DEFAULT_GAUGE_PRIORITY — consider using
        // the named constant.
        gauge_priority: 100,
        jacobian_callback: None,
        stacked_design: None,
        stacked_offset: None,
    };

    let jacobian = AdditiveBlockJacobian {
        design: spec.effective_design(caller)?,
        own_output,
        n_family_outputs,
    };
    spec.jacobian_callback = Some(std::sync::Arc::new(jacobian));
    Ok(spec)
}

/// Builds the wiggle block that accompanies a two-block location-scale family.
///
/// The wiggle modulates the inverse link nonlinearly and contributes no linear
/// effective Jacobian, so the installed callback exposes an all-zero
/// `(n_rows × p_w)` design on output channel 0 under
/// `n_family_outputs = LOCATION_SCALE_N_OUTPUTS`.
#[allow(clippy::too_many_arguments)]
fn build_location_scale_wiggle_block(
    name: impl Into<String>,
    design: DesignMatrix,
    offset: Array1<f64>,
    penalties: Vec<PenaltyMatrix>,
    nullspace_dims: Vec<usize>,
    initial_log_lambdas: Array1<f64>,
    initial_beta: Option<Array1<f64>>,
    n_rows: usize,
) -> Result<ParameterBlockSpec, String> {
    // Read the width before `design` is moved into the spec.
    let zero_jacobian = AdditiveBlockJacobian {
        design: ndarray::Array2::<f64>::zeros((n_rows, design.ncols())),
        own_output: 0,
        n_family_outputs: LOCATION_SCALE_N_OUTPUTS,
    };
    Ok(ParameterBlockSpec {
        name: name.into(),
        design,
        offset,
        penalties,
        nullspace_dims,
        initial_log_lambdas,
        initial_beta,
        // NOTE(review): literal 100 (the DEFAULT_GAUGE_PRIORITY value), not
        // LINK_WIGGLE_GAUGE_PRIORITY (80) — confirm this is intentional.
        gauge_priority: 100,
        jacobian_callback: Some(std::sync::Arc::new(zero_jacobian)),
        stacked_design: None,
        stacked_offset: None,
    })
}

/// Returns the log-σ design for the Gaussian family: an unmodified clone of
/// `log_sigma_design`, after checking it has as many rows as `mu_design`.
fn prepared_gaussian_log_sigma_design(
    mu_design: &DesignMatrix,
    log_sigma_design: &DesignMatrix,
) -> Result<DesignMatrix, String> {
    let mean_rows = mu_design.nrows();
    let scale_rows = log_sigma_design.nrows();
    if mean_rows == scale_rows {
        // The Gaussian location-scale model stays identifiable even when μ and
        // log σ share a covariate basis:
        //
        //   L(μ, η) = 0.5 * Σ_i [ (y_i - μ_i)^2 exp(-2η_i) + 2η_i ],
        //   μ = X_μ β_μ,  η = X_σ β_σ.
        //
        // Shared columns are not a frame mismatch: β_μ and β_σ act through
        // different sufficient statistics (residual and residual²). Replacing
        // X_σ by (I - P_{X_μ}) X_σ would add a constraint and could erase real
        // heteroscedastic signal, so the design is passed through untouched.
        return Ok(log_sigma_design.clone());
    }
    Err(GamlssError::DimensionMismatch {
        reason: format!(
            "gaussian log-sigma design row mismatch: mean rows={}, log_sigma rows={}",
            mean_rows, scale_rows
        ),
    }
    .into())
}

/// Builds the log-σ design used by the binomial family.
///
/// A scale-deviation transform is derived from the threshold and log-σ
/// designs (with the non-intercept columns starting at the end of the log-σ
/// intercept range, capped at the design width) and then wrapped into a
/// scale-deviation operator over clones of both designs.
fn identified_binomial_log_sigma_design(
    threshold_design: &TermCollectionDesign,
    log_sigma_design: &TermCollectionDesign,
    weights: &Array1<f64>,
) -> Result<DesignMatrix, String> {
    let scale_width = log_sigma_design.design.ncols();
    let non_intercept_start = std::cmp::min(log_sigma_design.intercept_range.end, scale_width);
    let transform = build_scale_deviation_transform_design(
        &threshold_design.design,
        &log_sigma_design.design,
        weights,
        non_intercept_start,
    )?;
    let threshold_copy = threshold_design.design.clone();
    let scale_copy = log_sigma_design.design.clone();
    build_scale_deviation_operator(threshold_copy, scale_copy, &transform)
}

/// Dense `dim × dim` identity matrix, used as a full-span shrinkage penalty.
fn identity_penalty(dim: usize) -> Array2<f64> {
    Array2::<f64>::from_shape_fn((dim, dim), |(row, col)| if row == col { 1.0 } else { 0.0 })
}

/// Appends a full-width identity ("shrinkage") penalty to `design`, together
/// with its nullspace dimension and penalty-info record.
fn append_binomial_log_sigma_shrinkage_penalty_design(design: &mut TermCollectionDesign) {
    let p = design.design.ncols();

    let shrinkage = BlockwisePenalty::new(0..p, identity_penalty(p));
    design.penalties.push(shrinkage);

    // The identity penalizes the whole space, so its nullspace dimension is 0.
    design.nullspace_dims.push(0);

    // The new record's global index is the info count before it is appended.
    let global_index = design.penaltyinfo.len();
    let penalty = PenaltyInfo {
        source: PenaltySource::Other("shrinkage".to_string()),
        original_index: 0,
        active: true,
        effective_rank: p,
        dropped_reason: None,
        nullspace_dim_hint: 0,
        normalization_scale: 1.0,
        kronecker_factors: None,
    };
    design.penaltyinfo.push(PenaltyBlockInfo {
        global_index,
        termname: Some("log_sigma_shrinkage".to_string()),
        penalty,
    });
}

/// Builds the `(mu, log_sigma)` parameter-block pair for a Gaussian
/// location-scale family.
///
/// Shared by the non-wiggle and wiggle Gaussian builders so that the
/// scale-block construction lives in one place: the prepared log-σ design, the
/// REML-selected full-span identity shrinkage penalty appended to the scale
/// penalties, and the joint Gaussian warm start. Callers pass the per-block
/// log-λ vectors sliced from their own layout (two-block vs with-wiggle) and
/// append any extra blocks themselves.
///
/// Supplied beta hints are kept; only missing initial coefficients are filled
/// from the warm start.
#[allow(clippy::too_many_arguments)]
fn build_gaussian_mean_and_scale_blocks(
    y: &Array1<f64>,
    weights: &Array1<f64>,
    mean_design: &TermCollectionDesign,
    noise_design: &TermCollectionDesign,
    mean_offset: &Array1<f64>,
    noise_offset: &Array1<f64>,
    mean_log_lambdas: Array1<f64>,
    noise_log_lambdas: Array1<f64>,
    mean_beta_hint: Option<Array1<f64>>,
    noise_beta_hint: Option<Array1<f64>>,
    context: &str,
) -> Result<(ParameterBlockSpec, ParameterBlockSpec), String> {
    let mean_caller = format!("{context}: mu");
    let mut meanspec = build_location_scale_block(
        "mu",
        mean_design.design.clone(),
        mean_offset.clone(),
        mean_design.penalties_as_penalty_matrix(),
        mean_design.nullspace_dims.clone(),
        mean_log_lambdas,
        mean_beta_hint,
        0,
        LOCATION_SCALE_N_OUTPUTS,
        &mean_caller,
    )?;

    let scale_design =
        prepared_gaussian_log_sigma_design(&mean_design.design, &noise_design.design)?;

    // Scale penalties = the noise design's own penalties plus one identity
    // penalty over the full log-sigma space (hence nullspace dimension 0).
    let shrinkage = PenaltyMatrix::Dense(identity_penalty(scale_design.ncols()));
    let mut scale_penalties = noise_design.penalties_as_penalty_matrix();
    scale_penalties.push(shrinkage);
    let mut scale_nullspace_dims = noise_design.nullspace_dims.clone();
    scale_nullspace_dims.push(0);

    let scale_caller = format!("{context}: log_sigma");
    let mut noisespec = build_location_scale_block(
        "log_sigma",
        scale_design,
        noise_offset.clone(),
        scale_penalties,
        scale_nullspace_dims,
        noise_log_lambdas,
        noise_beta_hint,
        1,
        LOCATION_SCALE_N_OUTPUTS,
        &scale_caller,
    )?;

    // Both hints present: nothing to warm-start.
    if meanspec.initial_beta.is_some() && noisespec.initial_beta.is_some() {
        return Ok((meanspec, noisespec));
    }

    let (betamu0, beta_ls0, _) = gaussian_location_scalewarm_start(
        y,
        weights,
        &meanspec,
        &noisespec,
        1e-10,
        meanspec.initial_beta.as_ref(),
        noisespec.initial_beta.as_ref(),
    )?;
    // Keep any supplied hint; fall back to the warm-start value otherwise.
    meanspec.initial_beta = meanspec.initial_beta.take().or(Some(betamu0));
    noisespec.initial_beta = noisespec.initial_beta.take().or(Some(beta_ls0));
    Ok((meanspec, noisespec))
}

/// Builds the `(threshold, log_sigma)` parameter-block pair for a Binomial
/// location-scale family.
///
/// Shared by the non-wiggle and wiggle Binomial builders. It mirrors
/// [`build_gaussian_mean_and_scale_blocks`], but uses the binomial-identified
/// log-σ design and the link-aware joint warm start, with the same
/// REML-selected full-span identity shrinkage penalty on the scale block.
///
/// Supplied beta hints are kept; only missing initial coefficients are filled
/// from the warm start.
#[allow(clippy::too_many_arguments)]
fn build_binomial_threshold_and_scale_blocks(
    y: &Array1<f64>,
    weights: &Array1<f64>,
    link_kind: &InverseLink,
    mean_design: &TermCollectionDesign,
    noise_design: &TermCollectionDesign,
    mean_offset: &Array1<f64>,
    noise_offset: &Array1<f64>,
    mean_log_lambdas: Array1<f64>,
    noise_log_lambdas: Array1<f64>,
    mean_beta_hint: Option<Array1<f64>>,
    noise_beta_hint: Option<Array1<f64>>,
    context: &str,
) -> Result<(ParameterBlockSpec, ParameterBlockSpec), String> {
    let scale_design = identified_binomial_log_sigma_design(mean_design, noise_design, weights)?;

    // Scale penalties = the noise design's own penalties plus one identity
    // penalty over the full identified log-sigma space.
    let shrinkage = PenaltyMatrix::Dense(identity_penalty(scale_design.ncols()));
    let mut scale_penalties: Vec<PenaltyMatrix> = noise_design.penalties_as_penalty_matrix();
    scale_penalties.push(shrinkage);

    // NOTE(review): both blocks are built with empty `nullspace_dims`, unlike
    // the Gaussian builder — confirm this is intended.
    let threshold_caller = format!("{context}: threshold");
    let mut thresholdspec = build_location_scale_block(
        "threshold",
        mean_design.design.clone(),
        mean_offset.clone(),
        mean_design.penalties_as_penalty_matrix(),
        vec![],
        mean_log_lambdas,
        mean_beta_hint,
        0,
        LOCATION_SCALE_N_OUTPUTS,
        &threshold_caller,
    )?;
    let scale_caller = format!("{context}: log_sigma");
    let mut log_sigmaspec = build_location_scale_block(
        "log_sigma",
        scale_design,
        noise_offset.clone(),
        scale_penalties,
        vec![],
        noise_log_lambdas,
        noise_beta_hint,
        1,
        LOCATION_SCALE_N_OUTPUTS,
        &scale_caller,
    )?;

    // Both hints present: nothing to warm-start.
    if thresholdspec.initial_beta.is_some() && log_sigmaspec.initial_beta.is_some() {
        return Ok((thresholdspec, log_sigmaspec));
    }

    let (beta_t0, beta_ls0) = binomial_location_scalewarm_start(
        y,
        weights,
        link_kind,
        &thresholdspec,
        &log_sigmaspec,
        thresholdspec.initial_beta.as_ref(),
        log_sigmaspec.initial_beta.as_ref(),
    )?;
    // Keep any supplied hint; fall back to the warm-start value otherwise.
    thresholdspec.initial_beta = thresholdspec.initial_beta.take().or(Some(beta_t0));
    log_sigmaspec.initial_beta = log_sigmaspec.initial_beta.take().or(Some(beta_ls0));
    Ok((thresholdspec, log_sigmaspec))
}

/// Converts a wiggle block's `PenaltySpec`s into the `PenaltyMatrix` list the
/// location-scale wiggle block expects.
///
/// `Block` specs become `Blockwise` matrices whose `total_dim` is the wiggle
/// design width; `Dense` and `DenseWithMean` specs become `Dense` matrices
/// holding a clone of the underlying matrix. Shared by the Gaussian and
/// Binomial wiggle builders.
fn wiggle_block_penalty_matrices(wiggle_block: &ParameterBlockInput) -> Vec<PenaltyMatrix> {
    let p_wiggle = wiggle_block.design.ncols();
    let mut matrices = Vec::with_capacity(wiggle_block.penalties.len());
    for spec in wiggle_block.penalties.iter() {
        let converted = match spec {
            crate::solver::estimate::PenaltySpec::Block {
                local, col_range, ..
            } => PenaltyMatrix::Blockwise {
                local: local.clone(),
                col_range: col_range.clone(),
                total_dim: p_wiggle,
            },
            crate::solver::estimate::PenaltySpec::Dense(m)
            | crate::solver::estimate::PenaltySpec::DenseWithMean { matrix: m, .. } => {
                PenaltyMatrix::Dense(m.clone())
            }
        };
        matrices.push(converted);
    }
    matrices
}

/// Maps a probability to the linear predictor of the given link, for use as a
/// warm-start target.
///
/// The probability is first clamped to `[1e-6, 1 − 1e-6]`. Logit, probit and
/// cloglog are supported; any other link is an unsupported-configuration
/// error.
fn binomial_location_scale_link_eta_from_probability(
    link_kind: &InverseLink,
    probability: f64,
) -> Result<f64, String> {
    let target = probability.clamp(1e-6, 1.0 - 1e-6);
    let eta = match link_kind {
        InverseLink::Standard(StandardLink::Logit) => (target / (1.0 - target)).ln(),
        InverseLink::Standard(StandardLink::Probit) => {
            return standard_normal_quantile(target)
                .map_err(|err| format!("failed to invert probit warm-start probability: {err}"));
        }
        InverseLink::Standard(StandardLink::CLogLog) => (-((1.0 - target).ln())).ln(),
        other => {
            return Err(GamlssError::UnsupportedConfiguration { reason: format!(
                "binomial location-scale warm start requires logit, probit, or cloglog link, got {other:?}"
            ) }.into());
        }
    };
    Ok(eta)
}

/// Weighted mean of `y`, with each weight passed through
/// `floor_positiveweight(_, MIN_WEIGHT)`.
///
/// # Errors
/// Fails when `y` and `weights` differ in length, when any response is
/// non-finite (even on a zero-weight row), or when the total weight is not a
/// finite positive number.
fn weighted_binomial_prevalence(y: &Array1<f64>, weights: &Array1<f64>) -> Result<f64, String> {
    let (n_y, n_w) = (y.len(), weights.len());
    if n_y != n_w {
        return Err(GamlssError::DimensionMismatch { reason: format!(
            "binomial location-scale warm start dimension mismatch: y has length {}, weights have length {}",
            n_y,
            n_w
        ) }.into());
    }

    let mut total_weight = 0.0;
    let mut weighted_successes = 0.0;
    for (&yi, &wi) in y.iter().zip(weights.iter()) {
        if !yi.is_finite() {
            return Err(GamlssError::NonFinite {
                reason: format!(
                    "binomial location-scale warm start encountered non-finite response {yi}"
                ),
            }
            .into());
        }
        let w = floor_positiveweight(wi, MIN_WEIGHT);
        if w > 0.0 {
            total_weight += w;
            weighted_successes += w * yi;
        }
    }

    if total_weight.is_finite() && total_weight > 0.0 {
        Ok(weighted_successes / total_weight)
    } else {
        Err("binomial location-scale warm start requires positive total weight".to_string())
    }
}

/// Projects a constant linear predictor `eta` (one entry per design row) into
/// the block's penalized coefficient space, using the block's own offset,
/// penalties and initial log-λ values with a ridge floor of `1e-10`.
fn project_constant_eta_into_block(
    block: &ParameterBlockSpec,
    weights: &Array1<f64>,
    eta: f64,
) -> Result<Array1<f64>, String> {
    let n_rows = block.design.nrows();
    let constant_target = Array1::from_elem(n_rows, eta);
    let ridge_floor = 1e-10;
    solve_penalizedweighted_projection(
        &block.design,
        &block.offset,
        &constant_target,
        weights,
        &block.penalties,
        &block.initial_log_lambdas,
        ridge_floor,
    )
}

// Deterministic warm start for the binomial location-scale model, kept out of
// the optimizer. Each supplied hint is cloned as-is. A missing threshold start
// is the projection of the prevalence-matched link value into the threshold
// block; a missing log-sigma start is the projection of the neutral value 0
// into the log-sigma block.
fn binomial_location_scalewarm_start(
    y: &Array1<f64>,
    weights: &Array1<f64>,
    link_kind: &InverseLink,
    threshold_block: &ParameterBlockSpec,
    log_sigma_block: &ParameterBlockSpec,
    mean_beta_hint: Option<&Array1<f64>>,
    noise_beta_hint: Option<&Array1<f64>>,
) -> Result<(Array1<f64>, Array1<f64>), String> {
    // With both hints present neither branch below does any fallible work, so
    // the result is simply the pair of clones.
    let beta_threshold = if let Some(hint) = mean_beta_hint {
        hint.clone()
    } else {
        let prevalence = weighted_binomial_prevalence(y, weights)?;
        let eta = binomial_location_scale_link_eta_from_probability(link_kind, prevalence)?;
        project_constant_eta_into_block(threshold_block, weights, eta)?
    };
    let beta_log_sigma = if let Some(hint) = noise_beta_hint {
        hint.clone()
    } else {
        project_constant_eta_into_block(log_sigma_block, weights, 0.0)?
    };
    Ok((beta_threshold, beta_log_sigma))
}

/// Inputs bundled for a binomial mean-wiggle model: response, weights, link,
/// the wiggle knots/degree, and the eta and wiggle parameter-block inputs.
// NOTE(review): `wiggle_knots`/`wiggle_degree` presumably define the wiggle
// spline basis — confirm against the wiggle module.
#[derive(Clone)]
struct BinomialMeanWiggleSpec {
    pub y: Array1<f64>,
    pub weights: Array1<f64>,
    pub link_kind: InverseLink,
    pub wiggle_knots: Array1<f64>,
    pub wiggle_degree: usize,
    pub eta_block: ParameterBlockInput,
    pub wiggle_block: ParameterBlockInput,
}

/// Term-level specification of a Gaussian location-scale model: response,
/// weights, and a term collection plus offset for each of the mean and
/// log-sigma predictors.
#[derive(Clone)]
pub struct GaussianLocationScaleTermSpec {
    pub y: Array1<f64>,
    pub weights: Array1<f64>,
    pub meanspec: TermCollectionSpec,
    pub log_sigmaspec: TermCollectionSpec,
    pub mean_offset: Array1<f64>,
    pub log_sigma_offset: Array1<f64>,
}

/// Term-level specification of a Gaussian location-scale model with a wiggle
/// block: the fields of the plain Gaussian spec plus the wiggle knots, degree
/// and parameter-block input.
#[derive(Clone)]
pub struct GaussianLocationScaleWiggleTermSpec {
    pub y: Array1<f64>,
    pub weights: Array1<f64>,
    pub meanspec: TermCollectionSpec,
    pub log_sigmaspec: TermCollectionSpec,
    pub mean_offset: Array1<f64>,
    pub log_sigma_offset: Array1<f64>,
    pub wiggle_knots: Array1<f64>,
    pub wiggle_degree: usize,
    pub wiggle_block: ParameterBlockInput,
}

/// Term-level specification of a Binomial location-scale model: response,
/// weights, inverse link, and a term collection plus offset for each of the
/// threshold and log-sigma predictors.
#[derive(Clone)]
pub struct BinomialLocationScaleTermSpec {
    pub y: Array1<f64>,
    pub weights: Array1<f64>,
    pub link_kind: InverseLink,
    pub thresholdspec: TermCollectionSpec,
    pub log_sigmaspec: TermCollectionSpec,
    pub threshold_offset: Array1<f64>,
    pub log_sigma_offset: Array1<f64>,
}

/// Term-level specification of a Binomial location-scale model with a wiggle
/// block: the fields of the plain Binomial spec plus the wiggle knots, degree
/// and parameter-block input.
#[derive(Clone)]
pub struct BinomialLocationScaleWiggleTermSpec {
    pub y: Array1<f64>,
    pub weights: Array1<f64>,
    pub link_kind: InverseLink,
    pub thresholdspec: TermCollectionSpec,
    pub log_sigmaspec: TermCollectionSpec,
    pub threshold_offset: Array1<f64>,
    pub log_sigma_offset: Array1<f64>,
    pub wiggle_knots: Array1<f64>,
    pub wiggle_degree: usize,
    pub wiggle_block: ParameterBlockInput,
}

/// Result of a blockwise term-level fit: the unified fit plus the resolved
/// term specs and the designs for the mean and noise predictors.
#[derive(Clone, Debug)]
pub struct BlockwiseTermFitResult {
    pub fit: UnifiedFitResult,
    pub meanspec_resolved: TermCollectionSpec,
    pub noisespec_resolved: TermCollectionSpec,
    pub mean_design: TermCollectionDesign,
    pub noise_design: TermCollectionDesign,
}

/// Crate-internal struct with the same fields as `BlockwiseTermFitResult`.
// NOTE(review): presumably the staging form a `BlockwiseTermFitResult` is
// assembled from — confirm at the conversion site.
pub(crate) struct BlockwiseTermFitResultParts {
    pub fit: UnifiedFitResult,
    pub meanspec_resolved: TermCollectionSpec,
    pub noisespec_resolved: TermCollectionSpec,
    pub mean_design: TermCollectionDesign,
    pub noise_design: TermCollectionDesign,
}

/// A blockwise term fit result together with the wiggle knots and degree.
pub struct BlockwiseTermWiggleFitResult {
    pub fit: BlockwiseTermFitResult,
    pub wiggle_knots: Array1<f64>,
    pub wiggle_degree: usize,
}

/// Result of a binomial mean-wiggle term fit: the unified fit, the resolved
/// term spec and design, and the wiggle knots and degree.
pub struct BinomialMeanWiggleTermFitResult {
    pub fit: UnifiedFitResult,
    pub resolvedspec: TermCollectionSpec,
    pub design: TermCollectionDesign,
    pub wiggle_knots: Array1<f64>,
    pub wiggle_degree: usize,
}

/// Module-private struct with the same fields as
/// `BlockwiseTermWiggleFitResult`.
// NOTE(review): presumably the staging form that result is assembled from —
// confirm at the conversion site.
struct BlockwiseTermWiggleFitResultParts {
    pub fit: BlockwiseTermFitResult,
    pub wiggle_knots: Array1<f64>,
    pub wiggle_degree: usize,
}

/// Checks that a term-collection design is numerically finite and internally
/// consistent.
///
/// The checks run in this order, and the first failure is returned:
///
/// 1. every entry of the design matrix is finite (inspected chunk by chunk);
/// 2. `nullspace_dims` and `penaltyinfo` have one entry per penalty;
/// 3. every penalty's local matrix is finite and its `col_range` ends within
///    the design width;
/// 4. optional coefficient lower bounds have one entry per column, each
///    finite or `-inf`;
/// 5. optional linear constraints are finite, `a` has one column per design
///    column, and `a` has as many rows as `b` has entries;
/// 6. `intercept_range` is ordered and ends within the design width.
///
/// `label` prefixes every error message so the caller can tell which design
/// failed.
///
/// # Errors
///
/// Returns the stringified error of the first violated check.
fn validate_term_collection_design(
    label: &str,
    design: &TermCollectionDesign,
) -> Result<(), String> {
    // Small constructors so each check reads as "condition -> message".
    let dimension_mismatch =
        |reason: String| -> String { GamlssError::DimensionMismatch { reason }.into() };
    let constraint_violation =
        |reason: String| -> String { GamlssError::ConstraintViolation { reason }.into() };

    let p = design.design.ncols();
    let row_count = design.design.nrows();

    // 1. Design entries, materialized one row chunk at a time.
    let design_label = format!("{label}.design");
    for rows in exact_design_row_chunks(row_count, p) {
        let chunk = match design.design.try_row_chunk(rows) {
            Ok(chunk) => chunk,
            Err(e) => {
                return Err(format!(
                    "{label}.design row chunk materialization failed: {e}"
                ));
            }
        };
        validate_all_finite_estimation(&design_label, chunk.iter().copied())
            .map_err(|e| e.to_string())?;
    }

    // 2. Per-penalty metadata must line up with the penalty list.
    let penalty_count = design.penalties.len();
    let nullspace_count = design.nullspace_dims.len();
    if nullspace_count != penalty_count {
        return Err(dimension_mismatch(format!(
            "{label}.nullspace_dims length mismatch: got {}, expected {}",
            nullspace_count, penalty_count
        )));
    }
    let penaltyinfo_count = design.penaltyinfo.len();
    if penaltyinfo_count != penalty_count {
        return Err(dimension_mismatch(format!(
            "{label}.penaltyinfo length mismatch: got {}, expected {}",
            penaltyinfo_count, penalty_count
        )));
    }

    // 3. Each penalty: finite entries, column range inside the design.
    for (idx, bp) in design.penalties.iter().enumerate() {
        let penalty_label = format!("{label}.penalties[{idx}]");
        validate_all_finite_estimation(&penalty_label, bp.local.iter().copied())
            .map_err(|e| e.to_string())?;
        if bp.col_range.end > p {
            return Err(dimension_mismatch(format!(
                "{label}.penalties[{idx}] col_range {}..{} exceeds design width {}",
                bp.col_range.start, bp.col_range.end, p
            )));
        }
    }

    // 4. Optional lower bounds: one per column, finite or -inf.
    if let Some(bounds) = &design.coefficient_lower_bounds {
        if bounds.len() != p {
            return Err(constraint_violation(format!(
                "{label}.coefficient_lower_bounds length mismatch: got {}, expected {p}",
                bounds.len()
            )));
        }
        for (idx, &bound) in bounds.iter().enumerate() {
            // NaN and +inf are both rejected here.
            if !bound.is_finite() && bound != f64::NEG_INFINITY {
                let reason = format!(
                    "{label}.coefficient_lower_bounds[{idx}] must be finite or -inf, got {bound}",
                );
                return Err(GamlssError::NonFinite { reason }.into());
            }
        }
    }

    // 5. Optional linear constraints: finite, and shaped to match the design.
    if let Some(constraints) = &design.linear_constraints {
        let a_label = format!("{label}.linear_constraints.a");
        validate_all_finite_estimation(&a_label, constraints.a.iter().copied())
            .map_err(|e| e.to_string())?;
        let b_label = format!("{label}.linear_constraints.b");
        validate_all_finite_estimation(&b_label, constraints.b.iter().copied())
            .map_err(|e| e.to_string())?;

        let a_cols = constraints.a.ncols();
        if a_cols != p {
            return Err(dimension_mismatch(format!(
                "{label}.linear_constraints.a column mismatch: got {}, expected {p}",
                a_cols
            )));
        }
        let a_rows = constraints.a.nrows();
        let b_len = constraints.b.len();
        if a_rows != b_len {
            return Err(dimension_mismatch(format!(
                "{label}.linear_constraints row mismatch: a has {}, b has {}",
                a_rows, b_len
            )));
        }
    }

    // 6. Intercept range must be ordered and inside the design.
    let intercept_in_bounds = design.intercept_range.start <= design.intercept_range.end
        && design.intercept_range.end <= p;
    if !intercept_in_bounds {
        return Err(constraint_violation(format!(
            "{label}.intercept_range out of bounds: {:?} for {} columns",
            design.intercept_range, p
        )));
    }

    Ok(())
}

impl BlockwiseTermFitResult {
    pub(crate) fn try_from_parts(parts: BlockwiseTermFitResultParts) -> Result<Self, String> {
        let BlockwiseTermFitResultParts {
            fit,
            meanspec_resolved,
            noisespec_resolved,
            mean_design,
            noise_design,
        } = parts;

        fit.validate_numeric_finiteness()
            .map_err(|e| format!("{e}"))?;
        if fit.block_states.len() < 2 {
            return Err(GamlssError::DimensionMismatch {
                reason: format!(
                    "BlockwiseTermFitResult requires at least 2 block states, got {}",
                    fit.block_states.len()
                ),
            }
            .into());
        }
        validate_term_collection_design("blockwise_term.mean_design", &mean_design)?;
        validate_term_collection_design("blockwise_term.noise_design", &noise_design)?;
        if mean_design.design.nrows() != noise_design.design.nrows() {
            return Err(GamlssError::DimensionMismatch {
                reason: format!(
                    "BlockwiseTermFitResult row mismatch: mean_design={}, noise_design={}",
                    mean_design.design.nrows(),
                    noise_design.design.nrows()
                ),
            }
            .into());
        }
        if fit.block_states[0].beta.len() != mean_design.design.ncols() {
            return Err(GamlssError::DimensionMismatch {
                reason: format!(
                    "BlockwiseTermFitResult mean beta length mismatch: got {}, expected {}",
                    fit.block_states[0].beta.len(),
                    mean_design.design.ncols()
                ),
            }
            .into());
        }
        if fit.block_states[1].beta.len() != noise_design.design.ncols() {
            return Err(GamlssError::DimensionMismatch {
                reason: format!(
                    "BlockwiseTermFitResult noise beta length mismatch: got {}, expected {}",
                    fit.block_states[1].beta.len(),
                    noise_design.design.ncols()
                ),
            }
            .into());
        }
        if fit.block_states[0].eta.len() != mean_design.design.nrows() {
            return Err(GamlssError::DimensionMismatch {
                reason: format!(
                    "BlockwiseTermFitResult mean eta length mismatch: got {}, expected {}",
                    fit.block_states[0].eta.len(),
                    mean_design.design.nrows()
                ),
            }
            .into());
        }
        if fit.block_states[1].eta.len() != noise_design.design.nrows() {
            return Err(GamlssError::DimensionMismatch {
                reason: format!(
                    "BlockwiseTermFitResult noise eta length mismatch: got {}, expected {}",
                    fit.block_states[1].eta.len(),
                    noise_design.design.nrows()
                ),
            }
            .into());
        }

        Ok(Self {
            fit,
            meanspec_resolved,
            noisespec_resolved,
            mean_design,
            noise_design,
        })
    }

    fn validate_numeric_finiteness(&self) -> Result<(), String> {
        Self::try_from_parts(BlockwiseTermFitResultParts {
            fit: self.fit.clone(),
            meanspec_resolved: self.meanspec_resolved.clone(),
            noisespec_resolved: self.noisespec_resolved.clone(),
            mean_design: self.mean_design.clone(),
            noise_design: self.noise_design.clone(),
        })
        .map(|_| ())
    }
}

impl BlockwiseTermWiggleFitResult {
    /// Validates `parts` and assembles them into a
    /// `BlockwiseTermWiggleFitResult`.
    ///
    /// Checks, in order: the inner two-block result passes its own
    /// validation; the inner fit carries at least three block states; the
    /// knot vector is non-empty; every knot is finite.
    ///
    /// # Errors
    ///
    /// Returns the stringified error of the first failing check.
    fn try_from_parts(parts: BlockwiseTermWiggleFitResultParts) -> Result<Self, String> {
        let BlockwiseTermWiggleFitResultParts {
            fit: inner,
            wiggle_knots: knots,
            wiggle_degree: degree,
        } = parts;

        inner.validate_numeric_finiteness()?;

        let state_count = inner.fit.block_states.len();
        if state_count < 3 {
            let reason = format!(
                "BlockwiseTermWiggleFitResult requires at least 3 block states, got {}",
                state_count
            );
            return Err(GamlssError::DimensionMismatch { reason }.into());
        }

        if knots.is_empty() {
            let reason =
                "BlockwiseTermWiggleFitResult requires non-empty wiggle_knots".to_string();
            return Err(GamlssError::UnsupportedConfiguration { reason }.into());
        }

        validate_all_finite_estimation("blockwise_term_wiggle.wiggle_knots", knots.iter().copied())
            .map_err(|e| e.to_string())?;

        Ok(Self {
            fit: inner,
            wiggle_knots: knots,
            wiggle_degree: degree,
        })
    }
}

/// Fit result for a binomial location-scale model: the two-block term fit
/// plus optional wiggle information.
///
/// NOTE(review): the three `Option` fields are presumably all `Some` exactly
/// when a link wiggle was fitted and all `None` otherwise — confirm against
/// the producer; nothing in this type enforces it.
pub struct BinomialLocationScaleFitResult {
    /// Two-block term fit result.
    pub fit: BlockwiseTermFitResult,
    /// Knot vector of the wiggle basis, when present.
    pub wiggle_knots: Option<Array1<f64>>,
    /// Degree of the wiggle basis, when present.
    pub wiggle_degree: Option<usize>,
    /// Link-wiggle coefficients, when present.
    pub beta_link_wiggle: Option<Vec<f64>>,
}

/// Fit result for a Gaussian location-scale model: the two-block term fit,
/// optional wiggle information, and the response standardization factor used
/// during fitting.
///
/// NOTE(review): the three `Option` fields are presumably all `Some` exactly
/// when a link wiggle was fitted and all `None` otherwise — confirm against
/// the producer; nothing in this type enforces it.
pub struct GaussianLocationScaleFitResult {
    /// Two-block term fit result.
    pub fit: BlockwiseTermFitResult,
    /// Knot vector of the wiggle basis, when present.
    pub wiggle_knots: Option<Array1<f64>>,
    /// Degree of the wiggle basis, when present.
    pub wiggle_degree: Option<usize>,
    /// Link-wiggle coefficients, when present.
    pub beta_link_wiggle: Option<Vec<f64>>,
    /// Response standardization factor applied internally during fitting.
    ///
    /// The Gaussian location-scale path fits on `y / response_scale` so the
    /// fixed log-σ soft floor `LOGB_SIGMA_FLOOR = 0.01` is *operationally*
    /// scale-relative (1 % of the response spread) rather than absolute,
    /// keeping κ = dlogσ/dη ≈ 1 across the realistic σ range and informing the
    /// scale block like gamlss. The returned coefficient `blocks`, `beta`, and
    /// link-wiggle knots/coefficients are already mapped back to **raw response
    /// units** (the Location/Mean block scaled by `response_scale`, the Scale
    /// block intercept shifted by `+ln(response_scale)`), so downstream
    /// reconstruction `μ = X_mean·β` comes out in raw units with no further
    /// rescaling.
    ///
    /// The σ reconstruction, however, **must scale the floor too** to stay
    /// response-scale-equivariant (#884):
    ///
    /// ```text
    /// σ = response_scale·LOGB_SIGMA_FLOOR + exp(X_scale·β)
    ///   = response_scale·(LOGB_SIGMA_FLOOR + exp(η_internal)).
    /// ```
    ///
    /// The intercept shift carries only the `exp(η)` term; reconstructing with a
    /// raw `LOGB_SIGMA_FLOOR` instead of `response_scale·LOGB_SIGMA_FLOOR` leaves
    /// the non-equivariant residual `LOGB_SIGMA_FLOOR·(1 − response_scale)`.
    ///
    /// This field records the factor that was applied for transparency,
    /// covariance bookkeeping, and the equivariant σ-floor reconstruction; it is
    /// `1.0` when no standardization was needed (degenerate constant response).
    pub response_scale: f64,
}

/// Fits a binomial mean model with a wiggle block via `fit_custom_family`.
///
/// Before fitting, the inputs are validated in this order: weight/response
/// length agreement, the weights, the binomial response, the row counts of
/// the eta and wiggle blocks, the link (identity is rejected, then the link
/// must support the joint wiggle), the wiggle degree (at least 2), and the
/// knot count (at least the minimum for that degree).
///
/// # Errors
///
/// Returns the stringified error of the first failing validation, of block
/// spec construction, or of the underlying fit.
fn fit_binomial_mean_wiggle(
    spec: BinomialMeanWiggleSpec,
    options: &BlockwiseFitOptions,
) -> Result<UnifiedFitResult, String> {
    const CALLER: &str = "fit_binomial_mean_wiggle";

    // --- Data and block shape checks -------------------------------------
    let row_count = spec.y.len();
    validate_len_match("weights vs y", row_count, spec.weights.len())?;
    validateweights(&spec.weights, CALLER)?;
    validate_binomial_response(&spec.y, CALLER)?;
    validate_blockrows("eta", row_count, &spec.eta_block)?;
    validate_blockrows("wiggle", row_count, &spec.wiggle_block)?;

    // --- Link checks -----------------------------------------------------
    let uses_identity_link = matches!(
        spec.link_kind,
        InverseLink::Standard(StandardLink::Identity)
    );
    if uses_identity_link {
        let reason = "fit_binomial_mean_wiggle does not support identity link".to_string();
        return Err(GamlssError::UnsupportedConfiguration { reason }.into());
    }
    crate::inference::formula_dsl::require_binomial_inverse_link_supports_joint_wiggle(
        &spec.link_kind,
        CALLER,
    )?;

    // --- Wiggle basis checks ---------------------------------------------
    if spec.wiggle_degree < 2 {
        let reason = format!(
            "fit_binomial_mean_wiggle: wiggle_degree must be >= 2, got {}",
            spec.wiggle_degree
        );
        return Err(GamlssError::ConstraintViolation { reason }.into());
    }
    let minimum_knots = minimum_monotone_wiggle_knot_count(spec.wiggle_degree)?;
    let knot_count = spec.wiggle_knots.len();
    if knot_count < minimum_knots {
        let reason = format!(
            "fit_binomial_mean_wiggle: wiggle_knots length {} is too short for degree {} (need at least {})",
            knot_count, spec.wiggle_degree, minimum_knots
        );
        return Err(GamlssError::DimensionMismatch { reason }.into());
    }

    // --- Assemble the family and its two blocks --------------------------
    let family = BinomialMeanWiggleFamily {
        y: spec.y,
        weights: spec.weights,
        link_kind: spec.link_kind,
        wiggle_knots: spec.wiggle_knots,
        wiggle_degree: spec.wiggle_degree,
        policy: crate::resource::ResourcePolicy::default_library(),
    };

    // The wiggle block is a dynamic monotone I-spline basis: the family
    // rebuilds it at full (raw) width on every inner iteration
    // (`block_geometry_is_dynamic`, plus the `x.ncols() == spec.design.ncols()`
    // assertion in `block_geometry`), so a physical column drop would break it.
    // The level/intercept direction it shares with the eta block therefore has
    // to be given up by the eta block, whose static term-collection design can
    // safely lose a column (and is lifted back through the canonical per-block
    // transform `T`). Assigning eta the lower gauge priority makes the
    // canonical-gauge RRQR put the shared-level alias drop on eta and keeps
    // the dynamic wiggle basis at full width.
    let eta_spec = spec
        .eta_block
        .intospec_with_gauge_priority("eta", LINK_WIGGLE_GAUGE_PRIORITY)?;
    let wiggle_spec = spec.wiggle_block.intospec("wiggle")?;
    let blocks = vec![eta_spec, wiggle_spec];

    fit_custom_family(&family, &blocks, options).map_err(|e| e.to_string())
}

/// Family-specific hooks used by `fit_location_scale_terms` to drive a
/// two-block (mean, noise) location-scale fit.
///
/// An implementor supplies the two term-collection specs, turns realized
/// designs into parameter blocks and a `CustomFamily`, and reports which
/// exact spatial (psi-derivative) capabilities it has. The provided methods
/// default to "no extra capability / no extra parameters".
trait LocationScaleFamilyBuilder {
    /// The custom family produced by [`Self::build_family`].
    type Family: CustomFamily + Clone + Send + Sync + 'static;

    /// Term collection spec for the mean block.
    fn meanspec(&self) -> &TermCollectionSpec;
    /// Term collection spec for the noise block.
    fn noisespec(&self) -> &TermCollectionSpec;

    /// Builds the parameter block specs for the given designs.
    ///
    /// In `fit_location_scale_terms`, `theta` is the leading `rho_dim` slice
    /// of the outer parameter vector, the first two returned blocks are read
    /// as (mean, noise), and the beta hints are the most recent primary
    /// coefficients when available.
    fn build_blocks(
        &self,
        theta: &Array1<f64>,
        mean_design: &TermCollectionDesign,
        noise_design: &TermCollectionDesign,
        mean_beta_hint: Option<Array1<f64>>,
        noise_beta_hint: Option<Array1<f64>>,
    ) -> Result<Vec<ParameterBlockSpec>, String>;

    /// Builds the family instance for the given mean and noise designs.
    fn build_family(
        &self,
        mean_design: &TermCollectionDesign,
        noise_design: &TermCollectionDesign,
    ) -> Self::Family;

    /// Extracts the `(mean, noise)` coefficient vectors from a fit; the
    /// driver stores them as warm-start hints for later `build_blocks` calls.
    fn extract_primary_betas(
        &self,
        fit: &UnifiedFitResult,
    ) -> Result<(Array1<f64>, Array1<f64>), String>;

    /// Number of mean-block penalties; defaults to the design's penalty count.
    fn mean_penalty_count(&self, mean_design: &TermCollectionDesign) -> usize {
        mean_design.penalties.len()
    }

    /// Number of noise-block penalties; defaults to the design's penalty count.
    fn noise_penalty_count(&self, noise_design: &TermCollectionDesign) -> usize {
        noise_design.penalties.len()
    }

    /// Whether the family can supply analytic spatial psi derivatives. The
    /// driver only calls [`Self::build_psiderivative_blocks`] up front when
    /// this returns `true`. Defaults to `false`.
    fn exact_spatial_joint_supported(&self) -> bool {
        false
    }

    /// Whether the exact two-block spatial path is mandatory. When `true`,
    /// the driver fails if analytic psi derivatives are unavailable.
    /// Defaults to `false`.
    fn require_exact_spatial_joint(&self) -> bool {
        false
    }

    /// Seed risk profile handed to the exact-joint spatial optimizer.
    /// Defaults to `GeneralizedLinear`.
    fn exact_spatial_seed_risk_profile(&self) -> crate::seeding::SeedRiskProfile {
        crate::seeding::SeedRiskProfile::GeneralizedLinear
    }

    /// Extra initial rho values passed to the joint setup in addition to the
    /// per-penalty ones. Defaults to an empty vector.
    fn extra_rho0(&self) -> Result<Array1<f64>, String> {
        Ok(Array1::zeros(0))
    }

    /// Builds the per-block psi-derivative descriptors.
    ///
    /// The driver passes the data matrix as `arr`, the mean spec/design as
    /// `term_spec` / `term_design`, and the noise spec/design as
    /// `term_spec2` / `term_design2`.
    /// NOTE(review): the outer `Vec` is presumably indexed by block
    /// (mean, noise) — confirm against implementors.
    fn build_psiderivative_blocks(
        &self,
        arr: ndarray::ArrayView2<'_, f64>,
        term_spec: &TermCollectionSpec,
        term_spec2: &TermCollectionSpec,
        term_design: &TermCollectionDesign,
        term_design2: &TermCollectionDesign,
    ) -> Result<Vec<Vec<CustomFamilyBlockPsiDerivative>>, String>;
}

fn fit_location_scale_terms<B: LocationScaleFamilyBuilder>(
    data: ndarray::ArrayView2<'_, f64>,
    builder: B,
    options: &BlockwiseFitOptions,
    kappa_options: &SpatialLengthScaleOptimizationOptions,
) -> Result<BlockwiseTermFitResult, String> {
    // Large-n location-scale fits keep the caller's explicit Hessian request.
    // The unified REML evaluator chooses a dense or matrix-free exact
    // representation from the realized (n, p, K) work model, so there is no
    // biobank-scale downgrade to BFGS here.

    let mut mean_beta_hint: Option<Array1<f64>> = None;
    let mut noise_beta_hint: Option<Array1<f64>> = None;
    let extra_rho0 = builder.extra_rho0()?;

    let mean_boot_design =
        build_term_collection_design(data, builder.meanspec()).map_err(|e| e.to_string())?;
    let noise_boot_design =
        build_term_collection_design(data, builder.noisespec()).map_err(|e| e.to_string())?;
    let mean_bootspec = freeze_term_collection_from_design(builder.meanspec(), &mean_boot_design)
        .map_err(|e| e.to_string())?;
    let noise_bootspec =
        freeze_term_collection_from_design(builder.noisespec(), &noise_boot_design)
            .map_err(|e| e.to_string())?;

    let require_exact_spatial_joint = builder.require_exact_spatial_joint();
    let analytic_joint_derivatives_check = if builder.exact_spatial_joint_supported() {
        builder
            .build_psiderivative_blocks(
                data,
                &mean_bootspec,
                &noise_bootspec,
                &mean_boot_design,
                &noise_boot_design,
            )
            .map(|_| ())
    } else {
        Err(
            "analytic spatial psi derivatives are unavailable for this location-scale family"
                .to_string(),
        )
    };
    let analytic_joint_derivatives_available = analytic_joint_derivatives_check.is_ok();
    if require_exact_spatial_joint {
        analytic_joint_derivatives_check.map_err(|err| {
            format!("exact two-block spatial path requires analytic psi derivatives: {err}")
        })?;
    }
    let mean_penalty_count = builder.mean_penalty_count(&mean_boot_design);
    let noise_penalty_count = builder.noise_penalty_count(&noise_boot_design);

    // Honor an explicit user-supplied `length_scale=X` on every spatial term
    // in both the mean and noise blocks: when every term is κ-locked (no
    // anisotropy, no per-axis ψ contrasts), the joint-spatial outer optimizer
    // has nothing to optimize. Routing through it anyway wraps the full
    // two-block coefficient solve inside an unnecessary outer loop where
    // each evaluation runs the inner Newton from scratch. This is the same
    // short-circuit the Bernoulli marginal-slope entry point performs at
    // bernoulli_marginal_slope.rs:16432-16442; mirroring it here makes the
    // GAMLSS path skip straight to the `(!enabled || log_kappa_dim == 0)`
    // fast path in `optimize_spatial_length_scale_exact_joint`.
    let mut effective_kappa_options = kappa_options.clone();
    if effective_kappa_options.enabled
        && crate::smooth::all_spatial_terms_kappa_fixed(&mean_bootspec)
        && crate::smooth::all_spatial_terms_kappa_fixed(&noise_bootspec)
    {
        log::info!(
            "[GAMLSS spatial] disabling κ/ψ optimization: every spatial term in \
             both blocks has an explicit length_scale and no anisotropy; \
             user-supplied kernel scale is fixed"
        );
        effective_kappa_options.enabled = false;
    }
    let kappa_options: &SpatialLengthScaleOptimizationOptions = &effective_kappa_options;

    // Macro to invoke the exact-joint spatial optimizer with shared closures.
    // The exact path evaluates the full profiled/Laplace objective over
    // theta = [rho, psi] with the real joint Hessian required by NewtonTR/ARC.
    macro_rules! run_exact_joint_spatial {
        () => {{
            let joint_setup = build_two_block_exact_joint_setup(
                data,
                builder.meanspec(),
                builder.noisespec(),
                mean_penalty_count,
                noise_penalty_count,
                extra_rho0.as_slice().unwrap_or(&[]),
                None,
                kappa_options,
            );
            let mean_terms = spatial_length_scale_term_indices(builder.meanspec());
            let noise_terms = spatial_length_scale_term_indices(builder.noisespec());
            let mean_beta_hint_cell = std::cell::RefCell::new(mean_beta_hint.clone());
            let noise_beta_hint_cell = std::cell::RefCell::new(noise_beta_hint.clone());
            let hyper_warm_start_cell =
                std::cell::RefCell::new(None::<CustomFamilyWarmStart>);
            // Two-block GAMLSS/location-scale joint likelihoods have a
            // β-dependent cross-block Hessian (the (μ,log σ) / (t,log σ)
            // off-diagonal blocks involve residual/response scalars that
            // shift when β moves). The Wood-Fasiolo structural property
            // `H^{-1/2} B_k H^{-1/2} ≽ 0` plus parameter-independent
            // nullspace — the mathematical basis for EFS convergence —
            // fails here, so EFS/HybridEFS must be excluded at plan time
            // rather than retried as a silent first attempt that stalls
            // for hundreds of seconds before the runner falls back.
            let gamlss_disable_fixed_point = true;
            let outer_policy = {
                // GAMLSS spatial path: psi_dim = log_kappa_dim + auxiliary_dim,
                // matching the (theta_dim - rho_dim) decomposition the
                // optimizer uses internally. Build realized ParameterBlockSpecs
                // at the seed rho so the family's own cost model — which
                // multiplies coefficient-gradient / coefficient-Hessian
                // per-row cost by the joint outer-coordinate dimension and
                // total p — produces honest `predicted_*_work` estimates.
                // Previously this fed `predicted_*_work: 0` to the planner,
                // which then ungated dense outer Hessian work that costs
                // hundreds of seconds per eval at biobank scale (see
                // `OuterDerivativePolicy::OUTER_HESSIAN_WORK_BUDGET`).
                let theta_seed = joint_setup.theta0();
                let rho_dim = joint_setup.rho_dim();
                let psi_dim = theta_seed.len() - rho_dim;
                let rho_seed = theta_seed.slice(s![..rho_dim]).to_owned();
                let policy_blocks_res = builder.build_blocks(
                    &rho_seed,
                    &mean_boot_design,
                    &noise_boot_design,
                    mean_beta_hint_cell.borrow().clone(),
                    noise_beta_hint_cell.borrow().clone(),
                );
                let mut policy = match policy_blocks_res {
                    Ok(policy_blocks) => {
                        let policy_family =
                            builder.build_family(&mean_boot_design, &noise_boot_design);
                        crate::families::custom_family::CustomFamily::outer_derivative_policy(
                            &policy_family,
                            &policy_blocks,
                            psi_dim,
                            options,
                        )
                    }
                    Err(err) => {
                        // Block construction at the seed should not fail for
                        // any in-tree family, but if it does, fall back to a
                        // policy that names the capability honestly and
                        // declines to predict cost. Setting work to
                        // `u128::MAX` routes the planner through gradient-only
                        // BFGS (the universal Hessian-work budget is
                        // saturating, so a sentinel is fine here).
                        log::warn!(
                            "[GAMLSS spatial] failed to realize policy blocks at seed rho ({err}); \
                             routing outer optimizer through gradient-only BFGS"
                        );
                        let capability = if analytic_joint_derivatives_available {
                            crate::families::custom_family::ExactOuterDerivativeOrder::Second
                        } else {
                            crate::families::custom_family::ExactOuterDerivativeOrder::First
                        };
                        crate::families::custom_family::OuterDerivativePolicy {
                            capability,
                            predicted_gradient_work: u128::MAX,
                            predicted_hessian_work: u128::MAX,
                            // No GAMLSS family today overrides its
                            // outer-only `_with_options` hooks to consume
                            // `outer_score_subsample`; staged-κ would
                            // build pilot masks the family then ignores.
                            subsample_capable: false,
                        }
                    }
                };
                if !analytic_joint_derivatives_available {
                    // Capability must not exceed what the analytic derivatives
                    // path can supply — the macro's hyper evaluator returns
                    // an error otherwise.
                    policy.capability =
                        crate::families::custom_family::ExactOuterDerivativeOrder::First;
                }
                policy
            };
            optimize_spatial_length_scale_exact_joint(
                data,
                &[builder.meanspec().clone(), builder.noisespec().clone()],
                &[mean_terms, noise_terms],
                kappa_options,
                &joint_setup,
                builder.exact_spatial_seed_risk_profile(),
                analytic_joint_derivatives_available,
                analytic_joint_derivatives_available,
                gamlss_disable_fixed_point,
                None,
                outer_policy,
                |theta, specs: &[TermCollectionSpec], designs: &[TermCollectionDesign]| {
                    assert_eq!(
                        specs.len(),
                        2,
                        "joint spatial closure expects exactly two block specs (mean, noise); got {}",
                        specs.len(),
                    );
                    assert_eq!(
                        designs.len(),
                        2,
                        "joint spatial closure expects exactly two block designs (mean, noise); got {}",
                        designs.len(),
                    );
                    let rho = theta.slice(s![..joint_setup.rho_dim()]).to_owned();
                    let fit = {
                        let blocks = builder.build_blocks(
                            &rho,
                            &designs[0],
                            &designs[1],
                            mean_beta_hint_cell.borrow().clone(),
                            noise_beta_hint_cell.borrow().clone(),
                        )?;
                        if mean_beta_hint_cell.borrow().is_none()
                            && let Some(beta) = blocks.first().and_then(|block| block.initial_beta.clone())
                        {
                            *mean_beta_hint_cell.borrow_mut() = Some(beta);
                        }
                        if noise_beta_hint_cell.borrow().is_none()
                            && let Some(beta) =
                                blocks.get(1).and_then(|block| block.initial_beta.clone())
                        {
                            *noise_beta_hint_cell.borrow_mut() = Some(beta);
                        }
                        let family = builder.build_family(&designs[0], &designs[1]);
                        // Branch on whether the κ optimizer drives rho.
                        //
                        // * `log_kappa_dim() > 0 && kappa_options.enabled` ⇒
                        //   the outer (ρ, ψ) optimizer is active and
                        //   passes each candidate ρ to this closure;
                        //   the inner fit must hold log-lambdas fixed
                        //   at the supplied ρ so the outer derivative
                        //   has a well-defined directional gradient.
                        //
                        // * Otherwise (κ disabled via the locked-κ
                        //   short-circuit, or no spatial terms at all)
                        //   the fast path in
                        //   `optimize_spatial_length_scale_exact_joint`
                        //   calls this closure exactly once at
                        //   `theta = theta0`; ρ must still be optimized
                        //   from data because the user never pinned it.
                        //   `fit_custom_family` performs the joint
                        //   ρ + coefficient REML fit at the user's
                        //   (now-fixed) kernel scale, which is the
                        //   intended behaviour when `length_scale=…` is
                        //   set on every spatial term.
                        if joint_setup.log_kappa_dim() > 0 && kappa_options.enabled {
                            let warm_start = hyper_warm_start_cell.borrow().clone();
                            fit_custom_family_fixed_log_lambdas(
                                &family,
                                &blocks,
                                options,
                                warm_start.as_ref(),
                                0,
                                None,
                                true,
                            )?
                        } else {
                            fit_custom_family(&family, &blocks, options)?
                        }
                    };
                    let (mean_beta, noise_beta) = builder.extract_primary_betas(&fit)?;
                    mean_beta_hint = Some(mean_beta);
                    noise_beta_hint = Some(noise_beta);
                    *mean_beta_hint_cell.borrow_mut() = mean_beta_hint.clone();
                    *noise_beta_hint_cell.borrow_mut() = noise_beta_hint.clone();
                    Ok(fit)
                },
                |theta,
                 specs: &[TermCollectionSpec],
                 designs: &[TermCollectionDesign],
                 eval_mode,
                 row_set: &crate::families::row_kernel::RowSet| {
                    use crate::solver::estimate::reml::unified::EvalMode;
                    if !analytic_joint_derivatives_available {
                        return Err(
                            "analytic spatial psi derivatives are unavailable for this exact two-block path"
                                .to_string(),
                        );
                    }
                    let rho = theta.slice(s![..joint_setup.rho_dim()]).to_owned();
                    let blocks = builder.build_blocks(
                        &rho,
                        &designs[0],
                        &designs[1],
                        mean_beta_hint_cell.borrow().clone(),
                        noise_beta_hint_cell.borrow().clone(),
                    )?;
                    if mean_beta_hint_cell.borrow().is_none()
                        && let Some(beta) = blocks.first().and_then(|block| block.initial_beta.clone())
                    {
                        *mean_beta_hint_cell.borrow_mut() = Some(beta);
                    }
                    if noise_beta_hint_cell.borrow().is_none()
                        && let Some(beta) = blocks.get(1).and_then(|block| block.initial_beta.clone())
                    {
                        *noise_beta_hint_cell.borrow_mut() = Some(beta);
                    }
                    let family = builder.build_family(&designs[0], &designs[1]);
                    let psiderivative_blocks = builder.build_psiderivative_blocks(
                        data,
                        &specs[0],
                        &specs[1],
                        &designs[0],
                        &designs[1],
                    )?;
                    let warm_start = hyper_warm_start_cell.borrow().clone();
                    // Forward the κ-staging row set to the family by installing it
                    // on the canonical `outer_score_subsample` option. Inner-PIRLS
                    // and final covariance still run on full data (the per-row
                    // weight is consulted only by outer-only paths inside the
                    // family). When the staging schedule is full-data the option
                    // stays `None` and the call is equivalent to the prior path.
                    let eval_options = match row_set {
                        crate::families::row_kernel::RowSet::All => {
                            std::borrow::Cow::Borrowed(options)
                        }
                        crate::families::row_kernel::RowSet::Subsample {
                            rows,
                            n_full,
                        } => {
                            let subsample = crate::families::marginal_slope_shared::
                                OuterScoreSubsample::from_weighted_rows(
                                    (**rows).clone(),
                                    *n_full,
                                    *n_full as u64,
                                );
                            let mut cloned = options.clone();
                            cloned.outer_score_subsample =
                                Some(std::sync::Arc::new(subsample));
                            std::borrow::Cow::Owned(cloned)
                        }
                    };
                    let eval = evaluate_custom_family_joint_hyper(
                        &family,
                        &blocks,
                        eval_options.as_ref(),
                        &rho,
                        &psiderivative_blocks,
                        warm_start.as_ref(),
                        eval_mode,
                    )?;
                    *hyper_warm_start_cell.borrow_mut() = Some(eval.warm_start.clone());
                    if !eval.inner_converged {
                        return Err(
                            "exact two-block spatial inner solve did not converge".to_string(),
                        );
                    }
                    if matches!(eval_mode, EvalMode::ValueGradientHessian)
                        && !eval.outer_hessian.is_analytic()
                    {
                        return Err(
                            "exact two-block spatial objective requires a full joint [rho, psi] hessian"
                                .to_string(),
                        );
                    }
                    Ok((eval.objective, eval.gradient, eval.outer_hessian))
                },
                |theta, specs: &[TermCollectionSpec], designs: &[TermCollectionDesign]| {
                    if !analytic_joint_derivatives_available {
                        return Err(
                            "analytic spatial psi derivatives are unavailable for this exact two-block path"
                                .to_string(),
                        );
                    }
                    let rho = theta.slice(s![..joint_setup.rho_dim()]).to_owned();
                    let blocks = builder.build_blocks(
                        &rho,
                        &designs[0],
                        &designs[1],
                        mean_beta_hint_cell.borrow().clone(),
                        noise_beta_hint_cell.borrow().clone(),
                    )?;
                    if mean_beta_hint_cell.borrow().is_none()
                        && let Some(beta) = blocks.first().and_then(|block| block.initial_beta.clone())
                    {
                        *mean_beta_hint_cell.borrow_mut() = Some(beta);
                    }
                    if noise_beta_hint_cell.borrow().is_none()
                        && let Some(beta) = blocks.get(1).and_then(|block| block.initial_beta.clone())
                    {
                        *noise_beta_hint_cell.borrow_mut() = Some(beta);
                    }
                    let family = builder.build_family(&designs[0], &designs[1]);
                    let psiderivative_blocks = builder.build_psiderivative_blocks(
                        data,
                        &specs[0],
                        &specs[1],
                        &designs[0],
                        &designs[1],
                    )?;
                    let warm_start = hyper_warm_start_cell.borrow().clone();
                    let eval = evaluate_custom_family_joint_hyper_efs(
                        &family,
                        &blocks,
                        options,
                        &rho,
                        &psiderivative_blocks,
                        warm_start.as_ref(),
                    )?;
                    *hyper_warm_start_cell.borrow_mut() = Some(eval.warm_start.clone());
                    if !eval.inner_converged {
                        return Err(
                            "exact two-block spatial EFS inner solve did not converge".to_string(),
                        );
                    }
                    Ok(eval.efs_eval)
                },
                |_beta: &Array1<f64>| Ok(()),
            )
        }};
    }

    let mut solved = run_exact_joint_spatial!()
        .map_err(|err| format!("exact two-block spatial optimization failed: {err}"))?;

    let expected_noise_penalty_count = builder.noise_penalty_count(&solved.designs[1]);
    let actual_noise_penalty_count = solved.designs[1].penalties.len();
    if expected_noise_penalty_count > actual_noise_penalty_count {
        if expected_noise_penalty_count != actual_noise_penalty_count + 1 {
            return Err(GamlssError::UnsupportedConfiguration {
                reason: format!(
                    "location-scale result noise design expected {} penalties after augmentation, got {} before augmentation",
                    expected_noise_penalty_count, actual_noise_penalty_count
                ),
            }
            .into());
        }
        append_binomial_log_sigma_shrinkage_penalty_design(&mut solved.designs[1]);
    }

    BlockwiseTermFitResult::try_from_parts(BlockwiseTermFitResultParts {
        fit: solved.fit,
        meanspec_resolved: solved.resolved_specs.remove(0),
        noisespec_resolved: solved.resolved_specs.remove(0),
        mean_design: solved.designs.remove(0),
        noise_design: solved.designs.remove(0),
    })
}

/// Owned inputs for the two-block (mean, log-sigma) Gaussian location-scale
/// term fit.
///
/// `fit_gaussian_location_scale_terms` fills this from a
/// `GaussianLocationScaleTermSpec`; the `LocationScaleFamilyBuilder` impl then
/// reads the fields to build parameter blocks and the family object.
struct GaussianLocationScaleTermBuilder {
    /// Passed to the block builder and cloned into the family's `y`
    /// (presumably the response vector — confirm against the spec type).
    y: Array1<f64>,
    /// Passed to the block builder and cloned into the family's `weights`.
    weights: Array1<f64>,
    /// Term spec returned by `meanspec()`.
    meanspec: TermCollectionSpec,
    /// Term spec returned by `noisespec()`; filled from the spec's
    /// `log_sigmaspec`.
    noisespec: TermCollectionSpec,
    /// Offset forwarded to the block builder for the mean block.
    mean_offset: Array1<f64>,
    /// Offset forwarded to the block builder for the log-sigma block; filled
    /// from the spec's `log_sigma_offset`.
    noise_offset: Array1<f64>,
}

/// Builder hooks for the plain two-block Gaussian location-scale model.
///
/// Block order is mean first, log-sigma second, in both `build_blocks` and
/// `build_psiderivative_blocks`.
impl LocationScaleFamilyBuilder for GaussianLocationScaleTermBuilder {
    type Family = GaussianLocationScaleFamily;

    /// Term spec of the mean block.
    fn meanspec(&self) -> &TermCollectionSpec {
        &self.meanspec
    }

    /// Term spec of the log-sigma block.
    fn noisespec(&self) -> &TermCollectionSpec {
        &self.noisespec
    }

    /// Number of smoothing parameters the log-sigma block consumes: the
    /// penalties listed on `noise_design` plus one.
    fn noise_penalty_count(&self, noise_design: &TermCollectionDesign) -> usize {
        // The log-sigma (scale) block gets one penalty beyond those listed on
        // the design: a full-space shrinkage penalty, mirroring the Binomial
        // location-scale path. It keeps the block's polynomial nullspace
        // (constant log-sigma, plus the linear term for tp/Duchon bases) from
        // being left unpenalized. Left unpenalized, outer REML would tune
        // lambda_sigma on a flat/ill-conditioned surface, which over-smooths
        // the scale envelope (bad Pearson/CRPS/PIT/NLL) and can make the
        // coupled inner Newton diverge (log_sigma residual blows up,
        // beta -> infinity). The ridge strength itself is REML-selected.
        let listed_penalties = noise_design.penalties.len();
        listed_penalties + 1
    }

    fn exact_spatial_joint_supported(&self) -> bool {
        true
    }

    fn exact_spatial_seed_risk_profile(&self) -> crate::seeding::SeedRiskProfile {
        crate::seeding::SeedRiskProfile::Gaussian
    }

    /// Builds the `[mean, log-sigma]` parameter block specs for the smoothing
    /// parameters in `theta`.
    ///
    /// # Errors
    /// Fails when `theta` does not have the length the two-block layout
    /// expects, or when the underlying block construction fails.
    fn build_blocks(
        &self,
        theta: &Array1<f64>,
        mean_design: &TermCollectionDesign,
        noise_design: &TermCollectionDesign,
        mean_beta_hint: Option<Array1<f64>>,
        noise_beta_hint: Option<Array1<f64>>,
    ) -> Result<Vec<ParameterBlockSpec>, String> {
        let mean_penalties = mean_design.penalties.len();
        let noise_penalties = self.noise_penalty_count(noise_design);
        let lambda_layout = GamlssLambdaLayout::two_block(mean_penalties, noise_penalties);
        lambda_layout.validate_theta_len(theta.len(), "gaussian location-scale")?;

        let (mu_block, log_sigma_block) = build_gaussian_mean_and_scale_blocks(
            &self.y,
            &self.weights,
            mean_design,
            noise_design,
            &self.mean_offset,
            &self.noise_offset,
            lambda_layout.mean_from(theta),
            lambda_layout.noise_from(theta),
            mean_beta_hint,
            noise_beta_hint,
            "GaussianLocationScale::build_blocks",
        )?;
        Ok(vec![mu_block, log_sigma_block])
    }

    /// Assembles the family object for the given realized designs.
    ///
    /// # Panics
    /// Panics if `prepared_gaussian_log_sigma_design` does not yield a design.
    fn build_family(
        &self,
        mean_design: &TermCollectionDesign,
        noise_design: &TermCollectionDesign,
    ) -> Self::Family {
        let log_sigma_design =
            prepared_gaussian_log_sigma_design(&mean_design.design, &noise_design.design)
                .expect("prepared Gaussian log-sigma design should match block construction");
        let mu_design = mean_design.design.clone();
        GaussianLocationScaleFamily {
            y: self.y.clone(),
            weights: self.weights.clone(),
            mu_design: Some(mu_design),
            log_sigma_design: Some(log_sigma_design),
            policy: crate::resource::ResourcePolicy::default_library(),
            cached_row_scalars: std::sync::RwLock::new(None),
        }
    }

    /// Returns clones of the fitted `(mean, log-sigma)` coefficient vectors.
    ///
    /// # Errors
    /// Fails when either block state is absent from `fit.block_states`.
    fn extract_primary_betas(
        &self,
        fit: &UnifiedFitResult,
    ) -> Result<(Array1<f64>, Array1<f64>), String> {
        let Some(mu_state) = fit.block_states.get(GaussianLocationScaleFamily::BLOCK_MU) else {
            return Err("missing Gaussian mu block state".to_string());
        };
        let Some(log_sigma_state) = fit
            .block_states
            .get(GaussianLocationScaleFamily::BLOCK_LOG_SIGMA)
        else {
            return Err("missing Gaussian log_sigma block state".to_string());
        };
        Ok((mu_state.beta.clone(), log_sigma_state.beta.clone()))
    }

    /// Builds the per-block spatial psi derivative payloads, `[mean, log-sigma]`.
    ///
    /// # Errors
    /// Fails when the derivative builder errors or returns `None` for either
    /// block.
    fn build_psiderivative_blocks(
        &self,
        data: ndarray::ArrayView2<'_, f64>,
        meanspec_resolved: &TermCollectionSpec,
        noisespec_resolved: &TermCollectionSpec,
        mean_design: &TermCollectionDesign,
        noise_design: &TermCollectionDesign,
    ) -> Result<Vec<Vec<CustomFamilyBlockPsiDerivative>>, String> {
        let Some(mean_derivs) =
            build_block_spatial_psi_derivatives(data, meanspec_resolved, mean_design)?
        else {
            return Err("missing Gaussian mean spatial psi derivatives".to_string());
        };
        let Some(noise_derivs) =
            build_block_spatial_psi_derivatives(data, noisespec_resolved, noise_design)?
        else {
            return Err("missing Gaussian log-sigma spatial psi derivatives".to_string());
        };
        Ok(vec![mean_derivs, noise_derivs])
    }
}

/// Owned inputs for the three-block (mean, log-sigma, wiggle) Gaussian
/// location-scale term fit.
///
/// `fit_gaussian_location_scalewiggle_terms` fills this from a
/// `GaussianLocationScaleWiggleTermSpec`.
struct GaussianLocationScaleWiggleTermBuilder {
    /// Passed to the block builder and cloned into the family's `y`
    /// (presumably the response vector — confirm against the spec type).
    y: Array1<f64>,
    /// Passed to the block builder and cloned into the family's `weights`.
    weights: Array1<f64>,
    /// Term spec returned by `meanspec()`.
    meanspec: TermCollectionSpec,
    /// Term spec returned by `noisespec()`; filled from the spec's
    /// `log_sigmaspec`.
    noisespec: TermCollectionSpec,
    /// Offset forwarded to the block builder for the mean block.
    mean_offset: Array1<f64>,
    /// Offset forwarded to the block builder for the log-sigma block.
    noise_offset: Array1<f64>,
    /// Cloned into the family's `wiggle_knots`.
    wiggle_knots: Array1<f64>,
    /// Copied into the family's `wiggle_degree`.
    wiggle_degree: usize,
    /// Source of the wiggle block's design, offset, penalties, nullspace
    /// dimensions, initial coefficients and initial log-lambdas.
    wiggle_block: ParameterBlockInput,
}

/// Builder hooks for the Gaussian location-scale model with a link-wiggle
/// block. Block order is mean, log-sigma, wiggle.
impl LocationScaleFamilyBuilder for GaussianLocationScaleWiggleTermBuilder {
    type Family = GaussianLocationScaleWiggleFamily;

    /// Term spec of the mean block.
    fn meanspec(&self) -> &TermCollectionSpec {
        &self.meanspec
    }

    /// Term spec of the log-sigma block.
    fn noisespec(&self) -> &TermCollectionSpec {
        &self.noisespec
    }

    /// Penalties listed on `noise_design` plus one.
    fn noise_penalty_count(&self, noise_design: &TermCollectionDesign) -> usize {
        // The extra slot is the same full-space log-sigma shrinkage penalty
        // used by the non-wiggle Gaussian builder; the rationale is written
        // up on GaussianLocationScaleTermBuilder.
        let listed_penalties = noise_design.penalties.len();
        listed_penalties + 1
    }

    fn exact_spatial_joint_supported(&self) -> bool {
        true
    }

    fn exact_spatial_seed_risk_profile(&self) -> crate::seeding::SeedRiskProfile {
        crate::seeding::SeedRiskProfile::Gaussian
    }

    fn require_exact_spatial_joint(&self) -> bool {
        true
    }

    /// Initial log-lambdas contributed by the wiggle block.
    fn extra_rho0(&self) -> Result<Array1<f64>, String> {
        initial_log_lambdas_orzeros(&self.wiggle_block)
    }

    /// Builds the `[mean, log-sigma, wiggle]` parameter block specs for the
    /// smoothing parameters in `theta`.
    ///
    /// # Errors
    /// Fails when `theta` has the wrong length for the wiggle layout, or when
    /// construction of any block fails.
    fn build_blocks(
        &self,
        theta: &Array1<f64>,
        mean_design: &TermCollectionDesign,
        noise_design: &TermCollectionDesign,
        mean_beta_hint: Option<Array1<f64>>,
        noise_beta_hint: Option<Array1<f64>>,
    ) -> Result<Vec<ParameterBlockSpec>, String> {
        let wiggle = &self.wiggle_block;
        let mean_penalties = mean_design.penalties.len();
        let noise_penalties = self.noise_penalty_count(noise_design);
        let lambda_layout =
            GamlssLambdaLayout::withwiggle(mean_penalties, noise_penalties, wiggle.penalties.len());
        lambda_layout.validate_theta_len(theta.len(), "gaussian location-scale wiggle")?;

        let (mu_block, log_sigma_block) = build_gaussian_mean_and_scale_blocks(
            &self.y,
            &self.weights,
            mean_design,
            noise_design,
            &self.mean_offset,
            &self.noise_offset,
            lambda_layout.mean_from(theta),
            lambda_layout.noise_from(theta),
            mean_beta_hint,
            noise_beta_hint,
            "GaussianLocationScaleWiggle::build_blocks",
        )?;

        // The wiggle block is sized against the row count of the mean block.
        let row_count = mu_block.design.nrows();
        let wiggle_spec = build_location_scale_wiggle_block(
            "wiggle",
            wiggle.design.clone(),
            wiggle.offset.clone(),
            wiggle_block_penalty_matrices(wiggle),
            wiggle.nullspace_dims.clone(),
            lambda_layout.wiggle_from(theta),
            wiggle.initial_beta.clone(),
            row_count,
        )?;
        Ok(vec![mu_block, log_sigma_block, wiggle_spec])
    }

    /// Assembles the wiggle family object for the given realized designs.
    ///
    /// # Panics
    /// Panics if `prepared_gaussian_log_sigma_design` does not yield a design.
    fn build_family(
        &self,
        mean_design: &TermCollectionDesign,
        noise_design: &TermCollectionDesign,
    ) -> Self::Family {
        let log_sigma_design =
            prepared_gaussian_log_sigma_design(&mean_design.design, &noise_design.design).expect(
                "prepared Gaussian log-sigma design should match wiggle block construction",
            );
        let mu_design = mean_design.design.clone();
        GaussianLocationScaleWiggleFamily {
            y: self.y.clone(),
            weights: self.weights.clone(),
            mu_design: Some(mu_design),
            log_sigma_design: Some(log_sigma_design),
            wiggle_knots: self.wiggle_knots.clone(),
            wiggle_degree: self.wiggle_degree,
            policy: crate::resource::ResourcePolicy::default_library(),
            cached_row_scalars: std::sync::RwLock::new(None),
        }
    }

    /// Returns clones of the fitted `(mean, log-sigma)` coefficient vectors;
    /// the wiggle coefficients are not part of the returned pair.
    ///
    /// # Errors
    /// Fails when either block state is absent from `fit.block_states`.
    fn extract_primary_betas(
        &self,
        fit: &UnifiedFitResult,
    ) -> Result<(Array1<f64>, Array1<f64>), String> {
        let Some(mu_state) = fit
            .block_states
            .get(GaussianLocationScaleWiggleFamily::BLOCK_MU)
        else {
            return Err("missing Gaussian wiggle mu block state".to_string());
        };
        let Some(log_sigma_state) = fit
            .block_states
            .get(GaussianLocationScaleWiggleFamily::BLOCK_LOG_SIGMA)
        else {
            return Err("missing Gaussian wiggle log_sigma block state".to_string());
        };
        Ok((mu_state.beta.clone(), log_sigma_state.beta.clone()))
    }

    /// Builds the spatial psi derivative payloads for the mean and log-sigma
    /// blocks; the third (wiggle) entry is always an empty list.
    ///
    /// # Errors
    /// Fails when the derivative builder errors or returns `None` for the
    /// mean or log-sigma block.
    fn build_psiderivative_blocks(
        &self,
        data: ndarray::ArrayView2<'_, f64>,
        meanspec_resolved: &TermCollectionSpec,
        noisespec_resolved: &TermCollectionSpec,
        mean_design: &TermCollectionDesign,
        noise_design: &TermCollectionDesign,
    ) -> Result<Vec<Vec<CustomFamilyBlockPsiDerivative>>, String> {
        let Some(mean_derivs) =
            build_block_spatial_psi_derivatives(data, meanspec_resolved, mean_design)?
        else {
            return Err("missing Gaussian wiggle mean spatial psi derivatives".to_string());
        };
        let Some(noise_derivs) =
            build_block_spatial_psi_derivatives(data, noisespec_resolved, noise_design)?
        else {
            return Err("missing Gaussian wiggle log-sigma spatial psi derivatives".to_string());
        };
        let wiggle_derivs = Vec::new();
        Ok(vec![mean_derivs, noise_derivs, wiggle_derivs])
    }
}

/// Owned inputs for the two-block (threshold, log-sigma) Binomial
/// location-scale term fit.
///
/// `fit_binomial_location_scale_terms` fills this from a
/// `BinomialLocationScaleTermSpec`.
struct BinomialLocationScaleTermBuilder {
    /// Passed to the block builder and cloned into the family's `y`
    /// (presumably the response vector — confirm against the spec type).
    y: Array1<f64>,
    /// Passed to the block builder, the identified log-sigma design helper,
    /// and cloned into the family's `weights`.
    weights: Array1<f64>,
    /// Passed to the block builder and cloned into the family's `link_kind`.
    link_kind: InverseLink,
    /// Term spec returned by `meanspec()`; filled from the spec's
    /// `thresholdspec`.
    meanspec: TermCollectionSpec,
    /// Term spec returned by `noisespec()`; filled from the spec's
    /// `log_sigmaspec`.
    noisespec: TermCollectionSpec,
    /// Offset forwarded for the threshold block; filled from the spec's
    /// `threshold_offset`.
    mean_offset: Array1<f64>,
    /// Offset forwarded for the log-sigma block; filled from the spec's
    /// `log_sigma_offset`.
    noise_offset: Array1<f64>,
}

impl LocationScaleFamilyBuilder for BinomialLocationScaleTermBuilder {
    type Family = BinomialLocationScaleFamily;

    fn meanspec(&self) -> &TermCollectionSpec {
        &self.meanspec
    }

    fn noisespec(&self) -> &TermCollectionSpec {
        &self.noisespec
    }

    fn exact_spatial_joint_supported(&self) -> bool {
        true
    }

    fn require_exact_spatial_joint(&self) -> bool {
        true
    }

    fn noise_penalty_count(&self, noise_design: &TermCollectionDesign) -> usize {
        noise_design.penalties.len() + 1
    }

    fn build_blocks(
        &self,
        theta: &Array1<f64>,
        mean_design: &TermCollectionDesign,
        noise_design: &TermCollectionDesign,
        mean_beta_hint: Option<Array1<f64>>,
        noise_beta_hint: Option<Array1<f64>>,
    ) -> Result<Vec<ParameterBlockSpec>, String> {
        let layout = GamlssLambdaLayout::two_block(
            mean_design.penalties.len(),
            self.noise_penalty_count(noise_design),
        );
        layout.validate_theta_len(theta.len(), "binomial location-scale")?;
        let (thresholdspec, log_sigmaspec) = build_binomial_threshold_and_scale_blocks(
            &self.y,
            &self.weights,
            &self.link_kind,
            mean_design,
            noise_design,
            &self.mean_offset,
            &self.noise_offset,
            layout.mean_from(theta),
            layout.noise_from(theta),
            mean_beta_hint,
            noise_beta_hint,
            "BinomialLocationScale::build_blocks",
        )?;
        Ok(vec![thresholdspec, log_sigmaspec])
    }

    fn build_family(
        &self,
        mean_design: &TermCollectionDesign,
        noise_design: &TermCollectionDesign,
    ) -> Self::Family {
        let identifiednoise_design =
            identified_binomial_log_sigma_design(mean_design, noise_design, &self.weights)
                .expect("identified binomial log-sigma design");
        BinomialLocationScaleFamily {
            y: self.y.clone(),
            weights: self.weights.clone(),
            link_kind: self.link_kind.clone(),
            threshold_design: Some(mean_design.design.clone()),
            log_sigma_design: Some(identifiednoise_design),
            policy: crate::resource::ResourcePolicy::default_library(),
        }
    }

    fn extract_primary_betas(
        &self,
        fit: &UnifiedFitResult,
    ) -> Result<(Array1<f64>, Array1<f64>), String> {
        let mean_beta = fit
            .block_states
            .get(BinomialLocationScaleFamily::BLOCK_T)
            .ok_or_else(|| "missing Binomial threshold block state".to_string())?
            .beta
            .clone();
        let noise_beta = fit
            .block_states
            .get(BinomialLocationScaleFamily::BLOCK_LOG_SIGMA)
            .ok_or_else(|| "missing Binomial log_sigma block state".to_string())?
            .beta
            .clone();
        Ok((mean_beta, noise_beta))
    }

    fn build_psiderivative_blocks(
        &self,
        data: ndarray::ArrayView2<'_, f64>,
        meanspec_resolved: &TermCollectionSpec,
        noisespec_resolved: &TermCollectionSpec,
        mean_design: &TermCollectionDesign,
        noise_design: &TermCollectionDesign,
    ) -> Result<Vec<Vec<CustomFamilyBlockPsiDerivative>>, String> {
        let mean_derivs =
            build_block_spatial_psi_derivatives(data, meanspec_resolved, mean_design)?
                .ok_or_else(|| "missing threshold spatial psi derivatives".to_string())?;
        let noise_derivs =
            build_block_spatial_psi_derivatives(data, noisespec_resolved, noise_design)?
                .ok_or_else(|| "missing log_sigma spatial psi derivatives".to_string())?;
        Ok(vec![mean_derivs, noise_derivs])
    }
}

/// Owned inputs for the three-block (threshold, log-sigma, wiggle) Binomial
/// location-scale term fit.
///
/// `fit_binomial_location_scalewiggle_terms` fills this from a
/// `BinomialLocationScaleWiggleTermSpec`.
struct BinomialLocationScaleWiggleTermBuilder {
    /// Passed to the block builder and cloned into the family's `y`
    /// (presumably the response vector — confirm against the spec type).
    y: Array1<f64>,
    /// Passed to the block builder, the identified log-sigma design helper,
    /// and cloned into the family's `weights`.
    weights: Array1<f64>,
    /// Passed to the block builder and cloned into the family's `link_kind`.
    link_kind: InverseLink,
    /// Term spec returned by `meanspec()`; filled from the spec's
    /// `thresholdspec`.
    meanspec: TermCollectionSpec,
    /// Term spec returned by `noisespec()`; filled from the spec's
    /// `log_sigmaspec`.
    noisespec: TermCollectionSpec,
    /// Offset forwarded for the threshold block.
    mean_offset: Array1<f64>,
    /// Offset forwarded for the log-sigma block.
    noise_offset: Array1<f64>,
    /// Cloned into the family's `wiggle_knots`.
    wiggle_knots: Array1<f64>,
    /// Copied into the family's `wiggle_degree`.
    wiggle_degree: usize,
    /// Source of the wiggle block's design, offset, penalties, initial
    /// coefficients and initial log-lambdas.
    wiggle_block: ParameterBlockInput,
}

/// Builder hooks for the Binomial location-scale model with a link-wiggle
/// block. Block order is threshold, log-sigma, wiggle.
impl LocationScaleFamilyBuilder for BinomialLocationScaleWiggleTermBuilder {
    type Family = BinomialLocationScaleWiggleFamily;

    /// Term spec of the threshold block.
    fn meanspec(&self) -> &TermCollectionSpec {
        &self.meanspec
    }

    /// Term spec of the log-sigma block.
    fn noisespec(&self) -> &TermCollectionSpec {
        &self.noisespec
    }

    fn exact_spatial_joint_supported(&self) -> bool {
        true
    }

    fn require_exact_spatial_joint(&self) -> bool {
        true
    }

    /// Initial log-lambdas contributed by the wiggle block.
    fn extra_rho0(&self) -> Result<Array1<f64>, String> {
        initial_log_lambdas_orzeros(&self.wiggle_block)
    }

    /// Penalties listed on `noise_design` plus one extra slot.
    fn noise_penalty_count(&self, noise_design: &TermCollectionDesign) -> usize {
        let listed_penalties = noise_design.penalties.len();
        listed_penalties + 1
    }

    /// Builds the `[threshold, log-sigma, wiggle]` parameter block specs for
    /// the smoothing parameters in `theta`.
    ///
    /// # Errors
    /// Fails when `theta` has the wrong length for the wiggle layout, or when
    /// construction of any block fails.
    fn build_blocks(
        &self,
        theta: &Array1<f64>,
        mean_design: &TermCollectionDesign,
        noise_design: &TermCollectionDesign,
        mean_beta_hint: Option<Array1<f64>>,
        noise_beta_hint: Option<Array1<f64>>,
    ) -> Result<Vec<ParameterBlockSpec>, String> {
        let wiggle = &self.wiggle_block;
        let threshold_penalties = mean_design.penalties.len();
        let noise_penalties = self.noise_penalty_count(noise_design);
        let lambda_layout = GamlssLambdaLayout::withwiggle(
            threshold_penalties,
            noise_penalties,
            wiggle.penalties.len(),
        );
        lambda_layout.validate_theta_len(theta.len(), "wiggle location-scale")?;

        let (threshold_block, log_sigma_block) = build_binomial_threshold_and_scale_blocks(
            &self.y,
            &self.weights,
            &self.link_kind,
            mean_design,
            noise_design,
            &self.mean_offset,
            &self.noise_offset,
            lambda_layout.mean_from(theta),
            lambda_layout.noise_from(theta),
            mean_beta_hint,
            noise_beta_hint,
            "BinomialLocationScaleWiggle::build_blocks",
        )?;

        // The wiggle block is sized against the row count of the threshold block.
        let row_count = threshold_block.design.nrows();
        let wiggle_spec = build_location_scale_wiggle_block(
            "wiggle",
            wiggle.design.clone(),
            wiggle.offset.clone(),
            wiggle_block_penalty_matrices(wiggle),
            // NOTE(review): the Gaussian wiggle builder forwards
            // `wiggle_block.nullspace_dims.clone()` in this position, while
            // this path passes an empty list — confirm the difference is
            // intentional.
            vec![],
            lambda_layout.wiggle_from(theta),
            wiggle.initial_beta.clone(),
            row_count,
        )?;
        Ok(vec![threshold_block, log_sigma_block, wiggle_spec])
    }

    /// Assembles the wiggle family object for the given realized designs.
    ///
    /// # Panics
    /// Panics if `identified_binomial_log_sigma_design` does not yield a
    /// design.
    fn build_family(
        &self,
        mean_design: &TermCollectionDesign,
        noise_design: &TermCollectionDesign,
    ) -> Self::Family {
        let log_sigma_design =
            identified_binomial_log_sigma_design(mean_design, noise_design, &self.weights)
                .expect("identified binomial log-sigma design should match block construction");
        let threshold_design = mean_design.design.clone();
        BinomialLocationScaleWiggleFamily {
            y: self.y.clone(),
            weights: self.weights.clone(),
            link_kind: self.link_kind.clone(),
            threshold_design: Some(threshold_design),
            log_sigma_design: Some(log_sigma_design),
            wiggle_knots: self.wiggle_knots.clone(),
            wiggle_degree: self.wiggle_degree,
            policy: crate::resource::ResourcePolicy::default_library(),
        }
    }

    /// Returns clones of the fitted `(threshold, log-sigma)` coefficient
    /// vectors; the wiggle coefficients are not part of the returned pair.
    ///
    /// # Errors
    /// Fails when either block state is absent from `fit.block_states`.
    fn extract_primary_betas(
        &self,
        fit: &UnifiedFitResult,
    ) -> Result<(Array1<f64>, Array1<f64>), String> {
        let Some(threshold_state) = fit
            .block_states
            .get(BinomialLocationScaleWiggleFamily::BLOCK_T)
        else {
            return Err("missing Binomial wiggle threshold block state".to_string());
        };
        let Some(log_sigma_state) = fit
            .block_states
            .get(BinomialLocationScaleWiggleFamily::BLOCK_LOG_SIGMA)
        else {
            return Err("missing Binomial wiggle log_sigma block state".to_string());
        };
        Ok((threshold_state.beta.clone(), log_sigma_state.beta.clone()))
    }

    /// Builds the spatial psi derivative payloads for the threshold and
    /// log-sigma blocks; the third (wiggle) entry is always an empty list.
    ///
    /// # Errors
    /// Fails when the derivative builder errors or returns `None` for the
    /// threshold or log-sigma block.
    fn build_psiderivative_blocks(
        &self,
        data: ndarray::ArrayView2<'_, f64>,
        meanspec_resolved: &TermCollectionSpec,
        noisespec_resolved: &TermCollectionSpec,
        mean_design: &TermCollectionDesign,
        noise_design: &TermCollectionDesign,
    ) -> Result<Vec<Vec<CustomFamilyBlockPsiDerivative>>, String> {
        let Some(threshold_derivs) =
            build_block_spatial_psi_derivatives(data, meanspec_resolved, mean_design)?
        else {
            return Err("missing threshold spatial psi derivatives".to_string());
        };
        let Some(log_sigma_derivs) =
            build_block_spatial_psi_derivatives(data, noisespec_resolved, noise_design)?
        else {
            return Err("missing log_sigma spatial psi derivatives".to_string());
        };
        // Why the wiggle slot is empty: in the term builder the wiggle block
        // owns no spatial design matrix. Spatial psi reaches the wiggle family
        // only via the realized threshold/log-sigma designs, which perturb q0
        // and therefore the realized wiggle basis B(q0). The exact joint
        // wiggle psi hooks take the threshold/log-sigma derivative payloads
        // and rebuild the full flattened likelihood-side [rho, psi] calculus
        // internally, so no direct CustomFamilyBlockPsiDerivative entries are
        // contributed for the wiggle block on purpose.
        let wiggle_derivs = Vec::new();
        Ok(vec![threshold_derivs, log_sigma_derivs, wiggle_derivs])
    }
}

pub(crate) fn fit_gaussian_location_scale_terms(
    data: ndarray::ArrayView2<'_, f64>,
    spec: GaussianLocationScaleTermSpec,
    options: &BlockwiseFitOptions,
    kappa_options: &SpatialLengthScaleOptimizationOptions,
) -> Result<BlockwiseTermFitResult, String> {
    validate_gaussian_location_scale_termspec(data, &spec, "fit_gaussian_location_scale_terms")?;
    fit_location_scale_terms(
        data,
        GaussianLocationScaleTermBuilder {
            y: spec.y,
            weights: spec.weights,
            meanspec: spec.meanspec,
            noisespec: spec.log_sigmaspec,
            mean_offset: spec.mean_offset,
            noise_offset: spec.log_sigma_offset,
        },
        options,
        kappa_options,
    )
}

pub(crate) fn fit_gaussian_location_scalewiggle_terms(
    data: ndarray::ArrayView2<'_, f64>,
    spec: GaussianLocationScaleWiggleTermSpec,
    options: &BlockwiseFitOptions,
    kappa_options: &SpatialLengthScaleOptimizationOptions,
) -> Result<BlockwiseTermFitResult, String> {
    validate_gaussian_location_scalewiggle_termspec(
        data,
        &spec,
        "fit_gaussian_location_scalewiggle_terms",
    )?;
    fit_location_scale_terms(
        data,
        GaussianLocationScaleWiggleTermBuilder {
            y: spec.y,
            weights: spec.weights,
            meanspec: spec.meanspec,
            noisespec: spec.log_sigmaspec,
            mean_offset: spec.mean_offset,
            noise_offset: spec.log_sigma_offset,
            wiggle_knots: spec.wiggle_knots,
            wiggle_degree: spec.wiggle_degree,
            wiggle_block: spec.wiggle_block,
        },
        options,
        kappa_options,
    )
}

/// Selects a wiggle basis for the Gaussian location-scale model, seeded with
/// the `eta` of the pilot fit's first block state.
///
/// # Errors
/// Fails when the pilot fit has no block states, or when basis selection
/// fails.
pub(crate) fn select_gaussian_location_scale_link_wiggle_basis_from_pilot(
    pilot: &BlockwiseTermFitResult,
    wiggle_cfg: &WiggleBlockConfig,
    wiggle_penalty_orders: &[usize],
) -> Result<SelectedWiggleBasis, String> {
    let Some(mean_state) = pilot.fit.block_states.first() else {
        return Err("pilot Gaussian wiggle fit is missing mean block".to_string());
    };
    let seed = mean_state.eta.view();
    select_wiggle_basis_from_seed(seed, wiggle_cfg, wiggle_penalty_orders)
}

/// Fits the Gaussian location-scale wiggle model using an already selected
/// wiggle basis, and packages the fit together with that basis' knots and
/// degree.
///
/// # Errors
/// Propagates any error from the wiggle fit or from assembling the result.
pub(crate) fn fit_gaussian_location_scale_terms_with_selected_wiggle(
    data: ndarray::ArrayView2<'_, f64>,
    spec: GaussianLocationScaleTermSpec,
    selected_wiggle_basis: SelectedWiggleBasis,
    options: &BlockwiseFitOptions,
    kappa_options: &SpatialLengthScaleOptimizationOptions,
) -> Result<BlockwiseTermWiggleFitResult, String> {
    let SelectedWiggleBasis {
        knots,
        degree,
        block,
        ..
    } = selected_wiggle_basis;

    // The knots are needed twice: once by the fit (a clone) and once by the
    // returned result (the original).
    let wiggle_spec = GaussianLocationScaleWiggleTermSpec {
        y: spec.y,
        weights: spec.weights,
        meanspec: spec.meanspec,
        log_sigmaspec: spec.log_sigmaspec,
        mean_offset: spec.mean_offset,
        log_sigma_offset: spec.log_sigma_offset,
        wiggle_knots: knots.clone(),
        wiggle_degree: degree,
        wiggle_block: block,
    };
    let fit = fit_gaussian_location_scalewiggle_terms(data, wiggle_spec, options, kappa_options)?;

    BlockwiseTermWiggleFitResult::try_from_parts(BlockwiseTermWiggleFitResultParts {
        fit,
        wiggle_knots: knots,
        wiggle_degree: degree,
    })
}

pub(crate) fn fit_binomial_location_scale_terms(
    data: ndarray::ArrayView2<'_, f64>,
    spec: BinomialLocationScaleTermSpec,
    options: &BlockwiseFitOptions,
    kappa_options: &SpatialLengthScaleOptimizationOptions,
) -> Result<BlockwiseTermFitResult, String> {
    validate_binomial_location_scale_termspec(data, &spec, "fit_binomial_location_scale_terms")?;
    fit_location_scale_terms(
        data,
        BinomialLocationScaleTermBuilder {
            y: spec.y,
            weights: spec.weights,
            link_kind: spec.link_kind,
            meanspec: spec.thresholdspec,
            noisespec: spec.log_sigmaspec,
            mean_offset: spec.threshold_offset,
            noise_offset: spec.log_sigma_offset,
        },
        options,
        kappa_options,
    )
}

pub(crate) fn fit_binomial_location_scalewiggle_terms(
    data: ndarray::ArrayView2<'_, f64>,
    spec: BinomialLocationScaleWiggleTermSpec,
    options: &BlockwiseFitOptions,
    kappa_options: &SpatialLengthScaleOptimizationOptions,
) -> Result<BlockwiseTermFitResult, String> {
    validate_binomial_location_scalewiggle_termspec(
        data,
        &spec,
        "fit_binomial_location_scalewiggle_terms",
    )?;
    fit_location_scale_terms(
        data,
        BinomialLocationScaleWiggleTermBuilder {
            y: spec.y,
            weights: spec.weights,
            link_kind: spec.link_kind,
            meanspec: spec.thresholdspec,
            noisespec: spec.log_sigmaspec,
            mean_offset: spec.threshold_offset,
            noise_offset: spec.log_sigma_offset,
            wiggle_knots: spec.wiggle_knots,
            wiggle_degree: spec.wiggle_degree,
            wiggle_block: spec.wiggle_block,
        },
        options,
        kappa_options,
    )
}

pub(crate) fn select_binomial_location_scale_link_wiggle_basis_from_pilot(
    pilot: &BlockwiseTermFitResult,
    wiggle_cfg: &WiggleBlockConfig,
    wiggle_penalty_orders: &[usize],
) -> Result<SelectedWiggleBasis, String> {
    let eta_t = pilot
        .fit
        .block_states
        .first()
        .ok_or_else(|| "pilot fit is missing threshold block".to_string())?
        .eta
        .view();
    let eta_ls = pilot
        .fit
        .block_states
        .get(1)
        .ok_or_else(|| "pilot fit is missing log_sigma block".to_string())?
        .eta
        .view();
    let sigma = eta_ls.mapv(safe_exp);
    let q_seed = Array1::from_iter(eta_t.iter().zip(sigma.iter()).map(|(&t, &s)| -t / s));
    select_wiggle_basis_from_seed(q_seed.view(), wiggle_cfg, wiggle_penalty_orders)
}

/// Fits the Binomial location-scale wiggle model using an already selected
/// wiggle basis, and packages the fit together with that basis' knots and
/// degree.
///
/// # Errors
/// Propagates any error from the wiggle fit or from assembling the result.
pub(crate) fn fit_binomial_location_scale_terms_with_selected_wiggle(
    data: ndarray::ArrayView2<'_, f64>,
    spec: BinomialLocationScaleTermSpec,
    selected_wiggle_basis: SelectedWiggleBasis,
    options: &BlockwiseFitOptions,
    kappa_options: &SpatialLengthScaleOptimizationOptions,
) -> Result<BlockwiseTermWiggleFitResult, String> {
    let SelectedWiggleBasis {
        knots,
        degree,
        block,
        ..
    } = selected_wiggle_basis;

    // The knots are needed twice: once by the fit (a clone) and once by the
    // returned result (the original).
    let wiggle_spec = BinomialLocationScaleWiggleTermSpec {
        y: spec.y,
        weights: spec.weights,
        link_kind: spec.link_kind,
        thresholdspec: spec.thresholdspec,
        log_sigmaspec: spec.log_sigmaspec,
        threshold_offset: spec.threshold_offset,
        log_sigma_offset: spec.log_sigma_offset,
        wiggle_knots: knots.clone(),
        wiggle_degree: degree,
        wiggle_block: block,
    };
    let fit = fit_binomial_location_scalewiggle_terms(data, wiggle_spec, options, kappa_options)?;

    BlockwiseTermWiggleFitResult::try_from_parts(BlockwiseTermWiggleFitResultParts {
        fit,
        wiggle_knots: knots,
        wiggle_degree: degree,
    })
}

/// Selects a link-wiggle basis for the binomial mean model, seeding the
/// selection with the pilot linear predictor `design * beta`.
///
/// # Errors
///
/// Propagates any failure from `select_wiggle_basis_from_seed`.
pub(crate) fn select_binomial_mean_link_wiggle_basis_from_pilot(
    pilot_design: &TermCollectionDesign,
    pilot_fit: &UnifiedFitResult,
    wiggle_cfg: &WiggleBlockConfig,
    wiggle_penalty_orders: &[usize],
) -> Result<SelectedWiggleBasis, String> {
    let pilot_linear_predictor = pilot_design.design.dot(&pilot_fit.beta);
    select_wiggle_basis_from_seed(
        pilot_linear_predictor.view(),
        wiggle_cfg,
        wiggle_penalty_orders,
    )
}

/// Fits the binomial mean-wiggle model on a term collection using an already
/// selected wiggle basis.
///
/// After validating `weights` and the response `y`, one of two paths is taken:
///
/// * `pilot_spec` has no spatial length-scale terms (as reported by
///   `spatial_length_scale_term_indices`): a single `fit_binomial_mean_wiggle`
///   call on `pilot_design`, warm-started from `pilot_fit` (its `beta`, and the
///   log of its lambdas after flooring at `WARMSTART_LOG_LAMBDA_FLOOR`).
/// * Otherwise: an outer optimization over `theta = [rho, log_kappa]`, where
///   the first `rho_dim` entries are log-lambdas (eta penalties first, wiggle
///   penalties after) and the tail holds the spatial log-kappa coordinates.
///   A baseline fit at the clamped log-kappa seed supplies the initial
///   log-lambdas and coefficients. Every outer evaluation rebuilds the term
///   design from the proposed log-kappa values. After the outer run converges
///   the design is rebuilt at the optimum, the spec is frozen against that
///   design, and a final `fit_binomial_mean_wiggle` produces the returned fit.
///
/// # Errors
///
/// Returns `Err` when validation fails, when any design build or fit fails,
/// when the baseline fit returns a log-lambda vector whose length is not
/// `rho_dim`, when the baseline fit lacks the eta or wiggle block, or when the
/// outer optimization fails or reports non-convergence.
pub(crate) fn fit_binomial_mean_wiggle_terms_with_selected_basis(
    data: ndarray::ArrayView2<'_, f64>,
    pilot_spec: &TermCollectionSpec,
    pilot_design: &TermCollectionDesign,
    pilot_fit: &UnifiedFitResult,
    y: &Array1<f64>,
    weights: &Array1<f64>,
    link_kind: InverseLink,
    selected_wiggle_basis: SelectedWiggleBasis,
    options: &BlockwiseFitOptions,
    kappa_options: &SpatialLengthScaleOptimizationOptions,
) -> Result<BinomialMeanWiggleTermFitResult, String> {
    // Symmetric box bound used for the log-lambda (rho) coordinates of the
    // outer problem; the log-kappa tail gets data-driven bounds instead.
    const RHO_BOUND: f64 = 12.0;

    validate_term_weights(
        data,
        y.len(),
        weights,
        "fit_binomial_mean_wiggle_terms_with_selected_basis",
    )?;
    validate_binomial_response(y, "fit_binomial_mean_wiggle_terms_with_selected_basis")?;

    // Large-n binomial mean-wiggle fits keep the caller's explicit Hessian
    // request. The unified evaluator chooses the scalable exact representation
    // (dense for small work, operator HVP for large work) instead of routing to
    // gradient-only BFGS by observation count.

    let SelectedWiggleBasis {
        knots: wiggle_knots,
        degree: wiggle_degree,
        block: wiggle_block,
        ..
    } = selected_wiggle_basis;

    let spatial_terms = spatial_length_scale_term_indices(pilot_spec);
    // Without spatial length-scale terms there are no log-kappa coordinates to
    // optimize, so a single fit on the pilot design is returned directly.
    if spatial_terms.is_empty() {
        let fit = fit_binomial_mean_wiggle(
            BinomialMeanWiggleSpec {
                y: y.clone(),
                weights: weights.clone(),
                link_kind,
                wiggle_knots: wiggle_knots.clone(),
                wiggle_degree,
                eta_block: ParameterBlockInput {
                    design: pilot_design.design.clone(),
                    offset: Array1::zeros(y.len()),
                    penalties: pilot_design
                        .penalties
                        .iter()
                        .map(crate::solver::estimate::PenaltySpec::from_blockwise_ref)
                        .collect(),
                    nullspace_dims: vec![],
                    initial_log_lambdas: Some(
                        pilot_fit
                            .lambdas
                            .mapv(|v| v.max(WARMSTART_LOG_LAMBDA_FLOOR).ln()),
                    ),
                    initial_beta: Some(pilot_fit.beta.clone()),
                },
                wiggle_block,
            },
            options,
        )?;
        return Ok(BinomialMeanWiggleTermFitResult {
            fit,
            resolvedspec: pilot_spec.clone(),
            design: pilot_design.clone(),
            wiggle_knots,
            wiggle_degree,
        });
    }

    let dims_per_term = spatial_dims_per_term(pilot_spec, &spatial_terms);
    let log_kappa0 =
        SpatialLogKappaCoords::from_length_scales_aniso(pilot_spec, &spatial_terms, kappa_options)
            .reseed_from_data(data, pilot_spec, &spatial_terms, kappa_options);
    let log_kappa_lower = SpatialLogKappaCoords::lower_bounds_aniso_from_data(
        data,
        pilot_spec,
        &spatial_terms,
        &dims_per_term,
        kappa_options,
    );
    let log_kappa_upper = SpatialLogKappaCoords::upper_bounds_aniso_from_data(
        data,
        pilot_spec,
        &spatial_terms,
        &dims_per_term,
        kappa_options,
    );
    // Project seed onto bounds; spec.length_scale is a hint, not a constraint.
    let log_kappa0 = log_kappa0.clamp_to_bounds(&log_kappa_lower, &log_kappa_upper);

    // rho = [eta log-lambdas | wiggle log-lambdas]; its length fixes where the
    // log-kappa tail of theta begins.
    let eta_penalty_count = pilot_design.penalties.len();
    let wiggle_penalty_count = initial_log_lambdas_orzeros(&wiggle_block)?.len();
    let rho_dim = eta_penalty_count + wiggle_penalty_count;
    // Baseline fit at the clamped log-kappa seed; it provides the starting
    // log-lambdas and block coefficients for the outer optimization.
    let baseline_resolvedspec = log_kappa0
        .apply_tospec(pilot_spec, &spatial_terms)
        .map_err(|e| e.to_string())?;
    let baseline_design =
        build_term_collection_design(data, &baseline_resolvedspec).map_err(|e| e.to_string())?;
    let baseline_fit = fit_binomial_mean_wiggle(
        BinomialMeanWiggleSpec {
            y: y.clone(),
            weights: weights.clone(),
            link_kind: link_kind.clone(),
            wiggle_knots: wiggle_knots.clone(),
            wiggle_degree,
            eta_block: ParameterBlockInput {
                design: baseline_design.design.clone(),
                offset: Array1::zeros(y.len()),
                penalties: baseline_design
                    .penalties
                    .iter()
                    .map(crate::solver::estimate::PenaltySpec::from_blockwise_ref)
                    .collect(),
                nullspace_dims: vec![],
                initial_log_lambdas: Some(
                    pilot_fit
                        .lambdas
                        .mapv(|v| v.max(WARMSTART_LOG_LAMBDA_FLOOR).ln()),
                ),
                initial_beta: Some(pilot_fit.beta.clone()),
            },
            wiggle_block: wiggle_block.clone(),
        },
        options,
    )?;
    let baseline_log_lambdas = baseline_fit
        .lambdas
        .mapv(|v| v.max(WARMSTART_LOG_LAMBDA_FLOOR).ln());
    if baseline_log_lambdas.len() != rho_dim {
        return Err(GamlssError::DimensionMismatch {
            reason: format!(
                "baseline binomial mean-wiggle fit returned {} log-lambdas, expected {rho_dim}",
                baseline_log_lambdas.len()
            ),
        }
        .into());
    }
    let baseline_eta_beta = baseline_fit
        .block_states
        .get(BinomialMeanWiggleFamily::BLOCK_ETA)
        .ok_or_else(|| "baseline binomial mean-wiggle fit missing eta block".to_string())?
        .beta
        .clone();
    let baseline_wiggle_beta = Some(
        baseline_fit
            .block_states
            .get(BinomialMeanWiggleFamily::BLOCK_WIGGLE)
            .ok_or_else(|| "baseline binomial mean-wiggle fit missing wiggle block".to_string())?
            .beta
            .clone(),
    );
    // theta0 = [baseline log-lambdas | clamped log-kappa seed].
    let theta_dim = rho_dim + log_kappa0.len();
    let mut theta0 = Array1::<f64>::zeros(theta_dim);
    theta0
        .slice_mut(s![0..rho_dim])
        .assign(&baseline_log_lambdas);
    theta0
        .slice_mut(s![rho_dim..theta_dim])
        .assign(log_kappa0.as_array());

    // Start with +/-RHO_BOUND everywhere, then overwrite the log-kappa tail
    // with its data-derived bounds.
    let mut lower = Array1::<f64>::from_elem(theta_dim, -RHO_BOUND);
    let mut upper = Array1::<f64>::from_elem(theta_dim, RHO_BOUND);
    lower
        .slice_mut(s![rho_dim..theta_dim])
        .assign(log_kappa_lower.as_array());
    upper
        .slice_mut(s![rho_dim..theta_dim])
        .assign(log_kappa_upper.as_array());

    let pilot_spec_cloned = pilot_spec.clone();
    // Despite the name, `pilot_beta` holds the eta coefficients of the
    // baseline fit, not those of `pilot_fit`.
    let pilot_beta = baseline_eta_beta;
    let wiggle_design = wiggle_block.design.clone();
    let wiggle_offset = wiggle_block.offset.clone();
    let wiggle_penalties = wiggle_block.penalties.clone();
    let wiggle_initial_beta = baseline_wiggle_beta;
    let wiggle_knots_cloned = wiggle_knots.clone();
    let y_cloned = y.clone();
    let weights_cloned = weights.clone();
    let link_kind_cloned = link_kind.clone();
    let outer_family = BinomialMeanWiggleFamily {
        y: y_cloned.clone(),
        weights: weights_cloned.clone(),
        link_kind: link_kind_cloned.clone(),
        wiggle_knots: wiggle_knots_cloned.clone(),
        wiggle_degree,
        policy: crate::resource::ResourcePolicy::default_library(),
    };
    // The same atomic is shared between the fit options used for outer
    // evaluations and the outer problem's screening cap (set further below).
    let screening_cap = Arc::new(AtomicUsize::new(0));
    let mut outer_options = options.clone();
    outer_options.screening_max_inner_iterations = Some(Arc::clone(&screening_cap));
    // Mutable state threaded through the outer objective callbacks:
    // `warm_cache` is the most recent warm start, `last_eval` memoizes the
    // most recent (theta, cost, gradient, Hessian, warm start) evaluation.
    struct MeanWiggleOuterState {
        warm_cache: Option<crate::custom_family::CustomFamilyWarmStart>,
        last_eval: Option<(
            Array1<f64>,
            f64,
            Array1<f64>,
            crate::solver::outer_strategy::HessianResult,
            crate::custom_family::CustomFamilyWarmStart,
        )>,
    }

    // Rebuilds the resolved spec, term design, the two parameter blocks
    // (eta, wiggle) and the eta-block spatial psi derivatives for a given theta.
    let build_realized_blocks = |theta: &Array1<f64>| -> Result<
        (
            TermCollectionSpec,
            TermCollectionDesign,
            Vec<ParameterBlockSpec>,
            Vec<CustomFamilyBlockPsiDerivative>,
        ),
        String,
    > {
        let log_kappa =
            SpatialLogKappaCoords::from_theta_tail_with_dims(theta, rho_dim, dims_per_term.clone());
        let resolvedspec = log_kappa
            .apply_tospec(&pilot_spec_cloned, &spatial_terms)
            .map_err(|e| e.to_string())?;
        let design =
            build_term_collection_design(data, &resolvedspec).map_err(|e| e.to_string())?;
        let eta_derivs = build_block_spatial_psi_derivatives(data, &resolvedspec, &design)?
            .ok_or_else(|| {
                "missing eta spatial psi derivatives for binomial mean wiggle".to_string()
            })?;
        let blocks = vec![
            ParameterBlockSpec {
                name: "eta".to_string(),
                design: design.design.clone(),
                offset: Array1::zeros(y_cloned.len()),
                penalties: design.penalties_as_penalty_matrix(),
                nullspace_dims: vec![],
                initial_log_lambdas: theta.slice(s![0..eta_penalty_count]).to_owned(),
                initial_beta: Some(pilot_beta.clone()),
                // Lower gauge priority on the static eta design: it yields the
                // shared level/intercept direction to the dynamic full-width
                // wiggle I-spline block (see fit_binomial_mean_wiggle).
                gauge_priority: LINK_WIGGLE_GAUGE_PRIORITY,
                jacobian_callback: None,
                stacked_design: None,
                stacked_offset: None,
            },
            ParameterBlockSpec {
                name: "wiggle".to_string(),
                design: wiggle_design.clone(),
                offset: wiggle_offset.clone(),
                // Convert each wiggle `PenaltySpec` into the `PenaltyMatrix`
                // form used by `ParameterBlockSpec`.
                penalties: {
                    let p_wiggle = wiggle_design.ncols();
                    wiggle_penalties
                        .iter()
                        .map(|spec| match spec {
                            crate::solver::estimate::PenaltySpec::Block {
                                local,
                                col_range,
                                ..
                            } => PenaltyMatrix::Blockwise {
                                local: local.clone(),
                                col_range: col_range.clone(),
                                total_dim: p_wiggle,
                            },
                            crate::solver::estimate::PenaltySpec::Dense(m)
                            | crate::solver::estimate::PenaltySpec::DenseWithMean {
                                matrix: m,
                                ..
                            } => PenaltyMatrix::Dense(m.clone()),
                        })
                        .collect()
                },
                nullspace_dims: vec![],
                initial_log_lambdas: theta.slice(s![eta_penalty_count..rho_dim]).to_owned(),
                initial_beta: wiggle_initial_beta.clone(),
                gauge_priority: DEFAULT_GAUGE_PRIORITY,
                jacobian_callback: None,
                stacked_design: None,
                stacked_offset: None,
            },
        ];
        Ok((resolvedspec, design, blocks, eta_derivs))
    };

    // Evaluates the joint hyper-objective at theta; `need_hessian` selects
    // between the value+gradient and value+gradient+Hessian evaluation modes.
    let build_eval = |theta: &Array1<f64>,
                      warm_cache: Option<&crate::custom_family::CustomFamilyWarmStart>,
                      need_hessian: bool|
     -> Result<
        (
            crate::custom_family::CustomFamilyJointHyperResult,
            TermCollectionSpec,
            TermCollectionDesign,
        ),
        String,
    > {
        let (resolvedspec, design, blocks, eta_derivs) = build_realized_blocks(theta)?;
        let eval = evaluate_custom_family_joint_hyper(
            &outer_family,
            &blocks,
            &outer_options,
            &theta.slice(s![0..rho_dim]).to_owned(),
            // Only the eta block carries psi derivatives; the wiggle block has none.
            &[eta_derivs, Vec::new()],
            warm_cache,
            if need_hessian {
                crate::solver::estimate::reml::unified::EvalMode::ValueGradientHessian
            } else {
                crate::solver::estimate::reml::unified::EvalMode::ValueAndGradient
            },
        )?;
        Ok((eval, resolvedspec, design))
    };

    // EFS counterpart of `build_eval`, built on the same realized blocks.
    let build_efs = |theta: &Array1<f64>,
                     warm_cache: Option<&crate::custom_family::CustomFamilyWarmStart>|
     -> Result<crate::custom_family::CustomFamilyJointHyperEfsResult, String> {
        let (_, _, blocks, eta_derivs) = build_realized_blocks(theta)?;
        evaluate_custom_family_joint_hyper_efs(
            &outer_family,
            &blocks,
            &outer_options,
            &theta.slice(s![0..rho_dim]).to_owned(),
            &[eta_derivs, Vec::new()],
            warm_cache,
        )
        .map_err(|e| e.to_string())
    };

    use crate::estimate::EstimationError;
    use crate::solver::outer_strategy::{
        DeclaredHessianForm, Derivative, OuterEval, OuterEvalOrder,
    };

    // Exact first-order AND second-order [rho, psi] calculus is available
    // for all inverse links via the shared jet formulas plus the generic
    // exact-Newton D_βH / D²_βH closures routed through
    // evaluate_custom_family_joint_hyper -> joint_outer_evaluate ->
    // BorrowedJointDerivProvider. This enables the analytic-Hessian outer
    // plan for REML optimization instead of the downgraded gradient-only
    // outer strategies.
    //
    // Spatial log-kappa coordinates are ψ (design-moving) dimensions because
    // they rebuild the spatial basis and penalties at each outer proposal.
    let analytic_outer_hessian_available = true;
    // The heuristic seed exponentiates the rho head (log-lambda -> lambda);
    // the log-kappa tail is passed through unchanged.
    let mut seed_heuristic = theta0.to_vec();
    for value in &mut seed_heuristic[..rho_dim] {
        *value = value.exp();
    }
    let problem = crate::solver::outer_strategy::OuterProblem::new(theta_dim)
        .with_gradient(Derivative::Analytic)
        .with_hessian(if analytic_outer_hessian_available {
            DeclaredHessianForm::Either
        } else {
            DeclaredHessianForm::Unavailable
        })
        .with_psi_dim(theta_dim - rho_dim)
        .with_tolerance(options.outer_tol)
        .with_max_iter(options.outer_max_iter)
        .with_bounds(lower.clone(), upper.clone())
        .with_initial_rho(theta0.clone())
        .with_seed_config(crate::seeding::SeedConfig {
            max_seeds: 4,
            seed_budget: 2,
            risk_profile: crate::seeding::SeedRiskProfile::GeneralizedLinear,
            num_auxiliary_trailing: theta_dim - rho_dim,
            ..Default::default()
        })
        .with_screening_cap(Arc::clone(&screening_cap))
        // NOTE(review): this literal duplicates RHO_BOUND declared at the top
        // of the function — keep the two in sync.
        .with_rho_bound(12.0)
        .with_heuristic_lambdas(seed_heuristic);

    let eval_outer = |state: &mut MeanWiggleOuterState,
                      theta: &Array1<f64>,
                      order: OuterEvalOrder|
     -> Result<OuterEval, EstimationError> {
        // Serve from the memoized evaluation when theta is unchanged, unless a
        // Hessian is requested and the cached one is neither Analytic nor
        // Operator.
        if let Some((cached_theta, cached_cost, cached_grad, cached_hess, cached_warm)) =
            &state.last_eval
            && cached_theta == theta
            && (!matches!(order, OuterEvalOrder::ValueGradientHessian)
                || matches!(
                    cached_hess,
                    crate::solver::outer_strategy::HessianResult::Analytic(_)
                        | crate::solver::outer_strategy::HessianResult::Operator(_)
                ))
        {
            state.warm_cache = Some(cached_warm.clone());
            return Ok(OuterEval {
                cost: *cached_cost,
                gradient: cached_grad.clone(),
                hessian: cached_hess.clone(),
                inner_beta_hint: None,
            });
        }
        let need_hessian = matches!(order, OuterEvalOrder::ValueGradientHessian)
            && analytic_outer_hessian_available;
        let (eval, _, _) = build_eval(theta, state.warm_cache.as_ref(), need_hessian)
            .map_err(EstimationError::InvalidInput)?;
        // On inner non-convergence the warm start is still stored, but the
        // evaluation itself is not memoized in `last_eval`.
        if !eval.inner_converged {
            state.warm_cache = Some(eval.warm_start);
            crate::bail_invalid_estim!(
                "binomial mean-wiggle exact spatial inner solve did not converge"
            );
        }
        let hessian_result = eval.outer_hessian.clone();
        state.last_eval = Some((
            theta.clone(),
            eval.objective,
            eval.gradient.clone(),
            eval.outer_hessian.clone(),
            eval.warm_start.clone(),
        ));
        state.warm_cache = Some(eval.warm_start);
        Ok(OuterEval {
            cost: eval.objective,
            gradient: eval.gradient,
            hessian: hessian_result,
            inner_beta_hint: None,
        })
    };

    // Callbacks, in argument order: cost only, default evaluation, evaluation
    // at an explicit order, state reset, and EFS evaluation.
    let mut obj = problem.build_objective_with_eval_order(
        MeanWiggleOuterState {
            warm_cache: None,
            last_eval: None,
        },
        |state: &mut MeanWiggleOuterState, theta: &Array1<f64>| {
            if let Some((cached_theta, cached_cost, _, _, cached_warm)) = &state.last_eval
                && cached_theta == theta
            {
                state.warm_cache = Some(cached_warm.clone());
                return Ok(*cached_cost);
            }
            let (eval, _, _) = build_eval(theta, state.warm_cache.as_ref(), false)
                .map_err(EstimationError::InvalidInput)?;
            if !eval.inner_converged {
                state.warm_cache = Some(eval.warm_start);
                crate::bail_invalid_estim!(
                    "binomial mean-wiggle exact spatial cost inner solve did not converge"
                        .to_string(),
                );
            }
            state.warm_cache = Some(eval.warm_start);
            Ok(eval.objective)
        },
        |state: &mut MeanWiggleOuterState, theta: &Array1<f64>| {
            eval_outer(
                state,
                theta,
                if analytic_outer_hessian_available {
                    OuterEvalOrder::ValueGradientHessian
                } else {
                    OuterEvalOrder::ValueAndGradient
                },
            )
        },
        |state: &mut MeanWiggleOuterState, theta: &Array1<f64>, order: OuterEvalOrder| {
            eval_outer(state, theta, order)
        },
        Some(|state: &mut MeanWiggleOuterState| {
            state.warm_cache = None;
            state.last_eval = None;
        }),
        Some(|state: &mut MeanWiggleOuterState, theta: &Array1<f64>| {
            let eval = build_efs(theta, state.warm_cache.as_ref())
                .map_err(EstimationError::InvalidInput)?;
            if !eval.inner_converged {
                state.warm_cache = Some(eval.warm_start);
                crate::bail_invalid_estim!(
                    "binomial mean-wiggle exact spatial EFS inner solve did not converge"
                        .to_string(),
                );
            }
            state.warm_cache = Some(eval.warm_start);
            Ok(eval.efs_eval)
        }),
    );

    let outer = problem
        .run(&mut obj, "binomial mean wiggle exact spatial hyper")
        .map_err(|e| e.to_string())?;
    if !outer.converged {
        return Err(GamlssError::NumericalFailure { reason: format!(
            "binomial mean wiggle exact spatial hyper did not converge after {} iterations (final_objective={:.6e}, final_grad_norm={})",
            outer.iterations,
            outer.final_value,
            outer.final_grad_norm_report(),
        ) }.into());
    }
    let theta_star = outer.rho;

    // Rebuild the design at the optimum, freeze the spec against it, and run
    // the final fit warm-started from the optimal log-lambdas and the
    // baseline coefficients.
    let log_kappa =
        SpatialLogKappaCoords::from_theta_tail_with_dims(&theta_star, rho_dim, dims_per_term);
    let resolvedspec = log_kappa
        .apply_tospec(&pilot_spec_cloned, &spatial_terms)
        .map_err(|e| e.to_string())?;
    let design = build_term_collection_design(data, &resolvedspec).map_err(|e| e.to_string())?;
    let resolvedspec =
        freeze_term_collection_from_design(&resolvedspec, &design).map_err(|e| e.to_string())?;
    let fit = fit_binomial_mean_wiggle(
        BinomialMeanWiggleSpec {
            y: y_cloned,
            weights: weights_cloned,
            link_kind: link_kind_cloned,
            wiggle_knots: wiggle_knots.clone(),
            wiggle_degree,
            eta_block: ParameterBlockInput {
                design: design.design.clone(),
                offset: Array1::zeros(y.len()),
                penalties: design
                    .penalties
                    .iter()
                    .map(crate::solver::estimate::PenaltySpec::from_blockwise_ref)
                    .collect(),
                nullspace_dims: vec![],
                initial_log_lambdas: Some(theta_star.slice(s![0..eta_penalty_count]).to_owned()),
                initial_beta: Some(pilot_beta),
            },
            wiggle_block: ParameterBlockInput {
                design: wiggle_design,
                offset: wiggle_offset,
                penalties: wiggle_penalties,
                nullspace_dims: vec![],
                initial_log_lambdas: Some(
                    theta_star.slice(s![eta_penalty_count..rho_dim]).to_owned(),
                ),
                initial_beta: wiggle_initial_beta,
            },
        },
        options,
    )?;

    Ok(BinomialMeanWiggleTermFitResult {
        fit,
        resolvedspec,
        design,
        wiggle_knots,
        wiggle_degree,
    })
}

/// Link identifiers for distribution parameters in multi-parameter GAMLSS families.
///
/// The enum is a plain tag (`Copy`, comparable with `==`); it carries no
/// link configuration itself.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum ParameterLink {
    /// Identity link.
    Identity,
    /// Log link.
    Log,
    /// Logit link.
    Logit,
    /// Probit link.
    Probit,
    /// NOTE(review): presumably marks a parameter whose link is described by
    /// an `InverseLink` value held elsewhere — confirm against the users of
    /// this variant.
    InverseLink,
    /// Learnable smooth departure from a known base link.
    Wiggle,
}

/// Vectorized per-observation quantities for the binomial location-scale
/// model, plus the total log-likelihood.
///
/// NOTE(review): the function that fills this struct is not fully visible
/// from here; the field descriptions below are inferred from the field names
/// and from the scalar counterpart `BinomialLocationScaleRow` — confirm
/// against `binomial_location_scale_core`.
struct BinomialLocationScaleCore {
    // Presumably sigma per observation.
    sigma: Array1<f64>,
    // Presumably d sigma / d eta_log_sigma per observation.
    dsigma_deta: Array1<f64>,
    // Presumably the non-wiggle argument -eta_t / sigma per observation.
    q0: Array1<f64>,
    // Presumably the inverse-link mean and its first three q-derivatives.
    mu: Array1<f64>,
    dmu_dq: Array1<f64>,
    d2mu_dq2: Array1<f64>,
    d3mu_dq3: Array1<f64>,
    // Scalar log-likelihood (presumably summed over observations).
    log_likelihood: f64,
}

/// Partial derivatives of the non-wiggle location-scale argument `q` with
/// respect to the threshold predictor (`t`) and the log-sigma predictor
/// (`ls` / `l`), as produced by `nonwiggle_q_derivs`.
#[derive(Clone, Copy)]
struct NonWiggleQDerivs {
    // First derivative in eta_t; set to -1/sigma.
    q_t: f64,
    // First derivative in eta_ls; set to eta_t/sigma.
    q_ls: f64,
    // Mixed second derivative (eta_t, eta_ls); set to 1/sigma.
    q_tl: f64,
    // Second derivative in eta_ls; set to -eta_t/sigma.
    q_ll: f64,
    // eta_ls-derivative of q_tl; set to -1/sigma.
    q_tl_ls: f64,
    // eta_ls-derivative of q_ll; set to eta_t/sigma.
    q_ll_ls: f64,
}

/// Directional derivatives of `q` and of its eta-partials along a direction
/// `(d_eta_t, d_eta_ls)`, as produced by `nonwiggle_q_directional`.
#[derive(Clone, Copy)]
struct NonWiggleQDirectional {
    // q_t * d_eta_t + q_ls * d_eta_ls
    delta_q: f64,
    // q_tl * d_eta_ls
    delta_q_t: f64,
    // q_tl * d_eta_t + q_ll * d_eta_ls
    delta_q_ls: f64,
    // q_tl_ls * d_eta_ls
    delta_q_tl: f64,
    // q_tl_ls * d_eta_t + q_ll_ls * d_eta_ls
    delta_q_ll: f64,
}

/// Per-observation quantities of the binomial location-scale model, as built
/// by `binomial_location_scalerow`.
#[derive(Clone, Copy)]
struct BinomialLocationScaleRow {
    // sigma from `exp_sigma_jet1_scalar(eta_ls)`.
    sigma: f64,
    // First-derivative component (`d1`) of the same sigma jet.
    dsigma_deta: f64,
    // Non-wiggle argument -eta_t / sigma (the wiggle contribution is NOT included).
    q0: f64,
    // Inverse-link jet evaluated at q = q0 + etawiggle; mu is stored unclamped.
    inverse_link: crate::mixture_link::InverseLinkJet,
    // Weighted log-likelihood contribution of this observation.
    ll: f64,
}

/// Hessian coefficient of an objective `F(q)` of a scalar `q`, for a pair of
/// coordinates `(a, b)`.
///
/// With `m1 = F'(q)`, `m2 = F''(q)`, the chain rule gives
/// `H_ab = m2 * q_a * q_b + m1 * q_ab`.
#[inline]
fn hessian_coeff_fromobjective_q_terms(m1: f64, m2: f64, q_a: f64, q_b: f64, q_ab: f64) -> f64 {
    let outer_product_part = m2 * q_a * q_b;
    let curvature_part = m1 * q_ab;
    outer_product_part + curvature_part
}

/// First directional derivative of the Hessian coefficient
/// `H_ab = m2 * q_a * q_b + m1 * q_ab` of a scalar-`q` objective.
///
/// With `m_k = F^(k)(q)`, `dq` the directional derivative of `q`, and
/// `dq_a`, `dq_b`, `dq_ab` the directional derivatives of the corresponding
/// partials:
///
/// `dH_ab[u] = m3*dq*q_a*q_b + m2*(dq_a*q_b + q_a*dq_b + dq*q_ab) + m1*dq_ab`.
#[inline]
fn directionalhessian_coeff_fromobjective_q_terms(
    m1: f64,
    m2: f64,
    m3: f64,
    dq: f64,
    q_a: f64,
    q_b: f64,
    q_ab: f64,
    dq_a: f64,
    dq_b: f64,
    dq_ab: f64,
) -> f64 {
    let third_order_part = m3 * dq * q_a * q_b;
    // Product rule on q_a*q_b, plus the m1 -> m2 promotion of the q_ab term.
    let second_order_bracket = dq_a * q_b + q_a * dq_b + dq * q_ab;
    let first_order_part = m1 * dq_ab;
    third_order_part + m2 * second_order_bracket + first_order_part
}

/// Exact mixed second directional derivative `D²H_ab[u, v]` of the Hessian
/// coefficient of a scalar-`q` objective `F = -sum ell`.
///
/// Notation: `m_k = F^(k)(q)`, `A = q_a * q_b`, `B = q_ab`, so that
/// `H_ab = m2 * A + m1 * B`.
///
/// Differentiating once along `u`:
///
/// ```text
/// D_u H_ab = m3 * dq_u * A + m2 * (D_u A + dq_u * B) + m1 * D_u B
/// ```
///
/// and once more along `v`:
///
/// ```text
/// D²H_ab[u,v] =
///     m4 * dq_u*dqv*A
///   + m3 * (d2q_uv*A + dq_u*D_v A + dqv*D_u A + dq_u*dqv*B)
///   + m2 * (D²_uv A + d2q_uv*B + dq_u*dq_abv + dqv*dq_ab_u)
///   + m1 * d2q_ab_uv
/// ```
///
/// where
/// `D_u A = dq_a_u*q_b + q_a*dq_b_u`, `D_v A = dq_av*q_b + q_a*dq_bv`, and
/// `D²_uv A = d2q_a_uv*q_b + dq_a_u*dq_bv + dq_av*dq_b_u + q_a*d2q_b_uv`.
///
/// The `dq_u*dqv*B` term must appear exactly once. It comes solely from
/// `D_v[m2 * dq_u * B] = m3*dqv*dq_u*B + m2*(d2q_uv*B + dq_u*D_v B)`; an
/// earlier version of this helper counted it twice by folding `dqv * q_ab`
/// into both the `dq_u` and the `dqv` product-rule branches.
#[inline]
fn second_directionalhessian_coeff_fromobjective_q_terms(
    m1: f64,
    m2: f64,
    m3: f64,
    m4: f64,
    dq_u: f64,
    dqv: f64,
    d2q_uv: f64,
    q_a: f64,
    q_b: f64,
    q_ab: f64,
    dq_a_u: f64,
    dq_av: f64,
    dq_b_u: f64,
    dq_bv: f64,
    d2q_a_uv: f64,
    d2q_b_uv: f64,
    dq_ab_u: f64,
    dq_abv: f64,
    d2q_ab_uv: f64,
) -> f64 {
    // Directional derivatives of A = q_a * q_b.
    let da_u = dq_a_u * q_b + q_a * dq_b_u;
    let da_v = dq_av * q_b + q_a * dq_bv;
    let d2a_uv = d2q_a_uv * q_b + dq_a_u * dq_bv + dq_av * dq_b_u + q_a * d2q_b_uv;

    let fourth_order_part = m4 * dq_u * dqv * q_a * q_b;
    let third_order_bracket =
        d2q_uv * q_a * q_b + dq_u * da_v + dqv * da_u + dq_u * dqv * q_ab;
    let second_order_bracket = d2a_uv + d2q_uv * q_ab + dq_u * dq_abv + dqv * dq_ab_u;
    let first_order_part = m1 * d2q_ab_uv;

    fourth_order_part + m3 * third_order_bracket + m2 * second_order_bracket + first_order_part
}

/// Partials of the non-wiggle location-scale argument with respect to the
/// threshold and log-sigma predictors, expressed through `eta_t` and `sigma`.
///
/// Every entry is `±1/sigma` or `±eta_t/sigma`; the sign alternates with each
/// additional log-sigma derivative.
fn nonwiggle_q_derivs(eta_t: f64, sigma: f64) -> NonWiggleQDerivs {
    let reciprocal = sigma.recip();
    let scaled_threshold = eta_t * reciprocal;
    NonWiggleQDerivs {
        q_t: -reciprocal,
        q_ls: scaled_threshold,
        q_tl: reciprocal,
        q_ll: -eta_t * reciprocal,
        q_tl_ls: -reciprocal,
        q_ll_ls: scaled_threshold,
    }
}

/// Directional derivatives along (d_eta_t, d_eta_ls):
/// delta_q = q_t d_eta_t + q_ls d_eta_ls
/// delta_q_t = q_tl d_eta_ls
/// delta_q_ls = q_tl d_eta_t + q_ll d_eta_ls
/// delta_q_tt = 0
/// delta_q_tl = q_tl_ls d_eta_ls
/// delta_q_ll = q_tl_ls d_eta_t + q_ll_ls d_eta_ls
fn nonwiggle_q_directional(
    q: NonWiggleQDerivs,
    d_eta_t: f64,
    d_eta_ls: f64,
) -> NonWiggleQDirectional {
    // Directional-chain derivation:
    //
    // For any scalar f(eta_t,eta_ls), directional derivative along
    // d eta = (d_eta_t, d_eta_ls) is
    //   dot{f} = f_t d_eta_t + f_ls d_eta_ls.
    //
    // Apply to q and its eta-partials:
    //   dot{q}      = q_t d_eta_t + q_ls d_eta_ls.
    //   dot{q_t}    = q_tt d_eta_t + q_tl d_eta_ls = q_tl d_eta_ls (q_tt=0).
    //   dot{q_ls}   = q_tl d_eta_t + q_ll d_eta_ls.
    //   dot{q_tt}   = 0.
    //   dot{q_tl}   = q_tl_ls d_eta_ls.
    //   dot{q_ll}   = q_tl_ls d_eta_t + q_ll_ls d_eta_ls.
    NonWiggleQDirectional {
        delta_q: q.q_t * d_eta_t + q.q_ls * d_eta_ls,
        delta_q_t: q.q_tl * d_eta_ls,
        delta_q_ls: q.q_tl * d_eta_t + q.q_ll * d_eta_ls,
        delta_q_tl: q.q_tl_ls * d_eta_ls,
        delta_q_ll: q.q_tl_ls * d_eta_t + q.q_ll_ls * d_eta_ls,
    }
}

/// Computes `ln(1 - exp(-z))` for `z >= 0` without catastrophic cancellation.
///
/// Two regimes are used:
/// * `z <= ln 2`: `ln(-expm1(-z))`, accurate when `exp(-z)` is close to 1;
/// * `z >  ln 2`: `ln_1p(-exp(-z))`, accurate when `exp(-z)` is small. The
///   naive `ln(1 - exp(-z))` rounds `1 - exp(-z)` to exactly 1 once `exp(-z)`
///   drops below machine epsilon and would return 0 instead of roughly
///   `-exp(-z)`.
///
/// Returns `-inf` for `z == 0`.
///
/// # Panics
///
/// Panics if `z` is negative or NaN.
#[inline]
fn log1mexp_neg_positive(z: f64) -> f64 {
    assert!(z >= 0.0);
    if z == 0.0 {
        f64::NEG_INFINITY
    } else if z <= std::f64::consts::LN_2 {
        (-(-z).exp_m1()).ln()
    } else {
        // ln_1p keeps the tiny negative result that `(1 - x).ln()` would lose.
        (-(-z).exp()).ln_1p()
    }
}

/// Weighted Bernoulli log-likelihood `weight * (y ln(mu) + (1 - y) ln(1 - mu))`
/// evaluated directly from a probability `mu`.
///
/// * A zero `weight` short-circuits to `Ok(0.0)` without inspecting `mu`.
/// * `mu` must be finite and lie in `[0, 1]`.
/// * At the boundaries `mu == 0` / `mu == 1` the matching log term is taken as
///   0 when the response sits exactly on that boundary (avoiding `0 * -inf`),
///   and as `-inf` otherwise.
/// * `ln(1 - mu)` is evaluated as `ln_1p(-mu)` so that very small
///   probabilities keep their contribution instead of being rounded away by
///   the subtraction `1 - mu`.
///
/// # Errors
///
/// Returns an error when `mu` is not a valid probability or when the resulting
/// log-likelihood is not finite.
#[inline]
fn bernoulli_log_likelihood_from_probability(y: f64, weight: f64, mu: f64) -> Result<f64, String> {
    if weight == 0.0 {
        return Ok(0.0);
    }
    if !mu.is_finite() || !(0.0..=1.0).contains(&mu) {
        return Err(GamlssError::NumericalFailure {
            reason: format!(
                "binomial location-scale inverse link returned invalid probability {mu}"
            ),
        }
        .into());
    }
    let log_mu = if mu == 0.0 {
        if y == 0.0 { 0.0 } else { f64::NEG_INFINITY }
    } else {
        mu.ln()
    };
    let log_one_minus = if mu == 1.0 {
        if y == 1.0 { 0.0 } else { f64::NEG_INFINITY }
    } else {
        // ln_1p(-mu) == ln(1 - mu), but without cancellation for tiny mu.
        (-mu).ln_1p()
    };
    let ll = weight * (y * log_mu + (1.0 - y) * log_one_minus);
    if ll.is_finite() {
        Ok(ll)
    } else {
        Err(GamlssError::NonFinite {
            reason: format!(
                "binomial location-scale log likelihood is non-finite at y={y}, mu={mu}"
            ),
        }
        .into())
    }
}

/// Non-wiggle argument of the binomial location-scale link: the negated
/// threshold predictor divided by `sigma`.
#[inline]
fn binomial_location_scale_q0(eta_t: f64, sigma: f64) -> f64 {
    let negated_threshold = -eta_t;
    negated_threshold / sigma
}

/// Weighted log-likelihood contribution of one binomial observation at link
/// argument `q`.
///
/// * Zero `weight` short-circuits to `Ok(0.0)`.
/// * Probit, logit and cloglog are evaluated directly from `q` through
///   log-space formulas (`normal_logcdf`/`normal_logsf`, `stable_softplus`,
///   and `log1mexp_neg_positive`); `mu` is ignored on these branches.
/// * Every other inverse link falls back to
///   `bernoulli_log_likelihood_from_probability(y, weight, mu)`.
///
/// In the cloglog branch, with `z = exp(q)`: `ln p = ln(1 - exp(-z))` and
/// `ln(1 - p) = -z`. When `z` underflows to 0, `ln p` is replaced by `q`;
/// when `z` overflows to infinity, `ln p` is 0.
///
/// # Errors
///
/// The cloglog branch returns an error when the result is not finite. This
/// includes a NaN `q`, which is routed into that error path instead of
/// reaching the `z >= 0` assertion inside `log1mexp_neg_positive`. The
/// fallback branch propagates errors from
/// `bernoulli_log_likelihood_from_probability`.
#[inline]
fn binomial_location_scale_log_likelihood(
    y: f64,
    weight: f64,
    q: f64,
    link_kind: &InverseLink,
    mu: f64,
) -> Result<f64, String> {
    if weight == 0.0 {
        return Ok(0.0);
    }
    match link_kind {
        InverseLink::Standard(StandardLink::Probit) => {
            Ok(weight * (y * normal_logcdf(q) + (1.0_f64 - y) * normal_logsf(q)))
        }
        InverseLink::Standard(StandardLink::Logit) => Ok(weight
            * (-y * crate::linalg::utils::stable_softplus(-q)
                - (1.0_f64 - y) * crate::linalg::utils::stable_softplus(q))),
        InverseLink::Standard(StandardLink::CLogLog) => {
            let z = q.exp();
            let log_p = if z.is_nan() {
                // A NaN argument must surface as the non-finite error below,
                // not as a panic from the assertion in log1mexp_neg_positive.
                f64::NAN
            } else if z == 0.0 {
                q
            } else if z.is_infinite() {
                0.0
            } else {
                log1mexp_neg_positive(z)
            };
            let log_survival = -z;
            let ll = weight * (y * log_p + (1.0_f64 - y) * log_survival);
            if ll.is_finite() {
                Ok(ll)
            } else {
                Err(GamlssError::NonFinite { reason: format!(
                    "binomial cloglog location-scale log likelihood is non-finite at y={y}, q={q}"
                ) }.into())
            }
        }
        _ => bernoulli_log_likelihood_from_probability(y, weight, mu),
    }
}

/// Evaluates every per-observation quantity of the binomial location-scale
/// model: the sigma jet, the non-wiggle argument `q0`, the inverse-link jet at
/// `q = q0 + etawiggle`, and the weighted log-likelihood contribution.
///
/// # Errors
///
/// Fails when the inverse-link evaluation fails or when the log-likelihood
/// evaluation reports an error.
fn binomial_location_scalerow(
    y: f64,
    weight: f64,
    eta_t: f64,
    eta_ls: f64,
    etawiggle: f64,
    link_kind: &InverseLink,
) -> Result<BinomialLocationScaleRow, String> {
    let sigma_jet = exp_sigma_jet1_scalar(eta_ls);
    let q0 = binomial_location_scale_q0(eta_t, sigma_jet.sigma);
    let q = q0 + etawiggle;

    // The jet is kept exactly as returned: μ is stored RAW (unclamped), and
    // the inverse-link derivatives d1/d2/d3 are preserved because they carry
    // the legitimate gradient signal. The q-derivative tower built downstream
    // (binomial_neglog_q_derivatives_dispatch et al.) is the EXACT derivative
    // of the loss evaluated here, relying on the per-branch reciprocals in
    // `binomial_loglik_mu_derivatives` and the saturation guard in the
    // `*_from_jet` consumers. Flooring μ at MIN_PROB at this point would swap
    // every representable sub-MIN_PROB tail probability for a 1e-10
    // surrogate and corrupt the Fisher curvature throughout the saturated
    // tail (#948).
    let inverse_link = inverse_link_jet_for_inverse_link(link_kind, q)
        .map_err(|e| format!("location-scale inverse-link evaluation failed: {e}"))?;
    let ll = binomial_location_scale_log_likelihood(y, weight, q, link_kind, inverse_link.mu)?;

    Ok(BinomialLocationScaleRow {
        sigma: sigma_jet.sigma,
        dsigma_deta: sigma_jet.d1,
        q0,
        inverse_link,
        ll,
    })
}

/// Compute only the log-likelihood scalar for the binomial location-scale model.
/// This avoids allocating 7 n-vectors that `binomial_location_scale_core` would produce,
/// making backtracking line searches much cheaper at biobank scale.
fn binomial_location_scale_ll_only(
    y: &Array1<f64>,
    weights: &Array1<f64>,
    eta_t: &Array1<f64>,
    eta_ls: &Array1<f64>,
    etawiggle: Option<&Array1<f64>>,
    link_kind: &InverseLink,
) -> Result<f64, String> {
    let n = y.len();
    let y_slice = y.as_slice().expect("y must be contiguous");
    let w_slice = weights.as_slice().expect("weights must be contiguous");
    let et_slice = eta_t.as_slice().expect("eta_t must be contiguous");
    let el_slice = eta_ls.as_slice().expect("eta_ls must be contiguous");
    let ew_slice = etawiggle.map(|w| w.as_slice().expect("etawiggle must be contiguous"));
    (0..n)
        .into_par_iter()
        .try_fold(
            || 0.0_f64,
            |acc, i| -> Result<f64, String> {
                let SigmaJet1 { sigma, .. } = exp_sigma_jet1_scalar(el_slice[i]);
                let q0 = binomial_location_scale_q0(et_slice[i], sigma);
                let q = q0 + ew_slice.map_or(0.0, |w| w[i]);
                if matches!(link_kind, InverseLink::Standard(StandardLink::Probit)) {
                    return Ok(acc
                        + binomial_location_scale_log_likelihood(
                            y_slice[i], w_slice[i], q, link_kind, 0.5,
                        )?);
                }
                let jet = inverse_link_jet_for_inverse_link(link_kind, q)
                    .map_err(|e| format!("location-scale inverse-link evaluation failed: {e}"))?;
                Ok(acc
                    + binomial_location_scale_log_likelihood(
                        y_slice[i], w_slice[i], q, link_kind, jet.mu,
                    )?)
            },
        )
        .try_reduce(|| 0.0_f64, |a, b| Ok(a + b))
}

fn binomial_location_scale_core(
    y: &Array1<f64>,
    weights: &Array1<f64>,
    eta_t: &Array1<f64>,
    eta_ls: &Array1<f64>,
    etawiggle: Option<&Array1<f64>>,
    link_kind: &InverseLink,
) -> Result<BinomialLocationScaleCore, String> {
    let n = y.len();
    if weights.len() != n || eta_t.len() != n || eta_ls.len() != n {
        return Err(GamlssError::DimensionMismatch {
            reason: "binomial location-scale core size mismatch".to_string(),
        }
        .into());
    }
    if let Some(w) = etawiggle
        && w.len() != n
    {
        return Err(GamlssError::DimensionMismatch {
            reason: "binomial location-scale core wiggle size mismatch".to_string(),
        }
        .into());
    }

    // Parallel per-row probit/inverse-link evaluation. At biobank scale
    // (n = 320K) the sequential probit erfc loop was a major single-thread
    // hotspot called dozens of times per outer REML gradient evaluation.
    let y_slice = y.as_slice().expect("y must be contiguous");
    let w_slice = weights.as_slice().expect("weights must be contiguous");
    let et_slice = eta_t.as_slice().expect("eta_t must be contiguous");
    let el_slice = eta_ls.as_slice().expect("eta_ls must be contiguous");
    let ew_slice = etawiggle.map(|w| w.as_slice().expect("etawiggle must be contiguous"));

    // Write each row's seven scalar derivatives directly into preallocated
    // output buffers in parallel, reducing the per-row log-likelihood
    // alongside. The previous path collected a `Vec<BinomialLocationScaleRow>`
    // (8 scalar fields plus alignment) and then serially scattered into the
    // seven `Array1`s, which at biobank scale n=3e5 cost ~50 MB of transient
    // allocation and a single-threaded post-pass.
    let mut sigma = vec![0.0_f64; n];
    let mut dsigma_deta = vec![0.0_f64; n];
    let mut q0 = vec![0.0_f64; n];
    let mut mu = vec![0.0_f64; n];
    let mut dmu_dq = vec![0.0_f64; n];
    let mut d2mu_dq2 = vec![0.0_f64; n];
    let mut d3mu_dq3 = vec![0.0_f64; n];

    /// Wrapper to send raw pointers across threads for disjoint per-row writes.
    /// Each parallel iteration writes to a unique index `i`, and the caller
    /// ensures the pointers outlive the parallel region (see SAFETY: notes
    /// on each `unsafe` site below).
    #[derive(Clone, Copy)]
    struct SendPtr(*mut f64);
    // SAFETY: pointers are constructed from live writable buffers and used
    // only for disjoint per-row writes inside a bounded parallel region; the
    // owning `Vec`s outlive the region.
    unsafe impl Send for SendPtr {}
    // SAFETY: same live-buffer and disjoint-index invariants as `Send`; no
    // two threads write the same offset through any shared `SendPtr` value.
    unsafe impl Sync for SendPtr {}
    impl SendPtr {
        #[inline(always)]
        // SAFETY: `self.0` points to a live writable allocation with length
        // greater than `i`, and `i` is exclusively owned by the calling
        // parallel iteration.
        unsafe fn write(self, i: usize, v: f64) {
            // SAFETY: see `write`'s function-level note: `i` is in-bounds
            // and exclusively owned by this iteration.
            unsafe { *self.0.add(i) = v };
        }
    }

    let sigma_p = SendPtr(sigma.as_mut_ptr());
    let dsigma_p = SendPtr(dsigma_deta.as_mut_ptr());
    let q0_p = SendPtr(q0.as_mut_ptr());
    let mu_p = SendPtr(mu.as_mut_ptr());
    let dmu_p = SendPtr(dmu_dq.as_mut_ptr());
    let d2mu_p = SendPtr(d2mu_dq2.as_mut_ptr());
    let d3mu_p = SendPtr(d3mu_dq3.as_mut_ptr());

    let ll = (0..n)
        .into_par_iter()
        .map(move |i| {
            let row = binomial_location_scalerow(
                y_slice[i],
                w_slice[i],
                et_slice[i],
                el_slice[i],
                ew_slice.map_or(0.0, |w| w[i]),
                link_kind,
            )?;
            // SAFETY: `i` comes from `0..n`, so it is in-bounds for each
            // preallocated length-`n` buffer, and every index is produced once;
            // each pointer targets a distinct output buffer.
            unsafe {
                sigma_p.write(i, row.sigma);
                dsigma_p.write(i, row.dsigma_deta);
                q0_p.write(i, row.q0);
                mu_p.write(i, row.inverse_link.mu);
                dmu_p.write(i, row.inverse_link.d1);
                d2mu_p.write(i, row.inverse_link.d2);
                d3mu_p.write(i, row.inverse_link.d3);
            }
            Ok::<f64, String>(row.ll)
        })
        .try_reduce(|| 0.0_f64, |a, b| Ok(a + b))?;

    Ok(BinomialLocationScaleCore {
        sigma: Array1::from_vec(sigma),
        dsigma_deta: Array1::from_vec(dsigma_deta),
        q0: Array1::from_vec(q0),
        mu: Array1::from_vec(mu),
        dmu_dq: Array1::from_vec(dmu_dq),
        d2mu_dq2: Array1::from_vec(d2mu_dq2),
        d3mu_dq3: Array1::from_vec(d3mu_dq3),
        log_likelihood: ll,
    })
}

/// Row coefficients of the binomial location-scale joint directional
/// derivative `D_β H_L[u]`.
///
/// The returned triple `(c_tt, c_tl, c_ll)` defines the matrix
///
///   X_t^T diag(c_tt) X_t + X_t^T diag(c_tl) X_ls (+ symmetric)
///   + X_ls^T diag(c_ll) X_ls.
///
/// `d_eta_t = X_t · u_t` and `d_eta_ls = X_ls · u_ls` are the linear
/// predictor perturbations along the joint direction `u = (u_t, u_ls)`.
///
/// # Panics
/// Panics if any input or `core` array is not contiguous, or is shorter than
/// `y`.
fn binomial_location_scale_first_directional_coefficients(
    y: &Array1<f64>,
    weights: &Array1<f64>,
    core: &BinomialLocationScaleCore,
    d_eta_t: &Array1<f64>,
    d_eta_ls: &Array1<f64>,
    link_kind: &InverseLink,
) -> (Array1<f64>, Array1<f64>, Array1<f64>) {
    let row_count = y.len();
    let mut tt_out = vec![0.0_f64; row_count];
    let mut tl_out = vec![0.0_f64; row_count];
    let mut ll_out = vec![0.0_f64; row_count];

    // Borrow every per-row input as a plain slice once, outside the hot loop.
    let y_rows = y.as_slice().expect("y must be contiguous");
    let weight_rows = weights.as_slice().expect("weights must be contiguous");
    let q0_rows = core.q0.as_slice().expect("q0 must be contiguous");
    let sigma_rows = core.sigma.as_slice().expect("sigma must be contiguous");
    let dsigma_rows = core
        .dsigma_deta
        .as_slice()
        .expect("dsigma_deta must be contiguous");
    let mu_rows = core.mu.as_slice().expect("mu must be contiguous");
    let dmu_rows = core.dmu_dq.as_slice().expect("dmu_dq must be contiguous");
    let d2mu_rows = core
        .d2mu_dq2
        .as_slice()
        .expect("d2mu_dq2 must be contiguous");
    let d3mu_rows = core
        .d3mu_dq3
        .as_slice()
        .expect("d3mu_dq3 must be contiguous");
    let d_eta_t_rows = d_eta_t.as_slice().expect("d_eta_t must be contiguous");
    let d_eta_ls_rows = d_eta_ls.as_slice().expect("d_eta_ls must be contiguous");

    tt_out
        .par_iter_mut()
        .zip(tl_out.par_iter_mut())
        .zip(ll_out.par_iter_mut())
        .enumerate()
        .for_each(|(row, ((out_tt, out_tl), out_ll))| {
            let q = q0_rows[row];
            // 1/σ and κ = (dσ/dη_ls)/σ for this row.
            let inv_sigma = 1.0 / sigma_rows[row];
            let kappa = dsigma_rows[row] / sigma_rows[row];
            let (m1, m2, m3) = binomial_neglog_q_derivatives_dispatch(
                y_rows[row],
                weight_rows[row],
                q,
                mu_rows[row],
                dmu_rows[row],
                d2mu_rows[row],
                d3mu_rows[row],
                link_kind,
            );
            let loc_shift = d_eta_t_rows[row];
            let scale_shift = kappa * d_eta_ls_rows[row];
            // First-order change of q along the direction.
            let dq = -inv_sigma * loc_shift - q * scale_shift;
            *out_tt = inv_sigma * inv_sigma * (m3 * dq - 2.0 * m2 * scale_shift);
            *out_tl = kappa
                * inv_sigma
                * (q * m3 * dq + m2 * (2.0 * dq - q * scale_shift) - m1 * scale_shift);
            *out_ll = kappa * kappa * (m1 + 3.0 * q * m2 + q * q * m3) * dq;
        });

    let c_tt = Array1::from_vec(tt_out);
    let c_tl = Array1::from_vec(tl_out);
    let c_ll = Array1::from_vec(ll_out);
    (c_tt, c_tl, c_ll)
}

/// Pure row-coefficient builder for the binomial location-scale joint
/// second directional derivative `D²_β H_L[u, v]`. Returns
/// `(c_tt, c_tl, c_ll)` analogous to the first-order helper but built from
/// the four predictor perturbations `(d_eta_t_u, d_eta_ls_u, d_eta_t_v,
/// d_eta_ls_v)`.
fn binomial_location_scalesecond_directional_coefficients(
    y: &Array1<f64>,
    weights: &Array1<f64>,
    core: &BinomialLocationScaleCore,
    d_eta_t_u: &Array1<f64>,
    d_eta_ls_u: &Array1<f64>,
    d_eta_t_v: &Array1<f64>,
    d_eta_ls_v: &Array1<f64>,
    link_kind: &InverseLink,
) -> Result<(Array1<f64>, Array1<f64>, Array1<f64>), String> {
    use rayon::iter::{IntoParallelIterator, ParallelIterator};
    let n = y.len();
    // Per-row second-directional coefficient computation. m4 dispatch
    // can fail (Result), so collect a Result<Vec<(tt, tl, ll)>>.
    let triples: Result<Vec<(f64, f64, f64)>, String> = (0..n)
        .into_par_iter()
        .map(|i| -> Result<(f64, f64, f64), String> {
            let q = core.q0[i];
            let r = 1.0 / core.sigma[i];
            let (m1, m2, m3) = binomial_neglog_q_derivatives_dispatch(
                y[i],
                weights[i],
                q,
                core.mu[i],
                core.dmu_dq[i],
                core.d2mu_dq2[i],
                core.d3mu_dq3[i],
                link_kind,
            );
            let m4 = binomial_neglog_q_fourth_derivative_dispatch(
                y[i],
                weights[i],
                q,
                core.mu[i],
                core.dmu_dq[i],
                core.d2mu_dq2[i],
                core.d3mu_dq3[i],
                link_kind,
            )?;
            let s = core.dsigma_deta[i] / core.sigma[i];
            let a = d_eta_t_u[i];
            let b = s * d_eta_ls_u[i];
            let c = d_eta_t_v[i];
            let d = s * d_eta_ls_v[i];
            let du = -r * a - q * b;
            let dv = -r * c - q * d;
            let d2 = r * (a * d + b * c) + q * b * d;
            let tt =
                r * r * (m4 * du * dv + m3 * (d2 - 2.0 * d * du - 2.0 * b * dv) + 4.0 * m2 * b * d);
            let tl = s
                * r
                * (q * m4 * du * dv
                    + m3 * (q * d2 + 3.0 * du * dv - q * (d * du + b * dv))
                    + m2 * (q * b * d + 2.0 * d2 - 2.0 * (d * du + b * dv))
                    + m1 * b * d);
            let ll = s
                * s
                * (q * q * m4 * du * dv
                    + m3 * (q * q * d2 + 5.0 * q * du * dv)
                    + m2 * (3.0 * q * d2 + 4.0 * du * dv)
                    + m1 * d2);
            Ok((tt, tl, ll))
        })
        .collect();
    let triples = triples?;
    let mut coeff_tt = Array1::<f64>::zeros(n);
    let mut coeff_tl = Array1::<f64>::zeros(n);
    let mut coeff_ll = Array1::<f64>::zeros(n);
    for (i, (tt, tl, ll)) in triples.into_iter().enumerate() {
        coeff_tt[i] = tt;
        coeff_tl[i] = tl;
        coeff_ll[i] = ll;
    }
    Ok((coeff_tt, coeff_tl, coeff_ll))
}

/// Built-in Gaussian location-scale family:
/// - Block 0: location μ(·) with identity link
/// - Block 1: log-scale log σ(·) with log link
///
/// `Clone` is implemented by hand because the row-scalar cache sits behind
/// an `RwLock`.
pub struct GaussianLocationScaleFamily {
    /// Response vector.
    /// NOTE(review): presumably one entry per observation — confirm.
    pub y: Array1<f64>,
    /// Weight vector.
    /// NOTE(review): presumably per-observation prior weights aligned with
    /// `y` — confirm.
    pub weights: Array1<f64>,
    /// Optional design matrix.
    /// NOTE(review): presumably the design of the location block (block 0)
    /// — confirm.
    pub mu_design: Option<DesignMatrix>,
    /// Optional design matrix.
    /// NOTE(review): presumably the design of the log-scale block (block 1)
    /// — confirm.
    pub log_sigma_design: Option<DesignMatrix>,
    /// Resource policy threaded into PsiDesignMap construction (and any other
    /// per-call materialization decision) made during exact-Newton joint psi
    /// derivative evaluation. Defaults to `ResourcePolicy::default_library()`
    /// when the family is built without an explicit policy.
    pub policy: crate::resource::ResourcePolicy,
    /// Cached per-observation row scalars keyed by 6-element fingerprint
    /// (first, mid, last elements of both eta vectors).
    /// Avoids recomputing O(n) scalars K+ times per REML gradient/Hessian evaluation.
    pub cached_row_scalars:
        std::sync::RwLock<Option<(f64, f64, f64, f64, f64, f64, Arc<GaussianJointRowScalars>)>>,
}

impl Clone for GaussianLocationScaleFamily {
    /// Clone every field and copy the current row-scalar cache entry into a
    /// fresh `RwLock` owned by the clone.
    ///
    /// # Panics
    /// Panics if the source cache lock is poisoned.
    fn clone(&self) -> Self {
        // Snapshot the cache first; the read guard is released at the end of
        // this block, before the new value is assembled.
        let cache_snapshot = {
            let guard = self.cached_row_scalars.read().expect("lock poisoned");
            (*guard).clone()
        };
        let y = self.y.clone();
        let weights = self.weights.clone();
        let mu_design = self.mu_design.clone();
        let log_sigma_design = self.log_sigma_design.clone();
        let policy = self.policy.clone();
        Self {
            y,
            weights,
            mu_design,
            log_sigma_design,
            policy,
            cached_row_scalars: std::sync::RwLock::new(cache_snapshot),
        }
    }
}

/// Per-ψ joint direction used as the `Direction` type of the Gaussian
/// location-scale families' `LocationScaleJointPsiFamily` impls.
struct LocationScaleJointPsiDirection {
    // NOTE(review): presumably the index of the parameter block that owns
    // this ψ coordinate — confirm against the constructor.
    block_idx: usize,
    // NOTE(review): presumably the ψ index local to that block — confirm.
    local_idx: usize,
    // ψ-derivative design maps for the two blocks.
    // NOTE(review): "primary" presumably denotes the location block and
    // "ls" the log-scale block — confirm.
    x_primary_psi: PsiDesignMap,
    x_ls_psi: PsiDesignMap,
    // Per-row vectors paired with the two design maps above.
    // NOTE(review): presumably the ψ-derivatives of the two linear
    // predictors — confirm.
    z_primary_psi: Array1<f64>,
    z_ls_psi: Array1<f64>,
}

/// Second-order (ψ_a, ψ_b) drift parts for the two location-scale blocks.
///
/// Each block carries an optional operator-form action and an optional
/// dense matrix.
/// NOTE(review): presumably these are alternative representations of the
/// same second ψ-derivative of the design — confirm which one consumers
/// prefer when both are set.
struct LocationScaleJointPsiSecondDrifts {
    // Operator-form second design actions (primary block, log-scale block).
    x_primary_ab_action: Option<CustomFamilyPsiSecondDesignAction>,
    x_ls_ab_action: Option<CustomFamilyPsiSecondDesignAction>,
    // Dense counterparts of the two actions above.
    x_primary_ab: Option<Array2<f64>>,
    x_ls_ab: Option<Array2<f64>>,
    // Per-row second-order vectors for the two blocks.
    // NOTE(review): presumably the mixed ψ_a/ψ_b derivative of each linear
    // predictor — confirm.
    z_primary_ab: Array1<f64>,
    z_ls_ab: Array1<f64>,
}

/// Shared interface that the Gaussian and Binomial location-scale families (and
/// their wiggle variants) expose to the unified joint ψ workspace.
///
/// The four families are structurally identical at the workspace level: each
/// owns two dense block designs (location + log-scale), produces a per-ψ
/// direction, and assembles second-order ψ terms and a ψ-Hessian directional
/// derivative from those parts. They differ only in (1) the concrete
/// [`Direction`](Self::Direction) struct produced (Gaussian vs Binomial field
/// names), (2) the family-name fragment in the dense-designs error message, and
/// (3) whether an optional Horvitz–Thompson outer-row subsample is threaded
/// into the per-row weight arrays (Gaussian does; Binomial ignores it and runs
/// the full-data exact path). This single trait gives the generic
/// [`LocationScaleJointPsiWorkspace`] one dispatch surface; each family's impl
/// is a thin delegation to inherent methods it already owns.
trait LocationScaleJointPsiFamily: Clone + Send + Sync + 'static {
    /// Per-ψ joint direction produced by this family.
    type Direction: Send + Sync + 'static;

    /// Family-name fragment used in the workspace's dense-designs error
    /// message so the originating family stays visible after unification.
    const LABEL: &'static str;

    /// Resource policy of the family. The workspace passes it as the
    /// `policy` argument of [`Self::ws_psi_direction`].
    fn ws_policy(&self) -> &crate::resource::ResourcePolicy;

    /// Dense (location, log-scale) block designs, borrowed or owned.
    ///
    /// The workspace constructor treats `Ok(None)` as an unsupported
    /// configuration and fails with an error that names [`Self::LABEL`].
    fn ws_exact_joint_dense_block_designs<'a>(
        &'a self,
        specs: Option<&'a [ParameterBlockSpec]>,
    ) -> Result<Option<(Cow<'a, Array2<f64>>, Cow<'a, Array2<f64>>)>, String>;

    /// Build the joint direction for ψ coordinate `psi_index`.
    ///
    /// The workspace caches the result per index; an `Ok(None)` makes both
    /// workspace queries for that ψ return `Ok(None)`.
    fn ws_psi_direction(
        &self,
        block_states: &[ParameterBlockState],
        derivative_blocks: &[Vec<crate::custom_family::CustomFamilyBlockPsiDerivative>],
        psi_index: usize,
        design_loc: &Array2<f64>,
        design_scale: &Array2<f64>,
        policy: &crate::resource::ResourcePolicy,
    ) -> Result<Option<Self::Direction>, String>;

    /// Assemble the second-order ψ terms for the direction pair
    /// (`psi_a`, `psi_b`).
    ///
    /// `subsample` is the workspace's optional outer-row subsample; an
    /// implementation may ignore it and evaluate on the full data.
    fn ws_psi_second_order_terms_from_parts(
        &self,
        block_states: &[ParameterBlockState],
        derivative_blocks: &[Vec<crate::custom_family::CustomFamilyBlockPsiDerivative>],
        psi_a: &Self::Direction,
        psi_b: &Self::Direction,
        design_loc: &Array2<f64>,
        design_scale: &Array2<f64>,
        subsample: Option<&[crate::families::marginal_slope_shared::WeightedOuterRow]>,
    ) -> Result<ExactNewtonJointPsiSecondOrderTerms, String>;

    /// Dense directional derivative of the ψ-Hessian for `psi_dir` along the
    /// flattened coefficient direction `d_beta_flat`.
    ///
    /// `subsample` has the same meaning as in
    /// [`Self::ws_psi_second_order_terms_from_parts`].
    fn ws_psi_hessian_directional_from_parts(
        &self,
        block_states: &[ParameterBlockState],
        psi_dir: &Self::Direction,
        d_beta_flat: &Array1<f64>,
        design_loc: &Array2<f64>,
        design_scale: &Array2<f64>,
        subsample: Option<&[crate::families::marginal_slope_shared::WeightedOuterRow]>,
    ) -> Result<Array2<f64>, String>;
}

/// Workspace adapter for the plain Gaussian location-scale family. Every
/// method forwards its arguments unchanged to the inherent method of the
/// matching purpose, including the optional outer-row subsample.
impl LocationScaleJointPsiFamily for GaussianLocationScaleFamily {
    type Direction = LocationScaleJointPsiDirection;
    const LABEL: &'static str = "GaussianLocationScaleFamily";

    /// Returns the family's own `policy` field.
    fn ws_policy(&self) -> &crate::resource::ResourcePolicy {
        &self.policy
    }

    /// Forwards to the inherent `exact_joint_dense_block_designs`.
    fn ws_exact_joint_dense_block_designs<'a>(
        &'a self,
        specs: Option<&'a [ParameterBlockSpec]>,
    ) -> Result<Option<(Cow<'a, Array2<f64>>, Cow<'a, Array2<f64>>)>, String> {
        self.exact_joint_dense_block_designs(specs)
    }

    /// Forwards to the inherent `exact_newton_joint_psi_direction`.
    fn ws_psi_direction(
        &self,
        block_states: &[ParameterBlockState],
        derivative_blocks: &[Vec<crate::custom_family::CustomFamilyBlockPsiDerivative>],
        psi_index: usize,
        design_loc: &Array2<f64>,
        design_scale: &Array2<f64>,
        policy: &crate::resource::ResourcePolicy,
    ) -> Result<Option<LocationScaleJointPsiDirection>, String> {
        self.exact_newton_joint_psi_direction(
            block_states,
            derivative_blocks,
            psi_index,
            design_loc,
            design_scale,
            policy,
        )
    }

    /// Forwards to the inherent second-order from-parts routine; the
    /// subsample is passed through as the final argument.
    fn ws_psi_second_order_terms_from_parts(
        &self,
        block_states: &[ParameterBlockState],
        derivative_blocks: &[Vec<crate::custom_family::CustomFamilyBlockPsiDerivative>],
        psi_a: &LocationScaleJointPsiDirection,
        psi_b: &LocationScaleJointPsiDirection,
        design_loc: &Array2<f64>,
        design_scale: &Array2<f64>,
        subsample: Option<&[crate::families::marginal_slope_shared::WeightedOuterRow]>,
    ) -> Result<ExactNewtonJointPsiSecondOrderTerms, String> {
        self.exact_newton_joint_psisecond_order_terms_from_parts(
            block_states,
            derivative_blocks,
            psi_a,
            psi_b,
            design_loc,
            design_scale,
            subsample,
        )
    }

    /// Forwards to the inherent ψ-Hessian directional-derivative routine;
    /// the subsample is passed through as the final argument.
    fn ws_psi_hessian_directional_from_parts(
        &self,
        block_states: &[ParameterBlockState],
        psi_dir: &LocationScaleJointPsiDirection,
        d_beta_flat: &Array1<f64>,
        design_loc: &Array2<f64>,
        design_scale: &Array2<f64>,
        subsample: Option<&[crate::families::marginal_slope_shared::WeightedOuterRow]>,
    ) -> Result<Array2<f64>, String> {
        self.exact_newton_joint_psihessian_directional_derivative_from_parts(
            block_states,
            psi_dir,
            d_beta_flat,
            design_loc,
            design_scale,
            subsample,
        )
    }
}

/// Workspace adapter for the Gaussian location-scale wiggle family. The
/// direction and design methods forward unchanged; the two from-parts
/// methods do NOT forward the outer-row subsample and always run on the
/// full data.
impl LocationScaleJointPsiFamily for GaussianLocationScaleWiggleFamily {
    type Direction = LocationScaleJointPsiDirection;
    const LABEL: &'static str = "GaussianLocationScaleWiggleFamily";

    /// Returns the family's own `policy` field.
    fn ws_policy(&self) -> &crate::resource::ResourcePolicy {
        &self.policy
    }

    /// Forwards to the inherent `exact_joint_dense_block_designs`.
    fn ws_exact_joint_dense_block_designs<'a>(
        &'a self,
        specs: Option<&'a [ParameterBlockSpec]>,
    ) -> Result<Option<(Cow<'a, Array2<f64>>, Cow<'a, Array2<f64>>)>, String> {
        self.exact_joint_dense_block_designs(specs)
    }

    /// Forwards to the inherent `exact_newton_joint_psi_direction`.
    fn ws_psi_direction(
        &self,
        block_states: &[ParameterBlockState],
        derivative_blocks: &[Vec<crate::custom_family::CustomFamilyBlockPsiDerivative>],
        psi_index: usize,
        design_loc: &Array2<f64>,
        design_scale: &Array2<f64>,
        policy: &crate::resource::ResourcePolicy,
    ) -> Result<Option<LocationScaleJointPsiDirection>, String> {
        self.exact_newton_joint_psi_direction(
            block_states,
            derivative_blocks,
            psi_index,
            design_loc,
            design_scale,
            policy,
        )
    }

    /// Second-order ψ terms on the full data; `outer_rows` is accepted for
    /// trait conformance but not passed to the inherent routine.
    fn ws_psi_second_order_terms_from_parts(
        &self,
        block_states: &[ParameterBlockState],
        derivative_blocks: &[Vec<crate::custom_family::CustomFamilyBlockPsiDerivative>],
        psi_a: &LocationScaleJointPsiDirection,
        psi_b: &LocationScaleJointPsiDirection,
        design_loc: &Array2<f64>,
        design_scale: &Array2<f64>,
        outer_rows: Option<&[crate::families::marginal_slope_shared::WeightedOuterRow]>,
    ) -> Result<ExactNewtonJointPsiSecondOrderTerms, String> {
        // This assertion is the only use of `outer_rows` in this method.
        // NOTE(review): the bound looks like it can never fail for a slice,
        // so the assert presumably exists only to keep the parameter
        // referenced — confirm before removing or replacing it.
        assert!(outer_rows.map_or(true, |r| r.len() <= isize::MAX as usize));
        // Wiggle ψ path: full-data exact (= trivially unbiased). The
        // wiggle-specific second-order from-parts function inlines 30+
        // per-row coefficient arrays (`coeff_mm{,_a,_b,_ab}`,
        // `coeff_ml{,_a,_b,_ab}`, `coeff_ll{,_a,_b,_ab}`, `a{,_a,_b,_ab}`,
        // `c{,_a,_b,_ab}`, `l{,_a,_b,_ab}`, `dw_{a,b,ab}`, `s_mu*`, `s_ls*`,
        // `s_w*`, ...) instead of packing them into a struct like the
        // non-wiggle GLS path's `GaussianJointPsi{First,Second}Weights`.
        // Each is row-linear in `rows.{w,m,n,kappa,...}` and the direction
        // vectors so HT masking is theoretically clean, but threading a mask
        // across that many call sites is brittle (any missed array silently
        // biases the estimator). The outer score remains unbiased without
        // touching the wiggle ψ path: HT-unbiased LL
        // (`log_likelihood_only_with_options`) + HT-unbiased ρ-Hessian
        // (`exact_newton_joint_hessian_workspace_with_options`) +
        // exact-unbiased ψ (this path) = unbiased. Broadening to the wiggle
        // ψ path is a follow-up that should refactor the inline arrays into
        // `WiggleJointPsi{First,Second}Weights` structs mirroring
        // `GaussianJointPsi{First,Second}Weights` so a single
        // `apply_ht_mask_wiggle*` helper can mask everything in one place.
        self.exact_newton_joint_psisecond_order_terms_from_parts(
            block_states,
            derivative_blocks,
            psi_a,
            psi_b,
            design_loc,
            design_scale,
        )
    }

    /// ψ-Hessian directional derivative on the full data; `outer_rows` is
    /// accepted for trait conformance but not passed to the inherent routine.
    fn ws_psi_hessian_directional_from_parts(
        &self,
        block_states: &[ParameterBlockState],
        psi_dir: &LocationScaleJointPsiDirection,
        d_beta_flat: &Array1<f64>,
        design_loc: &Array2<f64>,
        design_scale: &Array2<f64>,
        outer_rows: Option<&[crate::families::marginal_slope_shared::WeightedOuterRow]>,
    ) -> Result<Array2<f64>, String> {
        // Only use of `outer_rows` in this method.
        // NOTE(review): presumably a keep-the-parameter-used guard rather
        // than a real precondition — confirm.
        assert!(outer_rows.map_or(true, |r| r.len() <= isize::MAX as usize));
        // Same rationale as `ws_psi_second_order_terms_from_parts` above:
        // the wiggle ψ-Hessian directional-derivative function also inlines
        // dozens of per-row arrays. Full-data is exact (= trivially
        // unbiased), so the total outer score remains unbiased.
        self.exact_newton_joint_psihessian_directional_derivative_from_parts(
            block_states,
            psi_dir,
            d_beta_flat,
            design_loc,
            design_scale,
        )
    }
}

/// Generic joint exact-Newton ψ workspace shared by every location-scale
/// family (Gaussian / Binomial, with or without a wiggle block) via the
/// [`LocationScaleJointPsiFamily`] trait.
///
/// The workspace owns the two dense block designs as `Arc<Array2<f64>>` (the
/// per-family `ws_exact_joint_dense_block_designs` hands back a `Cow`, which is
/// materialized once here), the per-ψ direction cache, and an optional
/// Horvitz–Thompson outer-row subsample. When the subsample is `Some`, every
/// per-row weight array produced inside the second-order ψ Hessian and the
/// ψ-Hessian directional-derivative computations is masked: each sampled row's
/// contribution is scaled by `WeightedOuterRow.weight = 1/π_i` and non-sampled
/// rows are zeroed. Because every downstream assembly is row-linear in those
/// arrays, the resulting ψ score and ψ Hessian remain unbiased estimators of
/// the full-data quantities. Families that do not thread the subsample (the
/// Binomial families) construct with `new` and the field stays `None`.
struct LocationScaleJointPsiWorkspace<F: LocationScaleJointPsiFamily> {
    // Family whose `ws_*` trait methods perform the actual evaluations.
    family: F,
    // Block states and ψ-derivative blocks handed to every family call.
    block_states: Vec<ParameterBlockState>,
    derivative_blocks: Vec<Vec<CustomFamilyBlockPsiDerivative>>,
    // Dense location / log-scale designs, shared behind `Arc`.
    design_loc: Arc<Array2<f64>>,
    design_scale: Arc<Array2<f64>>,
    // Per-ψ direction cache; sized by the total number of ψ derivative
    // entries across all blocks.
    psi_directions: ExactNewtonJointPsiDirectCache<F::Direction>,
    // Optional outer-row subsample; `None` when built through `new`.
    outer_score_subsample: Option<Arc<crate::families::marginal_slope_shared::OuterScoreSubsample>>,
}

impl<F: LocationScaleJointPsiFamily> LocationScaleJointPsiWorkspace<F> {
    /// Build a workspace that carries no outer-row subsample.
    ///
    /// # Errors
    /// Same as [`Self::new_with_subsample`].
    fn new(
        family: F,
        block_states: Vec<ParameterBlockState>,
        specs: &[ParameterBlockSpec],
        derivative_blocks: Vec<Vec<CustomFamilyBlockPsiDerivative>>,
    ) -> Result<Self, String> {
        Self::new_with_subsample(family, block_states, specs, derivative_blocks, None)
    }

    /// Build a workspace, optionally attaching an outer-row subsample.
    ///
    /// The family's dense block designs are materialized once into shared
    /// `Arc`s and the ψ-direction cache is sized to the total number of ψ
    /// derivative entries.
    ///
    /// # Errors
    /// Propagates a failure from the family's dense-design lookup, and
    /// returns an unsupported-configuration error naming `F::LABEL` when the
    /// family reports no dense block designs.
    fn new_with_subsample(
        family: F,
        block_states: Vec<ParameterBlockState>,
        specs: &[ParameterBlockSpec],
        derivative_blocks: Vec<Vec<CustomFamilyBlockPsiDerivative>>,
        outer_score_subsample: Option<
            Arc<crate::families::marginal_slope_shared::OuterScoreSubsample>,
        >,
    ) -> Result<Self, String> {
        let (design_loc, design_scale) =
            match family.ws_exact_joint_dense_block_designs(Some(specs))? {
                Some((loc, scale)) => (
                    shared_dense_arc(loc.as_ref()),
                    shared_dense_arc(scale.as_ref()),
                ),
                None => {
                    return Err(GamlssError::UnsupportedConfiguration {
                        reason: format!(
                            "{} exact joint psi workspace requires dense block designs",
                            F::LABEL,
                        ),
                    }
                    .into());
                }
            };
        // One cache slot per ψ derivative entry across all blocks.
        let psi_dim = derivative_blocks.iter().map(Vec::len).sum();
        let psi_directions = ExactNewtonJointPsiDirectCache::new(psi_dim);
        Ok(Self {
            family,
            block_states,
            derivative_blocks,
            design_loc,
            design_scale,
            psi_directions,
            outer_score_subsample,
        })
    }

    /// Fetch the cached direction for `psi_index`, asking the family to
    /// build it through the cache's `get_or_try_init` when it is missing.
    fn psi_direction(&self, psi_index: usize) -> Result<Option<Arc<F::Direction>>, String> {
        let family = &self.family;
        self.psi_directions.get_or_try_init(psi_index, || {
            family.ws_psi_direction(
                &self.block_states,
                &self.derivative_blocks,
                psi_index,
                self.design_loc.as_ref(),
                self.design_scale.as_ref(),
                family.ws_policy(),
            )
        })
    }

    /// Rows of the attached outer-row subsample, or `None` when the
    /// workspace has none.
    fn subsample_rows(
        &self,
    ) -> Option<&[crate::families::marginal_slope_shared::WeightedOuterRow]> {
        let subsample = self.outer_score_subsample.as_ref()?;
        Some(subsample.rows.as_ref().as_slice())
    }
}

impl<F> ExactNewtonJointPsiWorkspace for LocationScaleJointPsiWorkspace<F>
where
    F: LocationScaleJointPsiFamily,
{
    /// Second-order ψ terms for the pair (`psi_i`, `psi_j`).
    ///
    /// Returns `Ok(None)` as soon as either direction is unavailable; the
    /// direction for `psi_j` is only requested once `psi_i` has resolved.
    fn second_order_terms(
        &self,
        psi_i: usize,
        psi_j: usize,
    ) -> Result<Option<ExactNewtonJointPsiSecondOrderTerms>, String> {
        let first = match self.psi_direction(psi_i)? {
            Some(direction) => direction,
            None => return Ok(None),
        };
        let second = match self.psi_direction(psi_j)? {
            Some(direction) => direction,
            None => return Ok(None),
        };
        let terms = self.family.ws_psi_second_order_terms_from_parts(
            &self.block_states,
            &self.derivative_blocks,
            first.as_ref(),
            second.as_ref(),
            self.design_loc.as_ref(),
            self.design_scale.as_ref(),
            self.subsample_rows(),
        )?;
        Ok(Some(terms))
    }

    /// Dense directional derivative of the ψ-Hessian for `psi_index` along
    /// `d_beta_flat`, or `Ok(None)` when that ψ has no direction.
    fn hessian_directional_derivative(
        &self,
        psi_index: usize,
        d_beta_flat: &Array1<f64>,
    ) -> Result<Option<crate::solver::estimate::reml::unified::DriftDerivResult>, String> {
        let direction = match self.psi_direction(psi_index)? {
            Some(direction) => direction,
            None => return Ok(None),
        };
        let dense = self.family.ws_psi_hessian_directional_from_parts(
            &self.block_states,
            direction.as_ref(),
            d_beta_flat,
            self.design_loc.as_ref(),
            self.design_scale.as_ref(),
            self.subsample_rows(),
        )?;
        Ok(Some(
            crate::solver::estimate::reml::unified::DriftDerivResult::Dense(dense),
        ))
    }
}

/// Generic joint ψ workspace specialized to the plain Gaussian
/// location-scale family.
type GaussianLocationScaleExactNewtonJointPsiWorkspace =
    LocationScaleJointPsiWorkspace<GaussianLocationScaleFamily>;
/// Generic joint ψ workspace specialized to the Gaussian location-scale
/// wiggle family.
type GaussianLocationScaleWiggleExactNewtonJointPsiWorkspace =
    LocationScaleJointPsiWorkspace<GaussianLocationScaleWiggleFamily>;

/// Per-observation row scalars of the Gaussian location-scale joint model.
/// `GaussianLocationScaleFamily` caches an `Arc` of this struct in its
/// `cached_row_scalars` field.
#[derive(Clone)]
pub struct GaussianJointRowScalars {
    // NOTE(review): presumably the raw per-observation weights — confirm.
    obs_weight: Array1<f64>,
    // NOTE(review): `w`, `m` and `n` are per-row scalars whose definitions
    // are not visible in this chunk; the κ' note below refers to an
    // `(a−n)` term, so `n` enters the H_{ls,ls} block — confirm the exact
    // formulas against the builder.
    w: Array1<f64>,
    m: Array1<f64>,
    n: Array1<f64>,
    /// κ = (dσ/dη_ls)/σ for the active sigma link.
    /// The cross Hessian block H_{μ,ls} carries an overall κ factor and the
    /// scale-scale block H_{ls,ls} carries κ².
    kappa: Array1<f64>,
    /// κ' = dκ/dη_ls = κ(1−κ) for the logb link. The static H_{ls,ls} block
    /// carries a κ'·(a−n) term, so κ' threads through every dH directional
    /// weight via the chain rule.
    kappa_prime: Array1<f64>,
    /// κ'' = κ(1−κ)(1−2κ); appears in d²H_{ls,ls} via the second
    /// η-derivative of κ'·(a−n).
    kappa_dprime: Array1<f64>,
}

/// First-order per-row ψ weight arrays for the Gaussian joint model.
///
/// `apply_ht_mask_first` rescales every field of this struct row by row, so
/// all eleven arrays are indexed by observation.
struct GaussianJointPsiFirstWeights {
    // Per-row objective contribution for the ψ direction.
    objective_psirow: Array1<f64>,
    // Per-row score weights for the μ and log-σ blocks.
    scoremu: Array1<f64>,
    score_ls: Array1<f64>,
    // NOTE(review): the `d`-prefixed arrays are presumably the ψ-directional
    // derivatives of the unprefixed ones — confirm against the builder.
    dscoremu: Array1<f64>,
    dscore_ls: Array1<f64>,
    // Per-row Hessian weights for the (μ,μ), (μ,ls) and (ls,ls) blocks.
    hmumu: Array1<f64>,
    hmu_ls: Array1<f64>,
    h_ls_ls: Array1<f64>,
    dhmumu: Array1<f64>,
    dhmu_ls: Array1<f64>,
    dh_ls_ls: Array1<f64>,
}

/// Second-order per-row ψ weight arrays for the Gaussian joint model.
/// NOTE(review): presumably the (ψ_a, ψ_b) second-derivative counterparts of
/// the fields in `GaussianJointPsiFirstWeights` — confirm against the
/// builder.
struct GaussianJointPsiSecondWeights {
    // Per-row objective contribution for the ψ pair.
    objective_psi_psirow: Array1<f64>,
    // Second-order score weights for the μ and log-σ blocks.
    d2scoremu: Array1<f64>,
    d2score_ls: Array1<f64>,
    // Second-order Hessian weights for the (μ,μ), (μ,ls) and (ls,ls) blocks.
    d2hmumu: Array1<f64>,
    d2hmu_ls: Array1<f64>,
    d2h_ls_ls: Array1<f64>,
}

/// Per-row Hessian weight arrays for a mixed drift.
/// NOTE(review): presumably a ψ direction combined with a coefficient
/// direction `u`, given the `_u` suffix — confirm against the builder.
struct GaussianJointPsiMixedDriftWeights {
    // First-order Hessian weights along `u` for the (μ,μ), (μ,ls) and
    // (ls,ls) blocks.
    dhmumu_u: Array1<f64>,
    dhmu_ls_u: Array1<f64>,
    dh_ls_ls_u: Array1<f64>,
    // Second-order (mixed) Hessian weights for the same three blocks.
    d2hmumu: Array1<f64>,
    d2hmu_ls: Array1<f64>,
    d2h_ls_ls: Array1<f64>,
}

/// Mask every per-row array of a `GaussianJointPsiFirstWeights` in place with
/// a Horvitz–Thompson outer-row subsample.
///
/// Each sampled row keeps its value multiplied by
/// `WeightedOuterRow.weight = 1/π_i`; every row absent from `rows` becomes
/// exactly zero. Downstream assemblies
/// (`gaussian_joint_psi*_fromweights`,
/// `build_two_block_custom_family_joint_psi_operator_from_actions`) consume
/// these arrays row-linearly via `Xᵀ diag(W) Y` and
/// `weighted_crossprod_psi_maps`, so the first-order ψ score and Hessian stay
/// unbiased estimators of the full-data quantities.
///
/// # Panics
/// Panics if a row index is out of bounds for the weight arrays.
fn apply_ht_mask_first(
    weights: &mut GaussianJointPsiFirstWeights,
    rows: &[crate::families::marginal_slope_shared::WeightedOuterRow],
) {
    let n = weights.objective_psirow.len();
    // Build all eleven masked arrays before touching `weights`, so the
    // struct is replaced only after every array has been computed.
    let masked = [
        &weights.objective_psirow,
        &weights.scoremu,
        &weights.score_ls,
        &weights.dscoremu,
        &weights.dscore_ls,
        &weights.hmumu,
        &weights.hmu_ls,
        &weights.h_ls_ls,
        &weights.dhmumu,
        &weights.dhmu_ls,
        &weights.dh_ls_ls,
    ]
    .map(|full| {
        // Start from zeros so non-sampled rows are exactly 0.0.
        let mut kept = Array1::<f64>::zeros(n);
        for sampled in rows {
            kept[sampled.index] = full[sampled.index] * sampled.weight;
        }
        kept
    });
    let [obj, smu, sls, dsmu, dsls, hmm, hml, hll, dhmm, dhml, dhll] = masked;
    weights.objective_psirow = obj;
    weights.scoremu = smu;
    weights.score_ls = sls;
    weights.dscoremu = dsmu;
    weights.dscore_ls = dsls;
    weights.hmumu = hmm;
    weights.hmu_ls = hml;
    weights.h_ls_ls = hll;
    weights.dhmumu = dhmm;
    weights.dhmu_ls = dhml;
    weights.dh_ls_ls = dhll;
}

/// HT mask for `GaussianJointPsiSecondWeights`. Same semantics as
/// `apply_ht_mask_first`: each per-row contribution is scaled by 1/π_i and
/// non-sampled rows are zeroed. Consumed row-linearly by
/// `gaussian_joint_psisecondhessian_fromweights` and the `score_psi_psi`
/// `fast_atv(_, d2score_*)` reductions.
fn apply_ht_mask_second(
    weights: &mut GaussianJointPsiSecondWeights,
    rows: &[crate::families::marginal_slope_shared::WeightedOuterRow],
) {
    let n = weights.objective_psi_psirow.len();
    let mut obj = Array1::<f64>::zeros(n);
    let mut d2smu = Array1::<f64>::zeros(n);
    let mut d2sls = Array1::<f64>::zeros(n);
    let mut d2hmm = Array1::<f64>::zeros(n);
    let mut d2hml = Array1::<f64>::zeros(n);
    let mut d2hll = Array1::<f64>::zeros(n);
    for r in rows {
        let i = r.index;
        let w = r.weight;
        obj[i] = weights.objective_psi_psirow[i] * w;
        d2smu[i] = weights.d2scoremu[i] * w;
        d2sls[i] = weights.d2score_ls[i] * w;
        d2hmm[i] = weights.d2hmumu[i] * w;
        d2hml[i] = weights.d2hmu_ls[i] * w;
        d2hll[i] = weights.d2h_ls_ls[i] * w;
    }
    weights.objective_psi_psirow = obj;
    weights.d2scoremu = d2smu;
    weights.d2score_ls = d2sls;
    weights.d2hmumu = d2hmm;
    weights.d2hmu_ls = d2hml;
    weights.d2h_ls_ls = d2hll;
}

/// HT mask for `GaussianJointPsiMixedDriftWeights`. Same semantics as the
/// other `apply_ht_mask_*` helpers; consumed row-linearly by
/// `gaussian_joint_psi_mixedhessian_drift_fromweights`.
fn apply_ht_mask_mixed(
    weights: &mut GaussianJointPsiMixedDriftWeights,
    rows: &[crate::families::marginal_slope_shared::WeightedOuterRow],
) {
    let n = weights.dhmumu_u.len();
    let mut dhmm_u = Array1::<f64>::zeros(n);
    let mut dhml_u = Array1::<f64>::zeros(n);
    let mut dhll_u = Array1::<f64>::zeros(n);
    let mut d2hmm = Array1::<f64>::zeros(n);
    let mut d2hml = Array1::<f64>::zeros(n);
    let mut d2hll = Array1::<f64>::zeros(n);
    for r in rows {
        let i = r.index;
        let w = r.weight;
        dhmm_u[i] = weights.dhmumu_u[i] * w;
        dhml_u[i] = weights.dhmu_ls_u[i] * w;
        dhll_u[i] = weights.dh_ls_ls_u[i] * w;
        d2hmm[i] = weights.d2hmumu[i] * w;
        d2hml[i] = weights.d2hmu_ls[i] * w;
        d2hll[i] = weights.d2h_ls_ls[i] * w;
    }
    weights.dhmumu_u = dhmm_u;
    weights.dhmu_ls_u = dhml_u;
    weights.dh_ls_ls_u = dhll_u;
    weights.d2hmumu = d2hmm;
    weights.d2hmu_ls = d2hml;
    weights.d2h_ls_ls = d2hll;
}

/// Build the per-row scalars (a, w, m, n, κ, κ', κ'') used by the Gaussian
/// location-scale joint weight builders.
///
/// For each row, σ and its first η-derivative come from the logb sigma jet
/// at `eta_ls[i]`; with residual r = y − η_μ the outputs are
/// `obs_weight = weights`, `w = weights/σ²`, `m = r·w`, `n = r²·w`,
/// `κ = logb_dlog_sigma_deta(σ, dσ/dη)`, `κ' = κ(1−κ)` and
/// `κ'' = κ'(1−2κ)`.
///
/// # Errors
/// Returns a `GamlssError::DimensionMismatch` (converted to `String`) when
/// `etamu`, `eta_ls` or `weights` differ in length from `y`.
fn gaussian_jointrow_scalars(
    y: &Array1<f64>,
    etamu: &Array1<f64>,
    eta_ls: &Array1<f64>,
    weights: &Array1<f64>,
) -> Result<GaussianJointRowScalars, String> {
    let nobs = y.len();
    let lengths_agree = etamu.len() == nobs && eta_ls.len() == nobs && weights.len() == nobs;
    if !lengths_agree {
        return Err(GamlssError::DimensionMismatch {
            reason: "Gaussian joint row scalar input size mismatch".to_string(),
        }
        .into());
    }

    // One growable column per output; each receives exactly `nobs` pushes.
    let mut prior_col = Vec::<f64>::with_capacity(nobs);
    let mut precision_col = Vec::<f64>::with_capacity(nobs);
    let mut m_col = Vec::<f64>::with_capacity(nobs);
    let mut n_col = Vec::<f64>::with_capacity(nobs);
    let mut kappa_col = Vec::<f64>::with_capacity(nobs);
    let mut kappa_p_col = Vec::<f64>::with_capacity(nobs);
    let mut kappa_pp_col = Vec::<f64>::with_capacity(nobs);

    for row in 0..nobs {
        let jet = crate::families::sigma_link::logb_sigma_jet1_scalar(eta_ls[row]);
        let sigma = jet.sigma;
        // κ = exp(η)/(b + exp(η)), delegated to `logb_dlog_sigma_deta`. The
        // direct exp(η)/σ form keeps the precision of exp(η) at very negative
        // η (where 1 − b/σ cancels catastrophically because b/σ → 1), and the
        // η → +∞ branch yields 1 without an ∞/∞ NaN.
        let kappa = logb_dlog_sigma_deta(sigma, jet.d1);
        let kappa_p = kappa * (1.0 - kappa);
        let kappa_pp = kappa_p * (1.0 - 2.0 * kappa);
        let precision = weights[row] / (sigma * sigma);
        let resid = y[row] - etamu[row];

        prior_col.push(weights[row]);
        precision_col.push(precision);
        m_col.push(resid * precision);
        n_col.push(resid * resid * precision);
        kappa_col.push(kappa);
        kappa_p_col.push(kappa_p);
        kappa_pp_col.push(kappa_pp);
    }

    Ok(GaussianJointRowScalars {
        obs_weight: Array1::from_vec(prior_col),
        w: Array1::from_vec(precision_col),
        m: Array1::from_vec(m_col),
        n: Array1::from_vec(n_col),
        kappa: Array1::from_vec(kappa_col),
        kappa_prime: Array1::from_vec(kappa_p_col),
        kappa_dprime: Array1::from_vec(kappa_pp_col),
    })
}

/// First directional derivatives of the per-row Hessian weights along the
/// direction (`dotmu`, `dot_eta`).
///
/// Returns `(w_u, c_u, d_u)`, one entry per row:
/// * `w_u = −2·w·κ·η̇` — derivative of the (μ,μ) weight w;
/// * `c_u = κ·(−2·w·μ̇ − 4·m·κ·η̇) + 2·m·κ'·η̇` — derivative of 2mκ;
/// * `d_u = 4·κ·κ'·a·η̇` — derivative of the Fisher (ls,ls) weight 2κ²a (#566).
///
/// Panics on out-of-bounds indexing if `dotmu` or `dot_eta` is shorter than
/// `scalars.w`.
fn gaussian_joint_first_directionalweights(
    scalars: &GaussianJointRowScalars,
    dotmu: &Array1<f64>,
    dot_eta: &Array1<f64>,
) -> (Array1<f64>, Array1<f64>, Array1<f64>) {
    let nobs = scalars.w.len();
    let mut mu_mu_dir = Vec::<f64>::with_capacity(nobs);
    let mut cross_dir = Vec::<f64>::with_capacity(nobs);
    let mut ls_ls_dir = Vec::<f64>::with_capacity(nobs);
    for row in 0..nobs {
        let precision = scalars.w[row];
        let m_row = scalars.m[row];
        let kappa = scalars.kappa[row];
        let kappa_p = scalars.kappa_prime[row];
        let prior = scalars.obs_weight[row];
        let d_mu = dotmu[row];
        let d_eta = dot_eta[row];
        // Log-σ direction scaled by κ.
        let k_eta = kappa * d_eta;

        mu_mu_dir.push(-2.0 * precision * k_eta);
        // The trailing 2·κ'·m·η̇ is the dκ/dη chain-rule term from σ = b + e^η.
        cross_dir.push(
            kappa * (-2.0 * precision * d_mu - 4.0 * m_row * k_eta)
                + 2.0 * m_row * kappa_p * d_eta,
        );
        // Fisher E[H_{ls,ls}] = 2κ²a has η-derivative 4κκ'a (#566).
        ls_ls_dir.push(4.0 * kappa * kappa_p * prior * d_eta);
    }
    (
        Array1::from_vec(mu_mu_dir),
        Array1::from_vec(cross_dir),
        Array1::from_vec(ls_ls_dir),
    )
}

/// Second directional derivatives of the per-row Hessian weights along two
/// fixed directions u = (`dotmu_u`, `dot_eta_u`) and v = (`dotmuv`,
/// `dot_etav`).
///
/// Returns `(w_uv, c_uv, d_uv)`, one entry per row: the second derivatives
/// of the (μ,μ) weight w, of 2mκ, and of the Fisher (ls,ls) weight 2κ²a.
///
/// Panics on out-of-bounds indexing if any direction array is shorter than
/// `scalars.w`.
fn gaussian_jointsecond_directionalweights(
    scalars: &GaussianJointRowScalars,
    dotmu_u: &Array1<f64>,
    dot_eta_u: &Array1<f64>,
    dotmuv: &Array1<f64>,
    dot_etav: &Array1<f64>,
) -> (Array1<f64>, Array1<f64>, Array1<f64>) {
    let nobs = scalars.w.len();
    let mut mu_mu_dir = Vec::<f64>::with_capacity(nobs);
    let mut cross_dir = Vec::<f64>::with_capacity(nobs);
    let mut ls_ls_dir = Vec::<f64>::with_capacity(nobs);
    for row in 0..nobs {
        let precision = scalars.w[row];
        let m_row = scalars.m[row];
        let kappa = scalars.kappa[row];
        let kappa_p = scalars.kappa_prime[row];
        let kappa_pp = scalars.kappa_dprime[row];
        let prior = scalars.obs_weight[row];
        let mu_u = dotmu_u[row];
        let mu_v = dotmuv[row];
        let eta_u = dot_eta_u[row];
        let eta_v = dot_etav[row];
        // Log-σ directions scaled by κ.
        let k_eta_u = kappa * eta_u;
        let k_eta_v = kappa * eta_v;
        // Symmetrised μ×η product and the plain η×η product (no κ).
        let mixed = mu_u * eta_v + mu_v * eta_u;
        let eta_prod = eta_u * eta_v;

        // ∂²w/∂η² = 4wκ² − 2wκ'.
        mu_mu_dir.push(4.0 * precision * k_eta_u * k_eta_v - 2.0 * precision * kappa_p * eta_prod);
        // Second derivative of 2mκ: the κ-scaled part, then the κ' cross
        // term −2·κ'·w·sym, then 2·m·(κ'' − 6·κ·κ')·η_u·η_v.
        cross_dir.push(
            kappa * (4.0 * precision * (mu_u * k_eta_v + mu_v * k_eta_u)
                + 8.0 * m_row * k_eta_u * k_eta_v)
                - 2.0 * precision * kappa_p * mixed
                + 2.0 * m_row * (kappa_pp - 6.0 * kappa * kappa_p) * eta_prod,
        );
        // Fisher E[H_{ls,ls}] = 2κ²a is bilinear in the fixed directions and
        // has no μ dependence ⇒ 4a(κ'² + κκ'')·η_u·η_v (#566).
        ls_ls_dir.push(4.0 * prior * (kappa_p * kappa_p + kappa * kappa_pp) * eta_prod);
    }
    (
        Array1::from_vec(mu_mu_dir),
        Array1::from_vec(cross_dir),
        Array1::from_vec(ls_ls_dir),
    )
}

/// Build the first-order ψ weights for one ψ direction (`mu_a`, `eta_a`):
/// the observed score rows and their ψ derivatives, the Fisher Hessian row
/// weights and their ψ derivatives, and the per-row objective derivative.
///
/// Panics on out-of-bounds indexing if `mu_a` or `eta_a` is shorter than
/// `scalars.w`.
fn gaussian_joint_psi_firstweights(
    scalars: &GaussianJointRowScalars,
    mu_a: &Array1<f64>,
    eta_a: &Array1<f64>,
) -> GaussianJointPsiFirstWeights {
    let nobs = scalars.w.len();
    let mut objective_col = Vec::<f64>::with_capacity(nobs);
    let mut score_mu_col = Vec::<f64>::with_capacity(nobs);
    let mut score_ls_col = Vec::<f64>::with_capacity(nobs);
    let mut dscore_mu_col = Vec::<f64>::with_capacity(nobs);
    let mut dscore_ls_col = Vec::<f64>::with_capacity(nobs);
    let mut h_mu_mu_col = Vec::<f64>::with_capacity(nobs);
    let mut h_mu_ls_col = Vec::<f64>::with_capacity(nobs);
    let mut h_ls_ls_col = Vec::<f64>::with_capacity(nobs);
    let mut dh_mu_mu_col = Vec::<f64>::with_capacity(nobs);
    let mut dh_mu_ls_col = Vec::<f64>::with_capacity(nobs);
    let mut dh_ls_ls_col = Vec::<f64>::with_capacity(nobs);

    for row in 0..nobs {
        let m_row = scalars.m[row];
        let n_row = scalars.n[row];
        let kappa = scalars.kappa[row];
        let kappa_p = scalars.kappa_prime[row];
        let prior = scalars.obs_weight[row];
        let d_mu = mu_a[row];
        let d_eta = eta_a[row];
        // Log-σ direction scaled by κ.
        let k_eta = kappa * d_eta;
        let grad_mu = -m_row;
        let grad_ls = kappa * (prior - n_row);
        let precision = scalars.w[row];

        score_mu_col.push(grad_mu);
        score_ls_col.push(grad_ls);
        dscore_mu_col.push(precision * d_mu + 2.0 * m_row * k_eta);
        // ∂[κ(a−n)]/∂η = κ'(a−n) + 2κ²n, hence the extra κ'·(a−n)·η̇ term.
        dscore_ls_col.push(
            kappa * (2.0 * m_row * d_mu + 2.0 * n_row * k_eta) + kappa_p * (prior - n_row) * d_eta,
        );
        h_mu_mu_col.push(precision);
        // Cross block: the Fisher expectation E[H_{μ,ls}] = 2κ·E[m] is 0
        // (μ ⊥ σ; see exact_newton_joint_hessian_from_designs / #684). The
        // observed 2mκ is mean-zero noise; keeping it would inject spurious
        // μ↔σ coupling into the REML determinant through the Schur
        // complement and over-smooth log σ.
        h_mu_ls_col.push(0.0);
        // (log σ, log σ) block: Fisher/expected information 2κ²a rather than
        // the observed 2κ²n + κ'(a−n). The observed curvature collapses where
        // the fitted residual is small (n→0), which under-counts the scale
        // block's EDF and lets REML over-smooth the scale predictor toward a
        // flat constant (#566). Taking E[n]=a (true model) gives the
        // residual-free 2κ²a, matching how gamlss/mgcv gaulss Fisher-score
        // the scale channel and what the diagonal PIRLS kernel already uses
        // (gaussian_diagonal_row_kernel: 2·obs_weight·κ²). The score arrays
        // (score_ls/dscore_ls/d2score_ls) remain the exact observed gradient,
        // so the joint Newton still converges to the true MLE stationary
        // point; only the (ls,ls) curvature feeding the REML determinant/EDF
        // is the expectation.
        h_ls_ls_col.push(2.0 * kappa * kappa * prior);
        dh_mu_mu_col.push(-2.0 * precision * k_eta);
        // A cross block that is identically 0 (μ ⊥ σ; #684) has a zero
        // directional derivative.
        dh_mu_ls_col.push(0.0);
        // 2κ²a has no μ dependence and ∂(2κ²a)/∂η = 4κκ'a, so the derivative
        // along (μ̇, η̇) is 4κκ'a·η̇.
        dh_ls_ls_col.push(4.0 * kappa * kappa_p * prior * d_eta);
        objective_col.push(grad_mu * d_mu + grad_ls * d_eta);
    }

    GaussianJointPsiFirstWeights {
        objective_psirow: Array1::from_vec(objective_col),
        scoremu: Array1::from_vec(score_mu_col),
        score_ls: Array1::from_vec(score_ls_col),
        dscoremu: Array1::from_vec(dscore_mu_col),
        dscore_ls: Array1::from_vec(dscore_ls_col),
        hmumu: Array1::from_vec(h_mu_mu_col),
        hmu_ls: Array1::from_vec(h_mu_ls_col),
        h_ls_ls: Array1::from_vec(h_ls_ls_col),
        dhmumu: Array1::from_vec(dh_mu_mu_col),
        dhmu_ls: Array1::from_vec(dh_mu_ls_col),
        dh_ls_ls: Array1::from_vec(dh_ls_ls_col),
    }
}

/// Build the second-order ψ weights for a ψ pair (a, b) from the first-order
/// directions (`mu_a`, `eta_a`), (`mu_b`, `eta_b`) and the mixed directions
/// (`mu_ab`, `eta_ab`).
///
/// Panics on out-of-bounds indexing if any direction array is shorter than
/// `scalars.w`.
fn gaussian_joint_psisecondweights(
    scalars: &GaussianJointRowScalars,
    mu_a: &Array1<f64>,
    eta_a: &Array1<f64>,
    mu_b: &Array1<f64>,
    eta_b: &Array1<f64>,
    mu_ab: &Array1<f64>,
    eta_ab: &Array1<f64>,
) -> GaussianJointPsiSecondWeights {
    let nobs = scalars.w.len();
    let mut objective_col = Vec::<f64>::with_capacity(nobs);
    let mut d2score_mu_col = Vec::<f64>::with_capacity(nobs);
    let mut d2score_ls_col = Vec::<f64>::with_capacity(nobs);
    let mut d2h_mu_mu_col = Vec::<f64>::with_capacity(nobs);
    let mut d2h_mu_ls_col = Vec::<f64>::with_capacity(nobs);
    let mut d2h_ls_ls_col = Vec::<f64>::with_capacity(nobs);

    for row in 0..nobs {
        let precision = scalars.w[row];
        let m_row = scalars.m[row];
        let n_row = scalars.n[row];
        let kappa = scalars.kappa[row];
        let kappa_p = scalars.kappa_prime[row];
        let kappa_pp = scalars.kappa_dprime[row];
        let prior = scalars.obs_weight[row];
        let excess = prior - n_row;
        let d_mu_a = mu_a[row];
        let d_mu_b = mu_b[row];
        let d_mu_ab = mu_ab[row];
        let d_eta_a = eta_a[row];
        let d_eta_b = eta_b[row];
        let d_eta_ab = eta_ab[row];
        // Log-σ directions scaled by κ.
        let k_eta_a = kappa * d_eta_a;
        let k_eta_b = kappa * d_eta_b;
        let k_eta_ab = kappa * d_eta_ab;
        let mixed_scaled = d_mu_a * k_eta_b + d_mu_b * k_eta_a;
        // Same symmetric form without κ, required by the κ' chain-rule terms.
        let mixed_bare = d_mu_a * d_eta_b + d_mu_b * d_eta_a;
        let k_eta_sq = k_eta_a * k_eta_b;
        let eta_prod = d_eta_a * d_eta_b;
        let mu_prod = d_mu_a * d_mu_b;

        // The final κ'·(a−n)·η_a·η_b is the dκ/dη chain-rule contribution
        // from σ = b + e^η.
        objective_col.push(
            precision * mu_prod + 2.0 * m_row * mixed_scaled + 2.0 * n_row * k_eta_sq
                - m_row * d_mu_ab
                + kappa * excess * d_eta_ab
                + kappa_p * excess * eta_prod,
        );
        // ∂²(−m)/∂η² = −4mκ² + 2mκ', hence the trailing 2·m·κ'·η_a·η_b.
        d2score_mu_col.push(
            precision * d_mu_ab - 2.0 * precision * mixed_scaled - 4.0 * m_row * k_eta_sq
                + 2.0 * m_row * k_eta_ab
                + 2.0 * m_row * kappa_p * eta_prod,
        );
        // κ-scaled part, then 2·κ'·m·sym(μ_a η_b), then
        // (κ''(a−n) + 6κκ'n)·η_a·η_b, then κ'(a−n)·η_ab.
        d2score_ls_col.push(
            kappa
                * (-2.0 * precision * mu_prod - 4.0 * m_row * mixed_scaled
                    - 4.0 * n_row * k_eta_sq
                    + 2.0 * m_row * d_mu_ab
                    + 2.0 * n_row * k_eta_ab)
                + 2.0 * m_row * kappa_p * mixed_bare
                + (kappa_pp * excess + 6.0 * kappa * kappa_p * n_row) * eta_prod
                + kappa_p * excess * d_eta_ab,
        );
        // ∂²w/∂η² = 4wκ² − 2wκ'.
        d2h_mu_mu_col.push(
            4.0 * precision * k_eta_sq - 2.0 * precision * k_eta_ab
                - 2.0 * precision * kappa_p * eta_prod,
        );
        // The cross block is Fisher 0 (μ ⊥ σ; #684), so its second
        // directional derivative vanishes as well.
        d2h_mu_ls_col.push(0.0);
        // d²/dψ_a dψ_b of the Fisher (ls,ls) information 2κ²a (#566): no μ
        // dependence, ∂(2κ²a)/∂η = 4κκ'a and ∂(4κκ'a)/∂η = 4a(κ'² + κκ''),
        // giving 4a(κ'² + κκ'')·η_a·η_b + 4aκκ'·η_ab.
        d2h_ls_ls_col.push(
            4.0 * prior * (kappa_p * kappa_p + kappa * kappa_pp) * eta_prod
                + 4.0 * prior * kappa * kappa_p * d_eta_ab,
        );
    }

    GaussianJointPsiSecondWeights {
        objective_psi_psirow: Array1::from_vec(objective_col),
        d2scoremu: Array1::from_vec(d2score_mu_col),
        d2score_ls: Array1::from_vec(d2score_ls_col),
        d2hmumu: Array1::from_vec(d2h_mu_mu_col),
        d2hmu_ls: Array1::from_vec(d2h_mu_ls_col),
        d2h_ls_ls: Array1::from_vec(d2h_ls_ls_col),
    }
}

/// Build the per-row weights for the mixed (drift × ψ) Hessian term from the
/// drift direction `dot_eta`, the ψ direction `eta_a` and the mixed
/// direction `dot_eta_a`.
///
/// Panics on out-of-bounds indexing if any direction array is shorter than
/// `scalars.w`.
fn gaussian_joint_psi_mixed_driftweights(
    scalars: &GaussianJointRowScalars,
    // Only log-σ–channel directions are taken: the surviving (μ,μ) and
    // (ls,ls) Fisher blocks depend on nothing else. The μ-channel drift
    // directions fed the observed cross block, which is now Fisher 0
    // (μ ⊥ σ; #684) and no longer assembled.
    dot_eta: &Array1<f64>,
    eta_a: &Array1<f64>,
    dot_eta_a: &Array1<f64>,
) -> GaussianJointPsiMixedDriftWeights {
    let nobs = scalars.w.len();
    let mut dh_mu_mu_col = Vec::<f64>::with_capacity(nobs);
    let mut dh_mu_ls_col = Vec::<f64>::with_capacity(nobs);
    let mut dh_ls_ls_col = Vec::<f64>::with_capacity(nobs);
    let mut d2h_mu_mu_col = Vec::<f64>::with_capacity(nobs);
    let mut d2h_mu_ls_col = Vec::<f64>::with_capacity(nobs);
    let mut d2h_ls_ls_col = Vec::<f64>::with_capacity(nobs);

    for row in 0..nobs {
        let precision = scalars.w[row];
        let kappa = scalars.kappa[row];
        let kappa_p = scalars.kappa_prime[row];
        let kappa_pp = scalars.kappa_dprime[row];
        let prior = scalars.obs_weight[row];
        let drift = dot_eta[row];
        let psi_dir = eta_a[row];
        let drift_psi = dot_eta_a[row];
        // Log-σ directions scaled by κ.
        let k_drift = kappa * drift;
        let k_psi = kappa * psi_dir;
        let k_drift_psi = kappa * drift_psi;
        let drift_times_psi = drift * psi_dir;

        // First directional derivative of the Hessian blocks (== Helper A).
        dh_mu_mu_col.push(-2.0 * precision * k_drift);
        // The cross block is Fisher 0 (μ ⊥ σ; #684), so both its first
        // directional and its second mixed derivative are 0; the
        // observed-cross drift inputs (m, dotmu, μ_a, dotmu_a) are therefore
        // never read here.
        dh_mu_ls_col.push(0.0);
        // Fisher E[H_{ls,ls}] = 2κ²a has no μ dependence and
        // ∂(2κ²a)/∂η = 4κκ'a ⇒ 4κκ'a·de (#566).
        dh_ls_ls_col.push(4.0 * kappa * kappa_p * prior * drift);
        // ∂²w/∂η² = 4wκ² − 2wκ', plus the −2wκ·dea mixed-direction term.
        d2h_mu_mu_col.push(
            4.0 * precision * k_drift * k_psi - 2.0 * precision * k_drift_psi
                - 2.0 * precision * kappa_p * drift_times_psi,
        );
        d2h_mu_ls_col.push(0.0);
        // d²/(drift × ψ) of 2κ²a: 4a(κ'² + κκ'')·de·ea + 4aκκ'·dea, with
        // drift direction de, ψ direction ea and mixed direction dea (#566).
        d2h_ls_ls_col.push(
            4.0 * prior * (kappa_p * kappa_p + kappa * kappa_pp) * drift_times_psi
                + 4.0 * prior * kappa * kappa_p * drift_psi,
        );
    }

    GaussianJointPsiMixedDriftWeights {
        dhmumu_u: Array1::from_vec(dh_mu_mu_col),
        dhmu_ls_u: Array1::from_vec(dh_mu_ls_col),
        dh_ls_ls_u: Array1::from_vec(dh_ls_ls_col),
        d2hmumu: Array1::from_vec(d2h_mu_mu_col),
        d2hmu_ls: Array1::from_vec(d2h_mu_ls_col),
        d2h_ls_ls: Array1::from_vec(d2h_ls_ls_col),
    }
}

/// Concatenate the μ-block score and the log-σ-block score into one joint
/// vector: the `scoremu` entries first, then the `score_ls` entries.
fn gaussian_pack_joint_score(scoremu: &Array1<f64>, score_ls: &Array1<f64>) -> Array1<f64> {
    let mut packed = Vec::<f64>::with_capacity(scoremu.len() + score_ls.len());
    packed.extend(scoremu.iter().copied());
    packed.extend(score_ls.iter().copied());
    Array1::from_vec(packed)
}

/// Assemble the joint Hessian from its three upper-triangle blocks.
///
/// `hmumu` fills the top-left block, `hmu_ls` the top-right block and
/// `h_ls_ls` the bottom-right block; `mirror_upper_to_lower` is then called
/// on the result to populate the lower triangle. Block sizes are taken from
/// `hmumu.nrows()` and `h_ls_ls.nrows()`.
fn gaussian_pack_joint_symmetrichessian(
    hmumu: &Array2<f64>,
    hmu_ls: &Array2<f64>,
    h_ls_ls: &Array2<f64>,
) -> Array2<f64> {
    let split = hmumu.nrows();
    let dim = split + h_ls_ls.nrows();
    let mut joint = Array2::<f64>::zeros((dim, dim));
    joint.slice_mut(s![..split, ..split]).assign(hmumu);
    joint.slice_mut(s![..split, split..]).assign(hmu_ls);
    joint.slice_mut(s![split.., split..]).assign(h_ls_ls);
    mirror_upper_to_lower(&mut joint);
    joint
}

/// Row coefficients `(mm, ml, ll)` of the Gaussian location-scale Fisher
/// (expected) joint Hessian — the single source of truth for this curvature.
///
/// Every representation that assembles the value Hessian is meant to read
/// these coefficients: the dense `exact_newton_joint_hessian_from_designs`
/// and the matrix-free `GaussianLocationScaleHessianWorkspace`.
///
/// * `mm = w`.
/// * `ml = E[H_{μ,ls}] = 2κ·E[m] = 2κ·E[r]·w/σ² = 0` — location and scale are
///   information-orthogonal (E[r]=0 at any β; #684), so the (μ, log σ)
///   information is block-diagonal.
/// * `ll = E[H_{ls,ls}] = 2κ²a` — the residual-free Fisher form, with
///   a = obs_weight (#566).
///
/// Routing both paths through this constructor makes the cross-block drift
/// behind #684 — one representation using the observed `2κm`, the other the
/// Fisher 0 — structurally impossible, because both read the same
/// coefficients. The observed SCORE still drives the Newton step (Fisher
/// scoring → exact joint MLE); only the curvature feeding the REML
/// determinant / Newton metric is the orthogonal expectation.
fn gaussian_locscale_fisher_joint_row_coeffs(
    rows: &GaussianJointRowScalars,
) -> (Array1<f64>, Array1<f64>, Array1<f64>) {
    let nrows = rows.kappa.len();
    let two_kappa_sq = 2.0 * &rows.kappa * &rows.kappa;
    let ll = two_kappa_sq * &rows.obs_weight;
    let ml = Array1::<f64>::zeros(nrows);
    let mm = rows.w.clone();
    (mm, ml, ll)
}

/// Accumulate the joint (μ, log σ) Hessian from the two block designs and
/// the three per-row coefficient arrays.
///
/// The rows are processed in the chunks yielded by
/// `exact_design_row_chunks(n, max(pmu, p_ls))`; each chunk's contribution
/// comes from `fast_joint_hessian_2x2` and is summed into a
/// `(pmu + p_ls) × (pmu + p_ls)` matrix.
///
/// # Errors
/// Returns a `GamlssError::DimensionMismatch` (converted to `String`) when
/// the designs and coefficient arrays disagree on the number of rows, and
/// propagates any error from `row_chunk`.
fn gaussian_joint_hessian_from_designs(
    xmu: &DenseOrOperator<'_>,
    x_ls: &DenseOrOperator<'_>,
    hmumu_coeff: &Array1<f64>,
    hmu_ls_coeff: &Array1<f64>,
    h_ls_ls_coeff: &Array1<f64>,
) -> Result<Array2<f64>, String> {
    let n = xmu.nrows();
    let coeff_lens = [hmumu_coeff.len(), hmu_ls_coeff.len(), h_ls_ls_coeff.len()];
    let rows_agree = coeff_lens.iter().all(|&len| len == n) && x_ls.nrows() == n;
    if !rows_agree {
        return Err(GamlssError::DimensionMismatch { reason: format!(
            "gaussian_joint_hessian_from_designs dimension mismatch: xmu {}x{}, x_ls {}x{}, coeffs {}/{}/{}",
            xmu.nrows(),
            xmu.ncols(),
            x_ls.nrows(),
            x_ls.ncols(),
            hmumu_coeff.len(),
            hmu_ls_coeff.len(),
            h_ls_ls_coeff.len()
        ) }.into());
    }

    let pmu = xmu.ncols();
    let p_ls = x_ls.ncols();
    let dim = pmu + p_ls;
    let mut joint = Array2::<f64>::zeros((dim, dim));
    for chunk in exact_design_row_chunks(n, pmu.max(p_ls)) {
        let xmu_rows = xmu.row_chunk(chunk.clone())?;
        let xls_rows = x_ls.row_chunk(chunk.clone())?;
        let mm = hmumu_coeff.slice(s![chunk.clone()]);
        let ml = hmu_ls_coeff.slice(s![chunk.clone()]);
        let ll = h_ls_ls_coeff.slice(s![chunk.clone()]);
        joint += &fast_joint_hessian_2x2(&xmu_rows, &xls_rows, &mm, &ml, &ll);
    }
    Ok(joint)
}

/// Assemble the first-order ψ derivative of the joint Hessian from the
/// per-row `GaussianJointPsiFirstWeights`.
///
/// For each block the result is the sum of the two "design moves" terms
/// `X_ψᵀ D X` and `Xᵀ D X_ψ` plus the "weights move" term `Xᵀ D' X`, where D
/// holds the block's Hessian weights and D' their ψ derivatives. The three
/// blocks are packed by `gaussian_pack_joint_symmetrichessian`.
///
/// # Errors
/// Propagates errors from `weighted_crossprod_psi_maps`, `xt_diag_x_dense`
/// and `xt_diag_y_dense`.
fn gaussian_joint_psihessian_fromweights(
    xmu: &Array2<f64>,
    x_ls: &Array2<f64>,
    xmu_psi: CustomFamilyPsiLinearMapRef<'_>,
    x_ls_psi: CustomFamilyPsiLinearMapRef<'_>,
    weights: &GaussianJointPsiFirstWeights,
) -> Result<Array2<f64>, String> {
    // Diagonal blocks: X_ψᵀ D X and Xᵀ D X_ψ are transposes of one another,
    // so a single product is formed and its transpose is added.
    let mu_half = weighted_crossprod_psi_maps(
        xmu_psi,
        weights.hmumu.view(),
        CustomFamilyPsiLinearMapRef::Dense(xmu),
    )?;
    let mu_block = &mu_half + &mu_half.t() + &xt_diag_x_dense(xmu, &weights.dhmumu)?;

    // Cross block: no transpose shortcut, so both design-derivative products
    // are formed explicitly.
    let cross_mu_psi = weighted_crossprod_psi_maps(
        xmu_psi,
        weights.hmu_ls.view(),
        CustomFamilyPsiLinearMapRef::Dense(x_ls),
    )?;
    let cross_ls_psi = weighted_crossprod_psi_maps(
        CustomFamilyPsiLinearMapRef::Dense(xmu),
        weights.hmu_ls.view(),
        x_ls_psi,
    )?;
    let cross_drift = xt_diag_y_dense(xmu, &weights.dhmu_ls, x_ls)?;
    let cross_block = cross_mu_psi + &cross_ls_psi + &cross_drift;

    let ls_half = weighted_crossprod_psi_maps(
        x_ls_psi,
        weights.h_ls_ls.view(),
        CustomFamilyPsiLinearMapRef::Dense(x_ls),
    )?;
    let ls_block = &ls_half + &ls_half.t() + &xt_diag_x_dense(x_ls, &weights.dh_ls_ls)?;

    Ok(gaussian_pack_joint_symmetrichessian(
        &mu_block,
        &cross_block,
        &ls_block,
    ))
}

/// Build a two-channel `CustomFamilyJointPsiOperator` (left block, right
/// block) wrapped in an `Arc<dyn HyperOperator>`.
///
/// Returns `Ok(None)` when neither block has a ψ design action. Otherwise
/// the operator gets one channel per block (range, shared dense design,
/// optional action) and four pair contributions — (0,0), (0,1), (1,0),
/// (1,1) — each carrying its base weights and drift weights. Both
/// off-diagonal pairs use the same cross weights.
fn build_two_block_custom_family_joint_psi_operator_from_actions(
    left_action: Option<CustomFamilyPsiDesignAction>,
    right_action: Option<CustomFamilyPsiDesignAction>,
    left_range: std::ops::Range<usize>,
    right_range: std::ops::Range<usize>,
    left_design: &Array2<f64>,
    right_design: &Array2<f64>,
    left_weights: &Array1<f64>,
    cross_weights: &Array1<f64>,
    right_weights: &Array1<f64>,
    left_drift_weights: &Array1<f64>,
    cross_drift_weights: &Array1<f64>,
    right_drift_weights: &Array1<f64>,
) -> Result<Option<std::sync::Arc<dyn crate::solver::estimate::reml::unified::HyperOperator>>, String>
{
    // Nothing to build when no block carries a ψ action.
    if matches!((&left_action, &right_action), (None, None)) {
        return Ok(None);
    }

    let total = left_design.ncols() + right_design.ncols();

    let left_channel =
        CustomFamilyJointDesignChannel::new(left_range, shared_dense_arc(left_design), left_action);
    let right_channel = CustomFamilyJointDesignChannel::new(
        right_range,
        shared_dense_arc(right_design),
        right_action,
    );
    let channels = vec![left_channel, right_channel];

    let left_left = CustomFamilyJointDesignPairContribution::new(
        0,
        0,
        left_weights.clone(),
        left_drift_weights.clone(),
    );
    let left_right = CustomFamilyJointDesignPairContribution::new(
        0,
        1,
        cross_weights.clone(),
        cross_drift_weights.clone(),
    );
    let right_left = CustomFamilyJointDesignPairContribution::new(
        1,
        0,
        cross_weights.clone(),
        cross_drift_weights.clone(),
    );
    let right_right = CustomFamilyJointDesignPairContribution::new(
        1,
        1,
        right_weights.clone(),
        right_drift_weights.clone(),
    );
    let pair_contributions = vec![left_left, left_right, right_left, right_right];

    let operator = CustomFamilyJointPsiOperator::new(total, channels, pair_contributions);
    Ok(Some(std::sync::Arc::new(operator)))
}

/// Assemble the second-order ψ derivative (pair i, j) of the joint Hessian.
///
/// Inputs are the base designs (`xmu`, `x_ls`), their first ψ derivatives
/// for i and j, their mixed second ψ derivatives (`*_ab`), the first-order
/// weights for each direction and the second-order weights for the pair.
/// Each block is the product-rule expansion of `Xᵀ D X` (or `X_μᵀ D X_ls`
/// for the cross block): second-derivative design terms, design×design
/// terms, design×weight-derivative terms and the pure weight
/// second-derivative term. The base Hessian weights are read from
/// `weights_i`.
///
/// # Errors
/// Propagates errors from `weighted_crossprod_psi_maps`, `xt_diag_x_dense`
/// and `xt_diag_y_dense`.
fn gaussian_joint_psisecondhessian_fromweights(
    xmu: &Array2<f64>,
    x_ls: &Array2<f64>,
    xmu_i: CustomFamilyPsiLinearMapRef<'_>,
    x_ls_i: CustomFamilyPsiLinearMapRef<'_>,
    xmu_j: CustomFamilyPsiLinearMapRef<'_>,
    x_ls_j: CustomFamilyPsiLinearMapRef<'_>,
    xmu_ab: CustomFamilyPsiLinearMapRef<'_>,
    x_ls_ab: CustomFamilyPsiLinearMapRef<'_>,
    weights_i: &GaussianJointPsiFirstWeights,
    weights_j: &GaussianJointPsiFirstWeights,
    secondweights: &GaussianJointPsiSecondWeights,
) -> Result<Array2<f64>, String> {
    // Transpose symmetry: X_aᵀ D X_b and X_bᵀ D X_a are transposes. In the
    // symmetric diagonal blocks only one product of each such pair is
    // formed and its transpose is added, halving the O(np²) products.

    // ---- (μ, μ) block ----
    let mu_second = weighted_crossprod_psi_maps(
        xmu_ab,
        weights_i.hmumu.view(),
        CustomFamilyPsiLinearMapRef::Dense(xmu),
    )?;
    let mu_pair = weighted_crossprod_psi_maps(xmu_i, weights_i.hmumu.view(), xmu_j)?;
    let mu_i_dj = weighted_crossprod_psi_maps(
        xmu_i,
        weights_j.dhmumu.view(),
        CustomFamilyPsiLinearMapRef::Dense(xmu),
    )?;
    let mu_j_di = weighted_crossprod_psi_maps(
        xmu_j,
        weights_i.dhmumu.view(),
        CustomFamilyPsiLinearMapRef::Dense(xmu),
    )?;
    let mu_block = &mu_second
        + &mu_second.t()
        + &mu_pair
        + mu_pair.t()
        + &mu_i_dj
        + mu_i_dj.t()
        + &mu_j_di
        + mu_j_di.t()
        + &xt_diag_x_dense(xmu, &secondweights.d2hmumu)?;

    // ---- (μ, log σ) cross block: nine explicit terms, summed in order ----
    let cross_mu_ab = weighted_crossprod_psi_maps(
        xmu_ab,
        weights_i.hmu_ls.view(),
        CustomFamilyPsiLinearMapRef::Dense(x_ls),
    )?;
    let cross_mu_i_ls_j = weighted_crossprod_psi_maps(xmu_i, weights_i.hmu_ls.view(), x_ls_j)?;
    let cross_mu_j_ls_i = weighted_crossprod_psi_maps(xmu_j, weights_i.hmu_ls.view(), x_ls_i)?;
    let cross_mu_i_dj = weighted_crossprod_psi_maps(
        xmu_i,
        weights_j.dhmu_ls.view(),
        CustomFamilyPsiLinearMapRef::Dense(x_ls),
    )?;
    let cross_mu_j_di = weighted_crossprod_psi_maps(
        xmu_j,
        weights_i.dhmu_ls.view(),
        CustomFamilyPsiLinearMapRef::Dense(x_ls),
    )?;
    let cross_di_ls_j = weighted_crossprod_psi_maps(
        CustomFamilyPsiLinearMapRef::Dense(xmu),
        weights_i.dhmu_ls.view(),
        x_ls_j,
    )?;
    let cross_dj_ls_i = weighted_crossprod_psi_maps(
        CustomFamilyPsiLinearMapRef::Dense(xmu),
        weights_j.dhmu_ls.view(),
        x_ls_i,
    )?;
    let cross_second_weights = xt_diag_y_dense(xmu, &secondweights.d2hmu_ls, x_ls)?;
    let cross_ls_ab = weighted_crossprod_psi_maps(
        CustomFamilyPsiLinearMapRef::Dense(xmu),
        weights_i.hmu_ls.view(),
        x_ls_ab,
    )?;
    let cross_block = cross_mu_ab
        + &cross_mu_i_ls_j
        + &cross_mu_j_ls_i
        + &cross_mu_i_dj
        + &cross_mu_j_di
        + &cross_di_ls_j
        + &cross_dj_ls_i
        + &cross_second_weights
        + &cross_ls_ab;

    // ---- (log σ, log σ) block ----
    let ls_second = weighted_crossprod_psi_maps(
        x_ls_ab,
        weights_i.h_ls_ls.view(),
        CustomFamilyPsiLinearMapRef::Dense(x_ls),
    )?;
    let ls_pair = weighted_crossprod_psi_maps(x_ls_i, weights_i.h_ls_ls.view(), x_ls_j)?;
    let ls_i_dj = weighted_crossprod_psi_maps(
        x_ls_i,
        weights_j.dh_ls_ls.view(),
        CustomFamilyPsiLinearMapRef::Dense(x_ls),
    )?;
    let ls_j_di = weighted_crossprod_psi_maps(
        x_ls_j,
        weights_i.dh_ls_ls.view(),
        CustomFamilyPsiLinearMapRef::Dense(x_ls),
    )?;
    let ls_block = &ls_second
        + &ls_second.t()
        + &ls_pair
        + ls_pair.t()
        + &ls_i_dj
        + ls_i_dj.t()
        + &ls_j_di
        + ls_j_di.t()
        + &xt_diag_x_dense(x_ls, &secondweights.d2h_ls_ls)?;

    Ok(gaussian_pack_joint_symmetrichessian(
        &mu_block,
        &cross_block,
        &ls_block,
    ))
}

/// Assemble the mixed (drift × ψ) Hessian term from
/// `GaussianJointPsiMixedDriftWeights`.
///
/// Per block: `X_ψᵀ D_u X + Xᵀ D_u X_ψ + Xᵀ D_{uψ} X`, where `D_u` holds the
/// drift-direction derivatives of the Hessian weights (`*_u` fields) and
/// `D_{uψ}` the mixed second derivatives (`d2*` fields). The three blocks
/// are packed by `gaussian_pack_joint_symmetrichessian`.
///
/// # Errors
/// Propagates errors from `weighted_crossprod_psi_maps`, `xt_diag_x_dense`
/// and `xt_diag_y_dense`.
fn gaussian_joint_psi_mixedhessian_drift_fromweights(
    xmu: &Array2<f64>,
    x_ls: &Array2<f64>,
    xmu_psi: CustomFamilyPsiLinearMapRef<'_>,
    x_ls_psi: CustomFamilyPsiLinearMapRef<'_>,
    mixedweights: &GaussianJointPsiMixedDriftWeights,
) -> Result<Array2<f64>, String> {
    // Diagonal blocks: form one product and add its transpose.
    let mu_half = weighted_crossprod_psi_maps(
        xmu_psi,
        mixedweights.dhmumu_u.view(),
        CustomFamilyPsiLinearMapRef::Dense(xmu),
    )?;
    let mu_block = &mu_half + &mu_half.t() + &xt_diag_x_dense(xmu, &mixedweights.d2hmumu)?;

    // Cross block: both design-derivative products are formed explicitly.
    let cross_mu_psi = weighted_crossprod_psi_maps(
        xmu_psi,
        mixedweights.dhmu_ls_u.view(),
        CustomFamilyPsiLinearMapRef::Dense(x_ls),
    )?;
    let cross_ls_psi = weighted_crossprod_psi_maps(
        CustomFamilyPsiLinearMapRef::Dense(xmu),
        mixedweights.dhmu_ls_u.view(),
        x_ls_psi,
    )?;
    let cross_second = xt_diag_y_dense(xmu, &mixedweights.d2hmu_ls, x_ls)?;
    let cross_block = cross_mu_psi + &cross_ls_psi + &cross_second;

    let ls_half = weighted_crossprod_psi_maps(
        x_ls_psi,
        mixedweights.dh_ls_ls_u.view(),
        CustomFamilyPsiLinearMapRef::Dense(x_ls),
    )?;
    let ls_block = &ls_half + &ls_half.t() + &xt_diag_x_dense(x_ls, &mixedweights.d2h_ls_ls)?;

    Ok(gaussian_pack_joint_symmetrichessian(
        &mu_block,
        &cross_block,
        &ls_block,
    ))
}

/// Array form of `exp_sigma_derivs_up_to_fourth_scalar`.
///
/// The scalar routine is evaluated for every entry of `eta` in parallel
/// (rayon) and the resulting 5-tuples are split into five arrays, returned in
/// the tuple order of the scalar routine (bound here as σ, d1, d2, d3, d4),
/// each with the same length and row order as `eta`.
#[inline]
fn exp_sigma_derivs_up_to_fourth_array(
    eta: ArrayView1<'_, f64>,
) -> (
    Array1<f64>,
    Array1<f64>,
    Array1<f64>,
    Array1<f64>,
    Array1<f64>,
) {
    use rayon::iter::{IntoParallelIterator, ParallelIterator};
    let n = eta.len();
    let jets: Vec<(f64, f64, f64, f64, f64)> = (0..n)
        .into_par_iter()
        .map(|i| exp_sigma_derivs_up_to_fourth_scalar(eta[i]))
        .collect();

    // Unzip the per-row tuples into one column per derivative order.
    let mut sigma = Vec::<f64>::with_capacity(n);
    let mut d1 = Vec::<f64>::with_capacity(n);
    let mut d2 = Vec::<f64>::with_capacity(n);
    let mut d3 = Vec::<f64>::with_capacity(n);
    let mut d4 = Vec::<f64>::with_capacity(n);
    for (s_i, d1_i, d2_i, d3_i, d4_i) in jets {
        sigma.push(s_i);
        d1.push(d1_i);
        d2.push(d2_i);
        d3.push(d3_i);
        d4.push(d4_i);
    }
    (
        Array1::from_vec(sigma),
        Array1::from_vec(d1),
        Array1::from_vec(d2),
        Array1::from_vec(d3),
        Array1::from_vec(d4),
    )
}

impl GaussianLocationScaleFamily {
    /// Position of the mean (`mu`) block in per-block slices such as
    /// `block_states` and `specs`.
    pub const BLOCK_MU: usize = 0;
    /// Position of the log-scale (`log_sigma`) block in those same per-block slices.
    pub const BLOCK_LOG_SIGMA: usize = 1;

    /// Evaluates the per-row Gaussian joint scalars at the supplied predictors
    /// and returns them behind an `Arc`.
    ///
    /// The scalars are computed from `self.y`, `self.weights` and the two `eta`
    /// vectors on every call; any error from `gaussian_jointrow_scalars` is
    /// propagated.
    fn get_or_compute_row_scalars(
        &self,
        etamu: &Array1<f64>,
        eta_ls: &Array1<f64>,
    ) -> Result<Arc<GaussianJointRowScalars>, String> {
        let scalars = gaussian_jointrow_scalars(&self.y, etamu, eta_ls, &self.weights)?;
        Ok(Arc::new(scalars))
    }

    /// Names of the two distribution parameters, in block order.
    pub fn parameternames() -> &'static [&'static str] {
        const NAMES: [&str; 2] = ["mu", "log_sigma"];
        &NAMES
    }

    /// Link functions of the two distribution parameters, in block order:
    /// identity for `mu`, log for `log_sigma`.
    pub fn parameter_links() -> &'static [ParameterLink] {
        const LINKS: [ParameterLink; 2] = [ParameterLink::Identity, ParameterLink::Log];
        &LINKS
    }

    /// Static description of the family: its name plus the parameter names and
    /// links reported by [`Self::parameternames`] and [`Self::parameter_links`].
    pub fn metadata() -> FamilyMetadata {
        let parameternames = Self::parameternames();
        let parameter_links = Self::parameter_links();
        FamilyMetadata {
            name: "gaussian_location_scale",
            parameternames,
            parameter_links,
        }
    }

    /// True when the family carries both its own mu design and its own
    /// log-sigma design.
    fn exact_joint_supported(&self) -> bool {
        matches!(
            (&self.mu_design, &self.log_sigma_design),
            (Some(_), Some(_))
        )
    }

    /// Resolves the family's own stored designs into `(mu, log_sigma)`
    /// dense-or-operator handles.
    ///
    /// Fails with a descriptive message when either stored design is absent
    /// (the mu design is checked first). The per-block budget comes from
    /// `dense_blocks_planned_budget` over both designs.
    fn exact_block_designs(&self) -> Result<(DenseOrOperator<'_>, DenseOrOperator<'_>), String> {
        let Some(mu_design) = self.mu_design.as_ref() else {
            return Err("GaussianLocationScaleFamily exact path is missing mu design".to_string());
        };
        let Some(log_sigma_design) = self.log_sigma_design.as_ref() else {
            return Err(
                "GaussianLocationScaleFamily exact path is missing log-sigma design".to_string(),
            );
        };
        let budgets = dense_blocks_planned_budget(&[mu_design, log_sigma_design]);
        Ok((
            dense_block_or_operator(
                mu_design,
                mu_design.nrows(),
                mu_design.ncols(),
                budgets[0],
                &self.policy,
            ),
            dense_block_or_operator(
                log_sigma_design,
                log_sigma_design.nrows(),
                log_sigma_design.ncols(),
                budgets[1],
                &self.policy,
            ),
        ))
    }

    /// Same resolution as the stored-design path, but reading the designs from
    /// the caller's block specs (`specs[BLOCK_MU]`, `specs[BLOCK_LOG_SIGMA]`).
    ///
    /// Returns a `DimensionMismatch` error unless exactly two specs are given.
    fn exact_block_designs_fromspecs<'a>(
        &self,
        specs: &'a [ParameterBlockSpec],
    ) -> Result<(DenseOrOperator<'a>, DenseOrOperator<'a>), String> {
        let spec_count = specs.len();
        if spec_count != 2 {
            let reason = format!(
                "GaussianLocationScaleFamily spec-aware exact path expects 2 specs, got {}",
                spec_count
            );
            return Err(GamlssError::DimensionMismatch { reason }.into());
        }
        let mu_design = &specs[Self::BLOCK_MU].design;
        let log_sigma_design = &specs[Self::BLOCK_LOG_SIGMA].design;
        let budgets = dense_blocks_planned_budget(&[mu_design, log_sigma_design]);
        Ok((
            dense_block_or_operator(
                mu_design,
                mu_design.nrows(),
                mu_design.ncols(),
                budgets[0],
                &self.policy,
            ),
            dense_block_or_operator(
                log_sigma_design,
                log_sigma_design.nrows(),
                log_sigma_design.ncols(),
                budgets[1],
                &self.policy,
            ),
        ))
    }

    /// Picks the design pair for the exact joint path.
    ///
    /// The family's own stored designs win whenever both are present; otherwise
    /// the designs are taken from `specs` if supplied. With neither source
    /// available the result is `Ok(None)`.
    fn exact_joint_block_designs<'a>(
        &'a self,
        specs: Option<&'a [ParameterBlockSpec]>,
    ) -> Result<Option<(DenseOrOperator<'a>, DenseOrOperator<'a>)>, String> {
        if self.exact_joint_supported() {
            self.exact_block_designs().map(Some)
        } else {
            specs
                .map(|block_specs| self.exact_block_designs_fromspecs(block_specs))
                .transpose()
        }
    }

    fn exact_joint_dense_block_designs<'a>(
        &'a self,
        specs: Option<&'a [ParameterBlockSpec]>,
    ) -> Result<Option<(Cow<'a, Array2<f64>>, Cow<'a, Array2<f64>>)>, String> {
        let Some((xmu, x_ls)) = self.exact_joint_block_designs(specs)? else {
            return Ok(None);
        };
        let xmu = match xmu {
            DenseOrOperator::Borrowed(dense) => Cow::Borrowed(dense),
            DenseOrOperator::Owned(dense) => Cow::Owned(dense),
            DenseOrOperator::Operator(_) => {
                return Err(
                    "GaussianLocationScaleFamily exact psi path requires chunked operator support for oversized designs"
                        .to_string(),
                );
            }
        };
        let x_ls = match x_ls {
            DenseOrOperator::Borrowed(dense) => Cow::Borrowed(dense),
            DenseOrOperator::Owned(dense) => Cow::Owned(dense),
            DenseOrOperator::Operator(_) => {
                return Err(
                    "GaussianLocationScaleFamily exact psi path requires chunked operator support for oversized designs"
                        .to_string(),
                );
            }
        };
        Ok(Some((xmu, x_ls)))
    }

    /// Joint Hessian using whichever designs [`Self::exact_joint_block_designs`]
    /// selects; `Ok(None)` when no designs are available.
    fn exact_newton_joint_hessian_for_specs(
        &self,
        block_states: &[ParameterBlockState],
        specs: Option<&[ParameterBlockSpec]>,
    ) -> Result<Option<Array2<f64>>, String> {
        match self.exact_joint_block_designs(specs)? {
            None => Ok(None),
            Some((xmu, x_ls)) => {
                self.exact_newton_joint_hessian_from_designs(block_states, &xmu, &x_ls)
            }
        }
    }

    /// First directional derivative of the joint Hessian along `d_beta_flat`,
    /// using whichever designs [`Self::exact_joint_block_designs`] selects;
    /// `Ok(None)` when no designs are available.
    fn exact_newton_joint_hessian_directional_derivative_for_specs(
        &self,
        block_states: &[ParameterBlockState],
        specs: Option<&[ParameterBlockSpec]>,
        d_beta_flat: &Array1<f64>,
    ) -> Result<Option<Array2<f64>>, String> {
        match self.exact_joint_block_designs(specs)? {
            None => Ok(None),
            Some((xmu, x_ls)) => self
                .exact_newton_joint_hessian_directional_derivative_from_designs(
                    block_states,
                    &xmu,
                    &x_ls,
                    d_beta_flat,
                ),
        }
    }

    /// Second directional derivative of the joint Hessian along the pair
    /// (`d_beta_u_flat`, `d_betav_flat`), using whichever designs
    /// [`Self::exact_joint_block_designs`] selects; `Ok(None)` when no designs
    /// are available.
    fn exact_newton_joint_hessian_second_directional_derivative_for_specs(
        &self,
        block_states: &[ParameterBlockState],
        specs: Option<&[ParameterBlockSpec]>,
        d_beta_u_flat: &Array1<f64>,
        d_betav_flat: &Array1<f64>,
    ) -> Result<Option<Array2<f64>>, String> {
        match self.exact_joint_block_designs(specs)? {
            None => Ok(None),
            Some((xmu, x_ls)) => self
                .exact_newton_joint_hessiansecond_directional_derivative_from_designs(
                    block_states,
                    &xmu,
                    &x_ls,
                    d_beta_u_flat,
                    d_betav_flat,
                ),
        }
    }

    /// First-order psi terms for `psi_index`, evaluated on the dense designs
    /// chosen by [`Self::exact_joint_dense_block_designs`]; `Ok(None)` when no
    /// designs are available.
    fn exact_newton_joint_psi_terms_for_specs(
        &self,
        block_states: &[ParameterBlockState],
        specs: &[ParameterBlockSpec],
        derivative_blocks: &[Vec<crate::custom_family::CustomFamilyBlockPsiDerivative>],
        psi_index: usize,
    ) -> Result<Option<crate::custom_family::ExactNewtonJointPsiTerms>, String> {
        match self.exact_joint_dense_block_designs(Some(specs))? {
            None => Ok(None),
            Some((xmu, x_ls)) => self.exact_newton_joint_psi_terms_from_designs(
                block_states,
                specs,
                derivative_blocks,
                psi_index,
                &xmu,
                &x_ls,
            ),
        }
    }

    /// Second-order psi terms for the pair (`psi_i`, `psi_j`), evaluated on the
    /// dense designs chosen by [`Self::exact_joint_dense_block_designs`];
    /// `Ok(None)` when no designs are available.
    fn exact_newton_joint_psisecond_order_terms_for_specs(
        &self,
        block_states: &[ParameterBlockState],
        specs: &[ParameterBlockSpec],
        derivative_blocks: &[Vec<crate::custom_family::CustomFamilyBlockPsiDerivative>],
        psi_i: usize,
        psi_j: usize,
    ) -> Result<Option<crate::custom_family::ExactNewtonJointPsiSecondOrderTerms>, String> {
        match self.exact_joint_dense_block_designs(Some(specs))? {
            None => Ok(None),
            Some((xmu, x_ls)) => self.exact_newton_joint_psisecond_order_terms_from_designs(
                block_states,
                derivative_blocks,
                psi_i,
                psi_j,
                &xmu,
                &x_ls,
            ),
        }
    }

    /// Directional derivative (along `d_beta_flat`) of the psi Hessian for
    /// `psi_index`, evaluated on the dense designs chosen by
    /// [`Self::exact_joint_dense_block_designs`]; `Ok(None)` when no designs
    /// are available.
    fn exact_newton_joint_psihessian_directional_derivative_for_specs(
        &self,
        block_states: &[ParameterBlockState],
        specs: &[ParameterBlockSpec],
        derivative_blocks: &[Vec<crate::custom_family::CustomFamilyBlockPsiDerivative>],
        psi_index: usize,
        d_beta_flat: &Array1<f64>,
    ) -> Result<Option<Array2<f64>>, String> {
        match self.exact_joint_dense_block_designs(Some(specs))? {
            None => Ok(None),
            Some((xmu, x_ls)) => self
                .exact_newton_joint_psihessian_directional_derivative_from_designs(
                    block_states,
                    derivative_blocks,
                    psi_index,
                    d_beta_flat,
                    &xmu,
                    &x_ls,
                ),
        }
    }

    /// Builds the joint Hessian from the given designs and the current block
    /// predictors.
    ///
    /// Errors with `DimensionMismatch` unless there are exactly two block
    /// states and both `eta` vectors and `self.weights` have `self.y.len()`
    /// entries.
    fn exact_newton_joint_hessian_from_designs(
        &self,
        block_states: &[ParameterBlockState],
        xmu: &DenseOrOperator<'_>,
        x_ls: &DenseOrOperator<'_>,
    ) -> Result<Option<Array2<f64>>, String> {
        let block_count = block_states.len();
        if block_count != 2 {
            let reason = format!(
                "GaussianLocationScaleFamily expects 2 blocks, got {}",
                block_count
            );
            return Err(GamlssError::DimensionMismatch { reason }.into());
        }

        let etamu = &block_states[Self::BLOCK_MU].eta;
        let eta_ls = &block_states[Self::BLOCK_LOG_SIGMA].eta;
        let n = self.y.len();
        let sizes_agree = [etamu.len(), eta_ls.len(), self.weights.len()]
            .iter()
            .all(|&len| len == n);
        if !sizes_agree {
            let reason = "GaussianLocationScaleFamily input size mismatch".to_string();
            return Err(GamlssError::DimensionMismatch { reason }.into());
        }

        let rows = self.get_or_compute_row_scalars(etamu, eta_ls)?;
        // The row coefficients come from the shared Fisher constructor
        // (`gaussian_locscale_fisher_joint_row_coeffs`): block-diagonal Gaussian
        // Fisher curvature with a zero cross block (μ ⊥ σ, #684) and
        // (ls, ls) = 2κ²a (#566). Using the single constructor keeps this dense
        // path and the matrix-free workspace in agreement on the cross block.
        let (mm, cross, scale) = gaussian_locscale_fisher_joint_row_coeffs(&rows);
        let hessian = gaussian_joint_hessian_from_designs(xmu, x_ls, &mm, &cross, &scale)?;
        Ok(Some(hessian))
    }

    /// First directional derivative of the joint Hessian along `d_beta_flat`
    /// (laid out as `[mu coefficients; log-sigma coefficients]`).
    ///
    /// Errors with `DimensionMismatch` on a wrong block count, mismatched row
    /// counts, or a direction whose length differs from the combined column
    /// count of the two designs.
    fn exact_newton_joint_hessian_directional_derivative_from_designs(
        &self,
        block_states: &[ParameterBlockState],
        xmu: &DenseOrOperator<'_>,
        x_ls: &DenseOrOperator<'_>,
        d_beta_flat: &Array1<f64>,
    ) -> Result<Option<Array2<f64>>, String> {
        let block_count = block_states.len();
        if block_count != 2 {
            let reason = format!(
                "GaussianLocationScaleFamily expects 2 blocks, got {}",
                block_count
            );
            return Err(GamlssError::DimensionMismatch { reason }.into());
        }

        let etamu = &block_states[Self::BLOCK_MU].eta;
        let eta_ls = &block_states[Self::BLOCK_LOG_SIGMA].eta;
        let n = self.y.len();
        let sizes_agree = [etamu.len(), eta_ls.len(), self.weights.len()]
            .iter()
            .all(|&len| len == n);
        if !sizes_agree {
            let reason = "GaussianLocationScaleFamily input size mismatch".to_string();
            return Err(GamlssError::DimensionMismatch { reason }.into());
        }

        let pmu = xmu.ncols();
        let p_ls = x_ls.ncols();
        let total = pmu + p_ls;
        let direction_len = d_beta_flat.len();
        if direction_len != total {
            let reason = format!(
                "GaussianLocationScaleFamily joint d_beta length mismatch: got {}, expected {}",
                direction_len, total
            );
            return Err(GamlssError::DimensionMismatch { reason }.into());
        }

        // Push each half of the direction through its design.
        let ximu = xmu.dot(d_beta_flat.slice(s![0..pmu]));
        let xi_ls = x_ls.dot(d_beta_flat.slice(s![pmu..total]));
        let rows = self.get_or_compute_row_scalars(etamu, eta_ls)?;
        let directional = gaussian_joint_first_directionalweights(&rows, &ximu, &xi_ls);
        let dhmumu = directional.0;
        let dh_ls_ls = directional.2;
        // The Fisher cross block E[H_{μ,ls}] is identically 0 (μ ⊥ σ; see
        // exact_newton_joint_hessian_from_designs / #684), hence so is its
        // directional derivative. The observed-cross weight (`directional.1`)
        // is deliberately left out so the curvature object stays the
        // block-diagonal Gaussian Fisher information at every order.
        let dhmu_ls = Array1::<f64>::zeros(dhmumu.len());

        let derivative =
            gaussian_joint_hessian_from_designs(xmu, x_ls, &dhmumu, &dhmu_ls, &dh_ls_ls)?;
        Ok(Some(derivative))
    }

    /// Second directional derivative of the joint Hessian along the pair
    /// (`d_beta_u_flat`, `d_betav_flat`), each laid out as
    /// `[mu coefficients; log-sigma coefficients]`.
    ///
    /// Errors with `DimensionMismatch` on a wrong block count, mismatched row
    /// counts, or a direction whose length differs from the combined column
    /// count of the two designs.
    fn exact_newton_joint_hessiansecond_directional_derivative_from_designs(
        &self,
        block_states: &[ParameterBlockState],
        xmu: &DenseOrOperator<'_>,
        x_ls: &DenseOrOperator<'_>,
        d_beta_u_flat: &Array1<f64>,
        d_betav_flat: &Array1<f64>,
    ) -> Result<Option<Array2<f64>>, String> {
        let block_count = block_states.len();
        if block_count != 2 {
            let reason = format!(
                "GaussianLocationScaleFamily expects 2 blocks, got {}",
                block_count
            );
            return Err(GamlssError::DimensionMismatch { reason }.into());
        }

        let etamu = &block_states[Self::BLOCK_MU].eta;
        let eta_ls = &block_states[Self::BLOCK_LOG_SIGMA].eta;
        let n = self.y.len();
        let sizes_agree = [etamu.len(), eta_ls.len(), self.weights.len()]
            .iter()
            .all(|&len| len == n);
        if !sizes_agree {
            let reason = "GaussianLocationScaleFamily input size mismatch".to_string();
            return Err(GamlssError::DimensionMismatch { reason }.into());
        }

        let pmu = xmu.ncols();
        let p_ls = x_ls.ncols();
        let total = pmu + p_ls;
        let (len_u, len_v) = (d_beta_u_flat.len(), d_betav_flat.len());
        if len_u != total || len_v != total {
            let reason = format!(
                "GaussianLocationScaleFamily joint second directional derivative length mismatch: got {} and {}, expected {}",
                len_u, len_v, total
            );
            return Err(GamlssError::DimensionMismatch { reason }.into());
        }

        // Push each half of both directions through its design.
        let ximu_u = xmu.dot(d_beta_u_flat.slice(s![0..pmu]));
        let xi_ls_u = x_ls.dot(d_beta_u_flat.slice(s![pmu..total]));
        let ximuv = xmu.dot(d_betav_flat.slice(s![0..pmu]));
        let xi_lsv = x_ls.dot(d_betav_flat.slice(s![pmu..total]));
        let rows = self.get_or_compute_row_scalars(etamu, eta_ls)?;
        let second =
            gaussian_jointsecond_directionalweights(&rows, &ximu_u, &xi_ls_u, &ximuv, &xi_lsv);
        let d2hmumu = second.0;
        let d2h_ls_ls = second.2;
        // The Fisher cross block E[H_{μ,ls}] is identically 0 (μ ⊥ σ; #684), so
        // its second directional derivative vanishes too. The observed weight
        // `second.1` is deliberately not assembled, which keeps the curvature
        // object block-diagonal Fisher.
        let d2hmu_ls = Array1::<f64>::zeros(d2hmumu.len());

        let derivative =
            gaussian_joint_hessian_from_designs(xmu, x_ls, &d2hmumu, &d2hmu_ls, &d2h_ls_ls)?;
        Ok(Some(derivative))
    }

    /// Resolves the psi direction for `psi_index` by delegating to
    /// `locscale_joint_psi_direction_parts` with this family's block layout,
    /// then repackages the parts as a `LocationScaleJointPsiDirection`.
    ///
    /// `Ok(None)` is forwarded when the shared routine finds no direction.
    fn exact_newton_joint_psi_direction(
        &self,
        block_states: &[ParameterBlockState],
        derivative_blocks: &[Vec<crate::custom_family::CustomFamilyBlockPsiDerivative>],
        psi_index: usize,
        xmu: &Array2<f64>,
        x_ls: &Array2<f64>,
        policy: &crate::resource::ResourcePolicy,
    ) -> Result<Option<LocationScaleJointPsiDirection>, String> {
        let resolved = locscale_joint_psi_direction_parts(
            block_states,
            derivative_blocks,
            psi_index,
            self.y.len(),
            xmu.ncols(),
            x_ls.ncols(),
            Self::BLOCK_MU,
            Self::BLOCK_LOG_SIGMA,
            2,
            "GaussianLocationScaleFamily",
            "mu",
            policy,
        )?;
        Ok(resolved.map(|parts| LocationScaleJointPsiDirection {
            block_idx: parts.block_idx,
            local_idx: parts.local_idx,
            z_primary_psi: parts.primary_z,
            z_ls_psi: parts.log_sigma_z,
            x_primary_psi: parts.primary_psi,
            x_ls_psi: parts.log_sigma_psi,
        }))
    }

    /// Second-order design drifts for the psi pair (`psi_a`, `psi_b`), obtained
    /// from `locscale_joint_psisecond_design_drifts` configured with this
    /// family's dimensions, block indices, labels and resource policy.
    fn exact_newton_joint_psisecond_design_drifts(
        &self,
        block_states: &[ParameterBlockState],
        derivative_blocks: &[Vec<crate::custom_family::CustomFamilyBlockPsiDerivative>],
        psi_a: &LocationScaleJointPsiDirection,
        psi_b: &LocationScaleJointPsiDirection,
        xmu: &Array2<f64>,
        x_ls: &Array2<f64>,
    ) -> Result<LocationScaleJointPsiSecondDrifts, String> {
        let config = LocScalePsiDriftConfig {
            n: self.y.len(),
            p_primary: xmu.ncols(),
            p_log_sigma: x_ls.ncols(),
            primary_block_idx: Self::BLOCK_MU,
            log_sigma_block_idx: Self::BLOCK_LOG_SIGMA,
            family_name: "GaussianLocationScaleFamily",
            primary_label: "mu",
            policy: &self.policy,
        };
        locscale_joint_psisecond_design_drifts(
            block_states,
            derivative_blocks,
            psi_a,
            psi_b,
            config,
        )
    }

    /// Assembles the likelihood-only first-order psi terms (objective, score and
    /// Hessian) for hyperparameter `psi_index` from dense mu / log-sigma designs.
    ///
    /// Returns `Ok(None)` when `exact_newton_joint_psi_direction` yields no
    /// direction for `psi_index`. Errors with `DimensionMismatch` unless there
    /// are exactly two block states, two specs and two derivative blocks.
    ///
    /// If `build_two_block_custom_family_joint_psi_operator_from_actions`
    /// produces an operator, `hessian_psi` is returned as an empty `0 × 0`
    /// placeholder alongside that operator; otherwise the dense Hessian is built
    /// by `gaussian_joint_psihessian_fromweights` and the operator slot is `None`.
    fn exact_newton_joint_psi_terms_from_designs(
        &self,
        block_states: &[ParameterBlockState],
        specs: &[ParameterBlockSpec],
        derivative_blocks: &[Vec<crate::custom_family::CustomFamilyBlockPsiDerivative>],
        psi_index: usize,
        xmu: &Array2<f64>,
        x_ls: &Array2<f64>,
    ) -> Result<Option<crate::custom_family::ExactNewtonJointPsiTerms>, String> {
        if block_states.len() != 2 {
            return Err(GamlssError::DimensionMismatch {
                reason: format!(
                    "GaussianLocationScaleFamily expects 2 blocks, got {}",
                    block_states.len()
                ),
            }
            .into());
        }
        if specs.len() != 2 || derivative_blocks.len() != 2 {
            return Err(GamlssError::DimensionMismatch { reason: format!(
                "GaussianLocationScaleFamily joint psi terms expect 2 specs and 2 derivative blocks, got {} and {}",
                specs.len(),
                derivative_blocks.len()
            ) }.into());
        }
        let Some(dir_a) = self.exact_newton_joint_psi_direction(
            block_states,
            derivative_blocks,
            psi_index,
            xmu,
            x_ls,
            &self.policy,
        )?
        else {
            return Ok(None);
        };
        // Gaussian 2-block location-scale family in the unified flattened
        // coefficient space beta = [betamu; beta_sigma]:
        //
        //   mu_i = z_i^T betamu,
        //   ell_i = x_i^T beta_sigma,
        //   s_i = exp(ell_i),
        //   r_i = y_i - mu_i,
        //   q_i = r_i / s_i,
        //   w_i = s_i^{-2},
        //   alpha_i = r_i s_i^{-2},
        //   b_i = q_i^2.
        //
        // The first fixed-beta psi object returned here is likelihood-only:
        //
        //   D_a         = -alpha^T m_a + (1 - b)^T ell_a
        //   D_{beta a}  = [ -Xmu^T alpha_a - X_{mu,a}^T alpha ;
        //                   -X_sigma^T b_a + X_{sigma,a}^T (1-b) ]
        //   D_{bb a}    = [ Xmu^T W_a Xmu + X_{mu,a}^T W Xmu + Xmu^T W X_{mu,a},
        //                   2( Xmu^T A_a X_sigma + X_{mu,a}^T A X_sigma + Xmu^T A X_{sigma,a} );
        //                   sym,
        //                   2( X_sigma^T B_a X_sigma + X_{sigma,a}^T B X_sigma + X_sigma^T B X_{sigma,a} ) ]
        //
        // with m_a = X_{mu,a} betamu, ell_a = X_{sigma,a} beta_sigma and
        // rowwise scalar drifts
        //
        //   w_a     = -2 w * ell_a
        //   alpha_a = -w * m_a - 2 alpha * ell_a
        //   b_a     = -2 alpha * m_a - 2 b * ell_a.
        //
        // Generic code in custom_family.rs promotes these likelihood-only
        // objects to the full fixed-beta V_a / g_a / H_a by adding S_a.
        let etamu = &block_states[Self::BLOCK_MU].eta;
        let eta_ls = &block_states[Self::BLOCK_LOG_SIGMA].eta;
        let rows = self.get_or_compute_row_scalars(etamu, eta_ls)?;
        let weights_a =
            gaussian_joint_psi_firstweights(&rows, &dir_a.z_primary_psi, &dir_a.z_ls_psi);
        // Scalar objective drift: sum of the per-row psi contributions.
        let objective_psi = weights_a.objective_psirow.sum();
        let xmu_map = dir_a.x_primary_psi.as_linear_map_ref();
        let x_ls_map = dir_a.x_ls_psi.as_linear_map_ref();
        // Each score block combines the psi-design map applied to the base score
        // weights with the dense design applied to the drifted score weights.
        let score_mu =
            xmu_map.transpose_mul(weights_a.scoremu.view()) + fast_atv(xmu, &weights_a.dscoremu);
        let score_ls = x_ls_map.transpose_mul(weights_a.score_ls.view())
            + fast_atv(x_ls, &weights_a.dscore_ls);
        let score_psi = gaussian_pack_joint_score(&score_mu, &score_ls);
        let hessian_psi_operator = build_two_block_custom_family_joint_psi_operator_from_actions(
            dir_a.x_primary_psi.cloned_first_action(),
            dir_a.x_ls_psi.cloned_first_action(),
            0..xmu.ncols(),
            xmu.ncols()..xmu.ncols() + x_ls.ncols(),
            xmu,
            x_ls,
            &weights_a.hmumu,
            &weights_a.hmu_ls,
            &weights_a.h_ls_ls,
            &weights_a.dhmumu,
            &weights_a.dhmu_ls,
            &weights_a.dh_ls_ls,
        )?;
        // With an operator available the dense matrix is skipped and an empty
        // placeholder is returned in its place.
        let hessian_psi = if hessian_psi_operator.is_some() {
            Array2::zeros((0, 0))
        } else {
            gaussian_joint_psihessian_fromweights(xmu, x_ls, xmu_map, x_ls_map, &weights_a)?
        };

        Ok(Some(crate::custom_family::ExactNewtonJointPsiTerms {
            objective_psi,
            score_psi,
            hessian_psi,
            hessian_psi_operator,
        }))
    }

    /// Second-order psi terms for the pair (`psi_i`, `psi_j`) on dense designs.
    ///
    /// Resolves the direction for `psi_i` and then for `psi_j`; if either is
    /// absent the result is `Ok(None)` (the `psi_j` lookup is skipped when
    /// `psi_i` has no direction). Otherwise the full-data terms are assembled
    /// without a subsample.
    fn exact_newton_joint_psisecond_order_terms_from_designs(
        &self,
        block_states: &[ParameterBlockState],
        derivative_blocks: &[Vec<crate::custom_family::CustomFamilyBlockPsiDerivative>],
        psi_i: usize,
        psi_j: usize,
        xmu: &Array2<f64>,
        x_ls: &Array2<f64>,
    ) -> Result<Option<crate::custom_family::ExactNewtonJointPsiSecondOrderTerms>, String> {
        let direction_for = |psi: usize| {
            self.exact_newton_joint_psi_direction(
                block_states,
                derivative_blocks,
                psi,
                xmu,
                x_ls,
                &self.policy,
            )
        };
        let Some(dir_i) = direction_for(psi_i)? else {
            return Ok(None);
        };
        let Some(dir_j) = direction_for(psi_j)? else {
            return Ok(None);
        };
        self.exact_newton_joint_psisecond_order_terms_from_parts(
            block_states,
            derivative_blocks,
            &dir_i,
            &dir_j,
            xmu,
            x_ls,
            None,
        )
        .map(Some)
    }

    /// Assembles the likelihood-only second-order psi terms (objective, score
    /// and dense Hessian) for two already-resolved psi directions.
    ///
    /// Steps, in order: compute the second-order design drifts for the pair,
    /// wrap them as linear maps, evaluate the row scalars at the current
    /// predictors, build the first-order weights for each direction and the
    /// second-order weights for the pair, optionally mask all three weight sets
    /// with `subsample`, then form the objective sum, the packed score and the
    /// Hessian.
    ///
    /// `subsample`: when `Some`, the `apply_ht_mask_*` helpers are applied to
    /// the weights before any assembly; when `None` the full data is used.
    ///
    /// The returned `hessian_psi_psi_operator` is always `None`.
    ///
    /// NOTE(review): `block_states` is indexed at `BLOCK_MU` / `BLOCK_LOG_SIGMA`
    /// without a length check in this method — presumably callers (or the
    /// direction/drift helpers) have already validated it; confirm.
    fn exact_newton_joint_psisecond_order_terms_from_parts(
        &self,
        block_states: &[ParameterBlockState],
        derivative_blocks: &[Vec<crate::custom_family::CustomFamilyBlockPsiDerivative>],
        dir_i: &LocationScaleJointPsiDirection,
        dir_j: &LocationScaleJointPsiDirection,
        xmu: &Array2<f64>,
        x_ls: &Array2<f64>,
        subsample: Option<&[crate::families::marginal_slope_shared::WeightedOuterRow]>,
    ) -> Result<crate::custom_family::ExactNewtonJointPsiSecondOrderTerms, String> {
        let second_drifts = self.exact_newton_joint_psisecond_design_drifts(
            block_states,
            derivative_blocks,
            dir_i,
            dir_j,
            xmu,
            x_ls,
        )?;
        let n = self.y.len();
        let xmu_i_map = dir_i.x_primary_psi.as_linear_map_ref();
        let x_ls_i_map = dir_i.x_ls_psi.as_linear_map_ref();
        let xmu_j_map = dir_j.x_primary_psi.as_linear_map_ref();
        let x_ls_j_map = dir_j.x_ls_psi.as_linear_map_ref();
        let xmu_ab_map = second_psi_linear_map(
            second_drifts.x_primary_ab_action.as_ref(),
            second_drifts.x_primary_ab.as_ref(),
            n,
            xmu.ncols(),
        );
        let x_ls_ab_map = second_psi_linear_map(
            second_drifts.x_ls_ab_action.as_ref(),
            second_drifts.x_ls_ab.as_ref(),
            n,
            x_ls.ncols(),
        );
        // Second fixed-beta psi objects for the same Gaussian location-scale
        // kernel. Using the notation from the first-order comment, the rowwise
        // second psi drifts are
        //
        //   w_ab     = 4 w * ell_a * ell_b - 2 w * ell_ab
        //   alpha_ab = 2 w * (m_a * ell_b + m_b * ell_a)
        //              + 4 alpha * ell_a * ell_b
        //              - w * m_ab
        //              - 2 alpha * ell_ab
        //   b_ab     = 2 w * m_a * m_b
        //              + 4 alpha * (m_a * ell_b + m_b * ell_a)
        //              + 4 b * ell_a * ell_b
        //              - 2 alpha * m_ab
        //              - 2 b * ell_ab.
        //
        // The exact likelihood-only second-order objects are then:
        //
        //   D_ab,
        //   D_{beta ab},
        //   D_{beta beta ab},
        //
        // assembled from the usual product-rule expansion over realized
        // design motion X_{.,a}, X_{.,b}, X_{.,ab}. Generic code adds S_ab.
        let etamu = &block_states[Self::BLOCK_MU].eta;
        let eta_ls = &block_states[Self::BLOCK_LOG_SIGMA].eta;
        let rows = self.get_or_compute_row_scalars(etamu, eta_ls)?;
        let mut weights_i =
            gaussian_joint_psi_firstweights(&rows, &dir_i.z_primary_psi, &dir_i.z_ls_psi);
        let mut weights_j =
            gaussian_joint_psi_firstweights(&rows, &dir_j.z_primary_psi, &dir_j.z_ls_psi);
        let mut secondweights = gaussian_joint_psisecondweights(
            &rows,
            &dir_i.z_primary_psi,
            &dir_i.z_ls_psi,
            &dir_j.z_primary_psi,
            &dir_j.z_ls_psi,
            &second_drifts.z_primary_ab,
            &second_drifts.z_ls_ab,
        );
        if let Some(sub_rows) = subsample {
            // HT mask: every downstream consumer (gaussian_joint_psisecondhessian_fromweights,
            // weighted_crossprod_psi_maps with weights_*.{hmumu,hmu_ls,h_ls_ls},
            // fast_atv on d2score_* and dscore_*) is row-linear in these arrays, so
            // scaling sampled rows by 1/π_i and zeroing the rest yields an unbiased
            // estimator of the full-data second-order ψ Hessian and ψ score.
            apply_ht_mask_first(&mut weights_i, sub_rows);
            apply_ht_mask_first(&mut weights_j, sub_rows);
            apply_ht_mask_second(&mut secondweights, sub_rows);
        }
        // Scalar second-order objective drift: sum of the (possibly masked)
        // per-row contributions.
        let objective_psi_psi = secondweights.objective_psi_psirow.sum();

        // Score: four terms per block — second-order design map on the base
        // score weights, each first-order map on the other direction's drifted
        // score weights, and the dense design on the second-order score weights.
        let score_psi_psi = gaussian_pack_joint_score(
            &(xmu_ab_map.transpose_mul(weights_i.scoremu.view())
                + xmu_i_map.transpose_mul(weights_j.dscoremu.view())
                + xmu_j_map.transpose_mul(weights_i.dscoremu.view())
                + fast_atv(xmu, &secondweights.d2scoremu)),
            &(x_ls_ab_map.transpose_mul(weights_i.score_ls.view())
                + x_ls_i_map.transpose_mul(weights_j.dscore_ls.view())
                + x_ls_j_map.transpose_mul(weights_i.dscore_ls.view())
                + fast_atv(x_ls, &secondweights.d2score_ls)),
        );
        let hessian_psi_psi = gaussian_joint_psisecondhessian_fromweights(
            xmu,
            x_ls,
            xmu_i_map,
            x_ls_i_map,
            xmu_j_map,
            x_ls_j_map,
            xmu_ab_map,
            x_ls_ab_map,
            &weights_i,
            &weights_j,
            &secondweights,
        )?;

        Ok(crate::custom_family::ExactNewtonJointPsiSecondOrderTerms {
            objective_psi_psi,
            score_psi_psi,
            hessian_psi_psi,
            hessian_psi_psi_operator: None,
        })
    }

    /// Directional derivative (along `d_beta_flat`) of the psi Hessian for
    /// `psi_index` on dense designs.
    ///
    /// `Ok(None)` when no psi direction exists for `psi_index`; otherwise the
    /// full-data drift is assembled without a subsample.
    fn exact_newton_joint_psihessian_directional_derivative_from_designs(
        &self,
        block_states: &[ParameterBlockState],
        derivative_blocks: &[Vec<crate::custom_family::CustomFamilyBlockPsiDerivative>],
        psi_index: usize,
        d_beta_flat: &Array1<f64>,
        xmu: &Array2<f64>,
        x_ls: &Array2<f64>,
    ) -> Result<Option<Array2<f64>>, String> {
        let direction = self.exact_newton_joint_psi_direction(
            block_states,
            derivative_blocks,
            psi_index,
            xmu,
            x_ls,
            &self.policy,
        )?;
        match direction {
            None => Ok(None),
            Some(dir_a) => self
                .exact_newton_joint_psihessian_directional_derivative_from_parts(
                    block_states,
                    &dir_a,
                    d_beta_flat,
                    xmu,
                    x_ls,
                    None,
                )
                .map(Some),
        }
    }

    /// Assembles the mixed drift of the psi Hessian along `d_beta_flat` for an
    /// already-resolved psi direction `dir_a`.
    ///
    /// `d_beta_flat` must have `xmu.ncols() + x_ls.ncols()` entries, otherwise a
    /// `DimensionMismatch` error is returned. Only its log-sigma slice
    /// (`pmu..pmu + p_ls`) is read here; the mu slice is not used.
    ///
    /// `subsample`: when `Some`, `apply_ht_mask_mixed` is applied to the mixed
    /// weights before the Hessian drift is built; when `None` the full data is
    /// used.
    ///
    /// NOTE(review): `block_states` is indexed at `BLOCK_MU` / `BLOCK_LOG_SIGMA`
    /// without a length check in this method — presumably validated by the
    /// caller when it resolved `dir_a`; confirm.
    fn exact_newton_joint_psihessian_directional_derivative_from_parts(
        &self,
        block_states: &[ParameterBlockState],
        dir_a: &LocationScaleJointPsiDirection,
        d_beta_flat: &Array1<f64>,
        xmu: &Array2<f64>,
        x_ls: &Array2<f64>,
        subsample: Option<&[crate::families::marginal_slope_shared::WeightedOuterRow]>,
    ) -> Result<Array2<f64>, String> {
        let etamu = &block_states[Self::BLOCK_MU].eta;
        let eta_ls = &block_states[Self::BLOCK_LOG_SIGMA].eta;
        let pmu = xmu.ncols();
        let p_ls = x_ls.ncols();
        let xmu_map = dir_a.x_primary_psi.as_linear_map_ref();
        let x_ls_map = dir_a.x_ls_psi.as_linear_map_ref();
        let total = pmu + p_ls;
        if d_beta_flat.len() != total {
            return Err(GamlssError::DimensionMismatch { reason: format!(
                "GaussianLocationScaleFamily joint psi hessian directional derivative length mismatch: got {}, expected {}",
                d_beta_flat.len(),
                total
            ) }.into());
        }
        // Only the log-σ–channel direction enters the surviving Fisher blocks
        // of the mixed drift (the μ-channel direction fed the observed cross
        // block, now Fisher 0; μ ⊥ σ, #684).
        let u_ls = d_beta_flat.slice(s![pmu..pmu + p_ls]);
        // Log-sigma direction pushed through the dense design and through the
        // psi-derivative design map, respectively.
        let xi_ls = fast_av(x_ls, &u_ls);
        let uza_ls = x_ls_map.forward_mul(u_ls);
        // Mixed drift T_a[u] = D_beta H_a^{(D)}[u] for the Gaussian family.
        //
        // Along u = [umu; u_sigma], define xi = Xmu umu and zeta = X_sigma u_sigma.
        // The first beta-directional drifts of the Gaussian row scalars are
        //
        //   d_u w     = -2 w * zeta
        //   d_u alpha = -w * xi - 2 alpha * zeta
        //   d_u b     = -2 alpha * xi - 2 b * zeta.
        //
        // Differentiating the psi-a scalar drifts once more gives
        //
        //   d_u w_a     = 4 w * ell_a * zeta - 2 w * zeta_a
        //   d_u alpha_a = 2 w * (m_a * zeta + ell_a * xi)
        //                 - w * xi_a
        //                 + 4 alpha * ell_a * zeta
        //                 - 2 alpha * zeta_a
        //   d_u b_a     = 2 w * m_a * xi
        //                 + 4 alpha * (m_a * zeta + ell_a * xi)
        //                 + 4 b * ell_a * zeta
        //                 - 2 alpha * xi_a
        //                 - 2 b * zeta_a.
        //
        // The matrix drift returned here is the exact likelihood-only
        //
        //   T_a[u] = D_beta H_{psi_a}^{(D)}[u],
        //
        // assembled blockwise as
        //
        //   Kmumu,a[u]   = Xmu^T W_a[u] Xmu
        //                   + X_{mu,a}^T W[u] Xmu
        //                   + Xmu^T W[u] X_{mu,a}
        //   Kmusigma,a[u]= 2( Xmu^T A_a[u] X_sigma
        //                   + X_{mu,a}^T A[u] X_sigma
        //                   + Xmu^T A[u] X_{sigma,a} )
        //   K_sigmasigma,a[u]
        //                   = 2( X_sigma^T B_a[u] X_sigma
        //                   + X_{sigma,a}^T B[u] X_sigma
        //                   + X_sigma^T B[u] X_{sigma,a} ).
        //
        // Generic code then combines this with S(theta)-motion and the profile
        // mode responses to form ddot H_{ij}.
        let rows = self.get_or_compute_row_scalars(etamu, eta_ls)?;
        let mut mixedweights =
            gaussian_joint_psi_mixed_driftweights(&rows, &xi_ls, &dir_a.z_ls_psi, &uza_ls);
        if let Some(sub_rows) = subsample {
            // HT mask: `gaussian_joint_psi_mixedhessian_drift_fromweights` is
            // row-linear in every `mixedweights.*` array via `xt_diag_*_dense`
            // and `weighted_crossprod_psi_maps`, so the masked Hessian-drift
            // remains an unbiased estimator of the full-data drift.
            apply_ht_mask_mixed(&mut mixedweights, sub_rows);
        }

        gaussian_joint_psi_mixedhessian_drift_fromweights(
            xmu,
            x_ls,
            xmu_map,
            x_ls_map,
            &mixedweights,
        )
    }

    /// Produces the [`BlockEffectiveJacobian`] of block `block_idx` for the
    /// realised block specs, by describing this family as a two-output layout
    /// with two additive blocks and no wiggle block and delegating to
    /// `AdditiveWiggleBlockLayout::block_effective_jacobian`.
    ///
    /// The resulting [`AdditiveBlockJacobian`] encodes the linear map
    /// η_r[i] = X_r[i,:] · β_r:
    ///
    /// - block 0 (mu):       output 0 = design rows, output 1 = zeros
    /// - block 1 (log_sigma): output 0 = zeros, output 1 = design rows
    pub fn block_effective_jacobian(
        specs: &[ParameterBlockSpec],
        block_idx: usize,
    ) -> Result<Box<dyn BlockEffectiveJacobian>, String> {
        let layout = crate::util::block_jacobian::AdditiveWiggleBlockLayout {
            family: "GaussianLocationScaleFamily",
            n_outputs: 2,
            additive_blocks: &[Self::BLOCK_MU, Self::BLOCK_LOG_SIGMA],
            wiggle_block: None,
        };
        layout.block_effective_jacobian(specs, block_idx)
    }
}

/// Per-subject 2×2 channel Hessian `W_i` for Gaussian location-scale.
///
/// The row negative log-likelihood (with per-row weight `w_i`, response `y_i`,
/// mean predictor `μ_i`, log-scale predictor `s_i = log σ_i`) is
///
/// ```text
/// ρ_i(μ, s) = w_i [s + 0.5·(y_i − μ)²·exp(−2s)]
/// ```
///
/// The 2×2 Hessian in `(μ, s)` coordinates:
///
/// ```text
/// W_i[0,0] = w_i · exp(−2 s_i)                        ∂²ρ/∂μ²
/// W_i[1,1] = w_i · 2·(y_i − μ_i)²·exp(−2 s_i)        ∂²ρ/∂s²
/// W_i[0,1] = W_i[1,0] = w_i · 2·(y_i − μ_i)·exp(−2 s_i)  ∂²ρ/∂μ∂s
/// ```
///
/// The off-diagonal cross-channel term `∂²ρ/∂μ∂s` is nonzero whenever the
/// residual `(y_i − μ_i) ≠ 0`, i.e. away from the fitted mean.
///
/// NOTE(review): both constructors take `exp(−2·eta_log_sigma)` as `1/σ²`
/// (a pure exp link), whereas the likelihood methods of
/// `GaussianLocationScaleFamily` map the same predictor through
/// `logb_sigma_from_eta_scalar`. Confirm that ignoring the σ floor is
/// intended for this pilot metric.
pub struct GaussianLocationScaleChannelHessian {
    /// Row-major `(n × 2 × 2)` per-subject Hessian. It is PSD-clamped when
    /// built by `from_pilot` and the raw (possibly indefinite) observed
    /// Hessian when built by `from_pilot_observed_unclamped`.
    h: ndarray::Array3<f64>,
}

impl GaussianLocationScaleChannelHessian {
    /// Build the per-subject observed Hessian straight from its closed form,
    /// with no PSD projection applied.
    ///
    /// For Gaussian location-scale the 2×2 observed Hessian
    /// `[[w·e^{-2s}, 2·w·r·e^{-2s}], [2·w·r·e^{-2s}, 2·w·r²·e^{-2s}]]`
    /// has determinant `-2·w²·r²·e^{-4s}`, which is non-positive whenever the
    /// residual `r = y − μ` is nonzero. Finite-difference tests of the row
    /// NLL must be compared against this raw matrix: a PSD clamp changes the
    /// eigenvalues, so the FD-versus-closed-form comparison would fail.
    ///
    /// Production code that needs a PSD matrix (e.g. the canonicalize gate)
    /// must use [`Self::from_pilot`] instead.
    ///
    /// # Errors
    ///
    /// Returns a message when `w`, `eta_mu` or `eta_log_sigma` does not have
    /// the same length as `y`.
    pub fn from_pilot_observed_unclamped(
        y: &ndarray::Array1<f64>,
        w: &ndarray::Array1<f64>,
        eta_mu: &ndarray::Array1<f64>,
        eta_log_sigma: &ndarray::Array1<f64>,
    ) -> Result<Self, String> {
        let n = y.len();
        let lengths_agree = w.len() == n && eta_mu.len() == n && eta_log_sigma.len() == n;
        if !lengths_agree {
            return Err(format!(
                "GaussianLocationScaleChannelHessian::from_pilot_observed_unclamped: \
                 length mismatch y={n} w={} eta_mu={} eta_log_sigma={}",
                w.len(),
                eta_mu.len(),
                eta_log_sigma.len(),
            ));
        }

        let mut h = ndarray::Array3::<f64>::zeros((n, 2, 2));
        let rows = y
            .iter()
            .zip(w.iter())
            .zip(eta_mu.iter())
            .zip(eta_log_sigma.iter());
        for (i, (((&y_i, &wi), &mu_i), &s_i)) in rows.enumerate() {
            // exp(-2s) = 1/sigma^2 under the exp link.
            let inv_sigma2 = (-2.0 * s_i).exp();
            let resid = y_i - mu_i;
            let cross = wi * 2.0 * resid * inv_sigma2;
            h[[i, 0, 0]] = wi * inv_sigma2;
            h[[i, 1, 1]] = wi * 2.0 * resid * resid * inv_sigma2;
            h[[i, 0, 1]] = cross;
            h[[i, 1, 0]] = cross;
        }
        Ok(Self { h })
    }

    /// Build the per-subject Hessian from pilot predictors (μ and log σ at
    /// the current β) and data, projecting every 2×2 block onto the PSD cone.
    ///
    /// `y` is the response, `w` the per-row sample weights, and `eta_mu` /
    /// `eta_log_sigma` the current linear predictors. Each block is
    /// eigendecomposed by `psd_clamp_2x2`, negative eigenvalues are replaced
    /// by zero, and the block is rebuilt from the clamped spectrum, so the
    /// stored matrix is a valid metric for the W-Gram identifiability compile.
    ///
    /// # Errors
    ///
    /// Returns a message when `w`, `eta_mu` or `eta_log_sigma` does not have
    /// the same length as `y`.
    pub fn from_pilot(
        y: &ndarray::Array1<f64>,
        w: &ndarray::Array1<f64>,
        eta_mu: &ndarray::Array1<f64>,
        eta_log_sigma: &ndarray::Array1<f64>,
    ) -> Result<Self, String> {
        let n = y.len();
        let lengths_agree = w.len() == n && eta_mu.len() == n && eta_log_sigma.len() == n;
        if !lengths_agree {
            return Err(format!(
                "GaussianLocationScaleChannelHessian::from_pilot: \
                 length mismatch y={n} w={} eta_mu={} eta_log_sigma={}",
                w.len(),
                eta_mu.len(),
                eta_log_sigma.len(),
            ));
        }

        let mut h = ndarray::Array3::<f64>::zeros((n, 2, 2));
        let rows = y
            .iter()
            .zip(w.iter())
            .zip(eta_mu.iter())
            .zip(eta_log_sigma.iter());
        for (i, (((&y_i, &wi), &mu_i), &s_i)) in rows.enumerate() {
            // exp(-2s) = 1/sigma^2 under the exp link.
            let inv_sigma2 = (-2.0 * s_i).exp();
            let resid = y_i - mu_i;
            // Raw observed 2×2 entries (weight already folded in).
            let raw_mm = wi * inv_sigma2;
            let raw_ss = wi * 2.0 * resid * resid * inv_sigma2;
            let raw_ms = wi * 2.0 * resid * inv_sigma2;
            // `psd_clamp_2x2` yields (λ1, λ2, u1[0], u1[1], u2[0], u2[1]) with
            // u1, u2 unit eigenvectors; the clamped block is
            // λ1·u1·u1ᵀ + λ2·u2·u2ᵀ.
            let (lam_a, lam_b, ua_0, ua_1, ub_0, ub_1) = psd_clamp_2x2(raw_mm, raw_ms, raw_ss);
            let off_diag = lam_a * ua_0 * ua_1 + lam_b * ub_0 * ub_1;
            h[[i, 0, 0]] = lam_a * ua_0 * ua_0 + lam_b * ub_0 * ub_0;
            h[[i, 0, 1]] = off_diag;
            h[[i, 1, 0]] = off_diag;
            h[[i, 1, 1]] = lam_a * ua_1 * ua_1 + lam_b * ub_1 * ub_1;
        }
        Ok(Self { h })
    }
}

/// Eigendecompose the symmetric 2×2 matrix `[[a, b], [b, d]]` and return
/// `(λ_max.max(0), λ_min.max(0), v0[0], v0[1], v1[0], v1[1])`, where `v0`
/// and `v1` are orthonormal eigenvectors for λ_max and λ_min respectively.
///
/// Negative eigenvalues are clamped to zero, so
/// `λ1·v0·v0ᵀ + λ2·v1·v1ᵀ` is the PSD projection of the input. Only one
/// off-diagonal value (`b`) is needed because the matrix is symmetric.
///
/// Numerical notes:
///
/// - The "is it already diagonal?" test is purely relative
///   (`|b| ≤ 1e-15·(|a| + |d|)`), so the result is invariant under a common
///   rescaling of `a`, `b`, `d`. An absolute floor on that threshold would
///   treat every matrix with entries below ~1e-15 as diagonal while its
///   eigenvalues still include `b`, giving an inconsistent projection.
/// - The discriminant is evaluated as `hypot((a − d)/2, b)`, which avoids
///   the cancellation and overflow of `trace²/4 − det`.
/// - The eigenvector is built from whichever of `λ_max − d` / `λ_max − a`
///   is a sum of non-negative terms, and always from the *unclamped*
///   eigenvalue. Its sign convention is: first component non-negative,
///   second component carrying the sign of `b`.
///
/// Inputs are expected to be finite; no particular output is promised for
/// NaN or infinite entries.
#[inline]
fn psd_clamp_2x2(a: f64, b: f64, d: f64) -> (f64, f64, f64, f64, f64, f64) {
    // Closed-form symmetric 2×2 spectrum: λ = (a + d)/2 ± hypot((a − d)/2, b).
    let half_trace = 0.5 * (a + d);
    let half_gap = 0.5 * (a - d);
    let disc = half_gap.hypot(b);
    let lam1 = (half_trace + disc).max(0.0); // larger eigenvalue (clamped)
    let lam2 = (half_trace - disc).max(0.0); // smaller eigenvalue (clamped)

    let (u1_0, u1_1, u2_0, u2_1) = if b.abs() > 1e-15 * (a.abs() + d.abs()) {
        // Both (λ − d, b) and (b, λ − a) are eigenvectors for λ = λ_max.
        // Pick the one whose λ-dependent entry has no cancellation:
        //   a >= d:  λ − d = half_gap + disc   (both terms >= 0)
        //   a <  d:  λ − a = disc − half_gap   (both terms >= 0)
        // In the second case the vector is multiplied by sign(b) so that it
        // points the same way as (λ − d, b).
        let (ex, ey) = if a >= d {
            (half_gap + disc, b)
        } else {
            (b.abs(), (disc - half_gap).copysign(b))
        };
        // `ey` (or `ex`) is a nonzero multiple of b here, so the norm is > 0.
        let norm = ex.hypot(ey);
        let (e0x, e0y) = (ex / norm, ey / norm);
        // Second eigenvector is the first rotated by 90 degrees.
        (e0x, e0y, -e0y, e0x)
    } else if a >= d {
        // Already diagonal (or nearly so); the larger eigenvalue is `a`.
        (1.0, 0.0, 0.0, 1.0)
    } else {
        // Already diagonal (or nearly so); the larger eigenvalue is `d`.
        (0.0, 1.0, 1.0, 0.0)
    };
    (lam1, lam2, u1_0, u1_1, u2_0, u2_1)
}

impl FamilyChannelHessian for GaussianLocationScaleChannelHessian {
    /// Two output channels per subject: `(μ, log σ)`.
    fn n_outputs(&self) -> usize {
        2
    }

    /// Number of subjects, i.e. the extent of the leading axis of the stored
    /// `(n × 2 × 2)` array.
    fn n_subjects(&self) -> usize {
        self.h.dim().0
    }

    /// Copy subject `i`'s 2×2 block into `out` in row-major order
    /// (`[0,0], [0,1], [1,0], [1,1]`).
    ///
    /// # Panics
    ///
    /// Panics when `out.len() != 4`, or when `i` is out of range for the
    /// stored array.
    fn fill_subject(&self, i: usize, out: &mut [f64]) {
        assert_eq!(out.len(), 4);
        let cells: [(usize, usize); 4] = [(0, 0), (0, 1), (1, 0), (1, 1)];
        for (slot, &(row, col)) in out.iter_mut().zip(cells.iter()) {
            *slot = self.h[[i, row, col]];
        }
    }

    /// Return an owned copy of the whole `(n × 2 × 2)` array.
    fn evaluate_full(&self) -> ndarray::Array3<f64> {
        self.h.clone()
    }
}

impl CustomFamily for GaussianLocationScaleFamily {
    /// Always returns `true`: the joint Hessian of this family changes with β.
    ///
    /// The Gaussian location-scale joint Hessian depends on β because the
    /// cross-block (μ,log σ) and (log σ,log σ) blocks contain the residual
    /// r = y − μ (via the row scalars m = r·w and n = r²·w), which changes
    /// when β_μ moves.  The (μ,μ) block weight w = 1/σ² also depends on
    /// β_{log σ}.  This override is essential for correct M_j[u] drift
    /// corrections when ψ hyperparameters move the design matrices.
    fn exact_newton_joint_hessian_beta_dependent(&self) -> bool {
        true
    }

    /// Channel topology: block 0 → μ channel (0), block 1 → log σ channel (1).
    ///
    /// Declaring the assignment lets `fit_custom_family` route the
    /// identifiability audit channel-aware even when a caller assembles the
    /// blocks by hand (without `build_location_scale_block`'s callbacks), so
    /// a covariate basis shared by μ and log σ is recognised as
    /// block-diagonal rather than mistaken for cross-block intercept aliases
    /// (#558).
    ///
    /// Returns one entry per spec: `1` at index `BLOCK_LOG_SIGMA` (when that
    /// index exists) and `0` everywhere else, so any additional trailing
    /// block (e.g. an optional wiggle block) is assigned to channel 0 too.
    fn output_channel_assignment(&self, specs: &[ParameterBlockSpec]) -> Option<Vec<usize>> {
        // Start with every block on the μ channel, then flip the log-σ slot.
        let mut channels = vec![0usize; specs.len()];
        if let Some(slot) = channels.get_mut(Self::BLOCK_LOG_SIGMA) {
            *slot = 1;
        }
        Some(channels)
    }

    /// Work estimate for the coefficient Hessian, computed by the shared
    /// location-scale cost model from the row count and the block specs.
    fn coefficient_hessian_cost(&self, specs: &[ParameterBlockSpec]) -> u64 {
        use crate::families::location_scale_engine::location_scale_coefficient_hessian_cost;

        // Operator-aware: when the unified evaluator takes the matrix-free
        // joint Hessian path (see `use_joint_matrix_free_path`), the workspace
        // applies the joint Hessian through row-streaming Khatri-Rao matvecs
        // at O(n · (p_t + p_ℓ)) per Hv and never forms the dense
        // (p_t + p_ℓ)² matrix. Reporting the operator work model keeps
        // diagnostics and first-order-only policies aligned with the
        // representation that actually runs.
        let n_rows = self.y.len() as u64;
        location_scale_coefficient_hessian_cost(n_rows, specs)
    }

    /// Evaluate the log-likelihood and the per-block diagonal working sets at
    /// the current predictors.
    ///
    /// `block_states` must hold exactly two blocks; the μ predictor is read
    /// from `block_states[Self::BLOCK_MU].eta` and the log-σ predictor from
    /// `block_states[Self::BLOCK_LOG_SIGMA].eta`. Every row is passed through
    /// `gaussian_diagonal_row_kernel`, which supplies the working shift /
    /// response, the working weight of each block, and the row's
    /// log-likelihood contribution. Rows are processed in parallel chunks of
    /// `CHUNK` rows; each chunk writes its own slice of the four output
    /// arrays and returns a partial log-likelihood that is then summed.
    ///
    /// # Errors
    ///
    /// Returns a `GamlssError::DimensionMismatch` message when the number of
    /// blocks is not 2 or when either predictor or `self.weights` differs in
    /// length from `self.y`. Errors from `BlockWorkingSet::diagonal_checked`
    /// are propagated.
    fn evaluate(&self, block_states: &[ParameterBlockState]) -> Result<FamilyEvaluation, String> {
        if block_states.len() != 2 {
            return Err(GamlssError::DimensionMismatch {
                reason: format!(
                    "GaussianLocationScaleFamily expects 2 blocks, got {}",
                    block_states.len()
                ),
            }
            .into());
        }
        let n = self.y.len();
        let etamu = &block_states[Self::BLOCK_MU].eta;
        let eta_log_sigma = &block_states[Self::BLOCK_LOG_SIGMA].eta;
        if etamu.len() != n || eta_log_sigma.len() != n || self.weights.len() != n {
            return Err(GamlssError::DimensionMismatch {
                reason: "GaussianLocationScaleFamily input size mismatch".to_string(),
            }
            .into());
        }

        // Diagonal IRLS weights for the inner solver.
        //
        // For the location block (identity link): wmu = pw / sigma^2. Since the
        // location link is identity, observed = Fisher --- no correction needed.
        //
        // For the log-sigma block (log link): w_ls = 2 * pw * (dsigma/deta)^2 / sigma^2.
        // This is the Fisher weight. For the outer REML, the joint
        // `exact_newton_joint_hessian` provides the full observed Hessian directly,
        // so these Diagonal weights are only used for the inner IRLS iteration
        // (where Fisher scoring is fine). See response.md Section 3.
        //
        let mut zmu = Array1::<f64>::zeros(n);
        let mut wmu = Array1::<f64>::zeros(n);
        let mut z_ls = Array1::<f64>::zeros(n);
        let mut w_ls = Array1::<f64>::zeros(n);
        let ln2pi = (2.0 * std::f64::consts::PI).ln();
        let mut ll = 0.0;

        // Rows handled by each parallel task.
        const CHUNK: usize = 1024;
        // Fast path: taken only when all four inputs (and the four outputs)
        // expose a contiguous memory-order slice.
        if let (
            Some(y_s),
            Some(w_s),
            Some(mu_s),
            Some(ls_s),
            Some(zmu_s),
            Some(wmu_s),
            Some(zls_s),
            Some(wls_s),
        ) = (
            self.y.as_slice_memory_order(),
            self.weights.as_slice_memory_order(),
            etamu.as_slice_memory_order(),
            eta_log_sigma.as_slice_memory_order(),
            zmu.as_slice_memory_order_mut(),
            wmu.as_slice_memory_order_mut(),
            z_ls.as_slice_memory_order_mut(),
            w_ls.as_slice_memory_order_mut(),
        ) {
            // Per-row Gaussian LS kernel writes 4 working arrays directly into
            // the output slices; ll is reduced via Rayon's sum. Independent
            // across rows.
            ll += zmu_s
                .par_chunks_mut(CHUNK)
                .zip(wmu_s.par_chunks_mut(CHUNK))
                .zip(zls_s.par_chunks_mut(CHUNK))
                .zip(wls_s.par_chunks_mut(CHUNK))
                .enumerate()
                .map(|(chunk_idx, (((zmu_c, wmu_c), zls_c), wls_c))| {
                    // Global row index of this chunk's first element.
                    let start = chunk_idx * CHUNK;
                    let mut local_ll = 0.0;
                    for local in 0..zmu_c.len() {
                        let i = start + local;
                        let row =
                            gaussian_diagonal_row_kernel(y_s[i], mu_s[i], ls_s[i], w_s[i], ln2pi);
                        // Working response for μ = current predictor + shift.
                        zmu_c[local] = mu_s[i] + row.location_working_shift;
                        wmu_c[local] = row.location_working_weight;
                        zls_c[local] = row.log_sigma_working_response;
                        wls_c[local] = row.log_sigma_working_weight;
                        local_ll += row.log_likelihood;
                    }
                    local_ll
                })
                .sum::<f64>();
        } else {
            // Fallback path: inputs are not contiguous. Outputs (just-allocated
            // Array1::zeros) always are. Reborrow input views into the closure.
            let y_view = self.y.view();
            let w_view = self.weights.view();
            let mu_view = etamu.view();
            let ls_view = eta_log_sigma.view();
            let zmu_s = zmu
                .as_slice_memory_order_mut()
                .expect("zeros is contiguous");
            let wmu_s = wmu
                .as_slice_memory_order_mut()
                .expect("zeros is contiguous");
            let zls_s = z_ls
                .as_slice_memory_order_mut()
                .expect("zeros is contiguous");
            let wls_s = w_ls
                .as_slice_memory_order_mut()
                .expect("zeros is contiguous");
            // Same chunked kernel as the fast path, reading inputs through
            // array views instead of slices.
            ll += zmu_s
                .par_chunks_mut(CHUNK)
                .zip(wmu_s.par_chunks_mut(CHUNK))
                .zip(zls_s.par_chunks_mut(CHUNK))
                .zip(wls_s.par_chunks_mut(CHUNK))
                .enumerate()
                .map(|(chunk_idx, (((zmu_c, wmu_c), zls_c), wls_c))| {
                    let start = chunk_idx * CHUNK;
                    let mut local_ll = 0.0;
                    for local in 0..zmu_c.len() {
                        let i = start + local;
                        let row = gaussian_diagonal_row_kernel(
                            y_view[i], mu_view[i], ls_view[i], w_view[i], ln2pi,
                        );
                        zmu_c[local] = mu_view[i] + row.location_working_shift;
                        wmu_c[local] = row.location_working_weight;
                        zls_c[local] = row.log_sigma_working_response;
                        wls_c[local] = row.log_sigma_working_weight;
                        local_ll += row.log_likelihood;
                    }
                    local_ll
                })
                .sum::<f64>();
        }

        // One diagonal working set per block, in block order (μ, log σ).
        Ok(FamilyEvaluation {
            log_likelihood: ll,
            blockworking_sets: vec![
                BlockWorkingSet::diagonal_checked(zmu, wmu)?,
                BlockWorkingSet::diagonal_checked(z_ls, w_ls)?,
            ],
        })
    }

    /// Full-data weighted Gaussian log-likelihood at the current predictors,
    /// without building any working sets.
    ///
    /// Each row contributes
    /// `w_i · (−½)·(r_i²/σ_i² + ln 2π + 2·ln σ_i)` with `r_i = y_i − μ_i` and
    /// `σ_i = logb_sigma_from_eta_scalar(η_ls,i)`; rows whose weight is
    /// exactly zero contribute `0.0`.
    ///
    /// # Errors
    ///
    /// Returns a `GamlssError::DimensionMismatch` message when the number of
    /// blocks is not 2 or when a predictor / `self.weights` length differs
    /// from `self.y`.
    fn log_likelihood_only(&self, block_states: &[ParameterBlockState]) -> Result<f64, String> {
        use rayon::iter::{IntoParallelIterator, ParallelIterator};

        /// Weighted log-density of a single row under the logb noise link.
        fn weighted_row_loglik(wi: f64, y_i: f64, mu_i: f64, eta_ls_i: f64, ln2pi: f64) -> f64 {
            if wi == 0.0 {
                return 0.0;
            }
            let sigma_i = logb_sigma_from_eta_scalar(eta_ls_i);
            let inv_s2 = (sigma_i * sigma_i).recip();
            let r = y_i - mu_i;
            wi * (-0.5 * (r * r * inv_s2 + ln2pi + 2.0 * sigma_i.ln()))
        }

        let n_blocks = block_states.len();
        if n_blocks != 2 {
            return Err(GamlssError::DimensionMismatch {
                reason: format!(
                    "GaussianLocationScaleFamily expects 2 blocks, got {}",
                    n_blocks
                ),
            }
            .into());
        }
        let n = self.y.len();
        let etamu = &block_states[Self::BLOCK_MU].eta;
        let eta_log_sigma = &block_states[Self::BLOCK_LOG_SIGMA].eta;
        let sizes_agree = etamu.len() == n && eta_log_sigma.len() == n && self.weights.len() == n;
        if !sizes_agree {
            return Err(GamlssError::DimensionMismatch {
                reason: "GaussianLocationScaleFamily input size mismatch".to_string(),
            }
            .into());
        }

        // logb noise link: σ(η_ls) = LOGB_SIGMA_FLOOR + exp(η_ls). With
        // σ ≥ b > 0 the loglik is bounded below (−Σlog σ ≥ −n log b) and 1/σ²
        // is bounded by 1/b², so the previous `inv_s2.min(1e24)` cap is
        // structurally unnecessary.
        let ln2pi = (2.0 * std::f64::consts::PI).ln();
        let contiguous = (
            self.y.as_slice_memory_order(),
            self.weights.as_slice_memory_order(),
            etamu.as_slice_memory_order(),
            eta_log_sigma.as_slice_memory_order(),
        );

        // Accumulate onto an explicit 0.0 so the result matches a plain
        // `ll += sum` bit for bit.
        let mut ll = 0.0;
        ll += match contiguous {
            // Every input exposes a contiguous slice: index the slices.
            (Some(y_s), Some(w_s), Some(mu_s), Some(ls_s)) => (0..n)
                .into_par_iter()
                .map(|i| weighted_row_loglik(w_s[i], y_s[i], mu_s[i], ls_s[i], ln2pi))
                .sum::<f64>(),
            // Otherwise index the arrays directly.
            _ => (0..n)
                .into_par_iter()
                .map(|i| {
                    weighted_row_loglik(
                        self.weights[i],
                        self.y[i],
                        etamu[i],
                        eta_log_sigma[i],
                        ln2pi,
                    )
                })
                .sum::<f64>(),
        };
        Ok(ll)
    }

    /// Log-likelihood for the outer loop, optionally restricted to a row
    /// subsample.
    ///
    /// With `options.outer_score_subsample == None` this is exactly
    /// [`Self::log_likelihood_only`]. Otherwise only the sampled rows are
    /// visited, and each row's log-likelihood term is multiplied by
    /// `WeightedOuterRow.weight` — the Horvitz–Thompson inverse-inclusion
    /// factor 1/π_i (uniform and stratified sampling are both supported) —
    /// so the partial sum is an unbiased estimator of the full-data
    /// log-likelihood. Inner PIRLS line searches never install the subsample
    /// option and therefore keep scoring the exact full-data log-likelihood.
    ///
    /// # Errors
    ///
    /// On the subsampled path, returns a `GamlssError::DimensionMismatch`
    /// message when the number of blocks is not 2 or when a predictor /
    /// `self.weights` length differs from `self.y`.
    fn log_likelihood_only_with_options(
        &self,
        block_states: &[ParameterBlockState],
        options: &BlockwiseFitOptions,
    ) -> Result<f64, String> {
        use rayon::iter::ParallelIterator;

        let subsample = match options.outer_score_subsample.as_ref() {
            Some(subsample) => subsample,
            None => return self.log_likelihood_only(block_states),
        };

        let n_blocks = block_states.len();
        if n_blocks != 2 {
            return Err(GamlssError::DimensionMismatch {
                reason: format!(
                    "GaussianLocationScaleFamily expects 2 blocks, got {}",
                    n_blocks
                ),
            }
            .into());
        }
        let n = self.y.len();
        let etamu = &block_states[Self::BLOCK_MU].eta;
        let eta_log_sigma = &block_states[Self::BLOCK_LOG_SIGMA].eta;
        let sizes_agree = etamu.len() == n && eta_log_sigma.len() == n && self.weights.len() == n;
        if !sizes_agree {
            return Err(GamlssError::DimensionMismatch {
                reason: "GaussianLocationScaleFamily input size mismatch".to_string(),
            }
            .into());
        }

        let ln2pi = (2.0 * std::f64::consts::PI).ln();
        let ll: f64 = subsample
            .rows
            .par_iter()
            .map(|sampled| {
                let i = sampled.index;
                let obs_weight = self.weights[i];
                if obs_weight == 0.0 {
                    // Zero-weight rows contribute nothing.
                    return 0.0;
                }
                let sigma_i = logb_sigma_from_eta_scalar(eta_log_sigma[i]);
                let inv_s2 = (sigma_i * sigma_i).recip();
                let r = self.y[i] - etamu[i];
                let row_loglik = -0.5 * (r * r * inv_s2 + ln2pi + 2.0 * sigma_i.ln());
                // Horvitz–Thompson factor times observation weight.
                sampled.weight * obs_weight * row_loglik
            })
            .sum();
        Ok(ll)
    }

    /// Joint exact-Newton Hessian when no realised block specs are available:
    /// forwards to `exact_newton_joint_hessian_for_specs` with `None` for the
    /// specs and returns its result unchanged.
    fn exact_newton_joint_hessian(
        &self,
        block_states: &[ParameterBlockState],
    ) -> Result<Option<Array2<f64>>, String> {
        self.exact_newton_joint_hessian_for_specs(block_states, None)
    }

    /// Always returns `true`; this impl overrides
    /// `exact_newton_joint_hessian` and its `_with_specs` variant.
    fn has_explicit_joint_hessian(&self) -> bool {
        true
    }

    /// Always returns `false`: this family opts out of the joint Jeffreys
    /// term.
    ///
    /// The Gaussian location-scale likelihood has no separation /
    /// under-identification regime that the full-span Jeffreys curvature `H_Φ`
    /// is meant to regularize: with the soft floor `σ ≥ b > 0` the per-row
    /// Fisher information `diag(a/σ², 2κ²a)` is bounded and `O(n)` on every
    /// identified direction at every working point, so the well-conditioned-`H`
    /// Jeffreys gate smooth-steps `H_Φ` to ~0 — yet the matching score `∇Φ`
    /// kept leaking a *phantom* penalized-stationarity residual into the inner
    /// joint-Newton (a nonzero `|∇L − Sβ|` paired with a numerically null `H_Φ`
    /// and a full-rank `H_pen`), so the KKT certificate refused every iterate
    /// and the outer REML rejected all seeds — aborting heteroscedastic
    /// location-scale fits (#684–#688). This is the same opt-out
    /// `TransformationNormalFamily` takes for the same structural reason
    /// (continuous response, `O(n)` Fisher information everywhere); it removes
    /// the phantom residual and drops the per-cycle `O(n·p²)` Jeffreys
    /// directional-derivative overhead.
    fn joint_jeffreys_term_required(&self) -> bool {
        false
    }

    /// Directional derivative of the joint Hessian along the flattened
    /// coefficient direction `d_beta_flat`, without realised block specs:
    /// forwards to
    /// `exact_newton_joint_hessian_directional_derivative_for_specs` with
    /// `None` for the specs.
    fn exact_newton_joint_hessian_directional_derivative(
        &self,
        block_states: &[ParameterBlockState],
        d_beta_flat: &Array1<f64>,
    ) -> Result<Option<Array2<f64>>, String> {
        self.exact_newton_joint_hessian_directional_derivative_for_specs(
            block_states,
            None,
            d_beta_flat,
        )
    }

    /// Second directional derivative of the joint Hessian along the two
    /// flattened coefficient directions `d_beta_u_flat` and `d_betav_flat`,
    /// without realised block specs: forwards to
    /// `exact_newton_joint_hessian_second_directional_derivative_for_specs`
    /// with `None` for the specs.
    fn exact_newton_joint_hessiansecond_directional_derivative(
        &self,
        block_states: &[ParameterBlockState],
        d_beta_u_flat: &Array1<f64>,
        d_betav_flat: &Array1<f64>,
    ) -> Result<Option<Array2<f64>>, String> {
        self.exact_newton_joint_hessian_second_directional_derivative_for_specs(
            block_states,
            None,
            d_beta_u_flat,
            d_betav_flat,
        )
    }

    fn diagonalworking_weights_directional_derivative(
        &self,
        block_states: &[ParameterBlockState],
        block_idx: usize,
        d_eta: &Array1<f64>,
    ) -> Result<Option<Array1<f64>>, String> {
        if block_states.len() != 2 {
            return Err(GamlssError::DimensionMismatch {
                reason: format!(
                    "GaussianLocationScaleFamily expects 2 blocks, got {}",
                    block_states.len()
                ),
            }
            .into());
        }
        let n = self.y.len();
        let eta_t = &block_states[Self::BLOCK_MU].eta;
        let eta_ls = &block_states[Self::BLOCK_LOG_SIGMA].eta;
        if eta_t.len() != n || eta_ls.len() != n || self.weights.len() != n || d_eta.len() != n {
            return Err(GamlssError::DimensionMismatch {
                reason: "GaussianLocationScaleFamily input size mismatch".to_string(),
            }
            .into());
        }

        let sigma = eta_ls.mapv(logb_sigma_from_eta_scalar);
        let mut dw = Array1::<f64>::zeros(n);
        match block_idx {
            Self::BLOCK_MU => {
                // Gaussian location block:
                //
                //   wmu = weight / sigma^2.
                //
                // This depends only on the scale predictor, so along a
                // location-only direction d etamu the directional derivative is
                // identically zero.
                Ok(Some(dw))
            }
            Self::BLOCK_LOG_SIGMA => {
                // Gaussian log-sigma block:
                //
                // The PIRLS information weight is
                //
                //   w_ls = max(2 * weight * clamp(g, -1, 1)^2, MIN_WEIGHT),
                //   g    = sigma'(eta_ls) / sigma(eta_ls),
                // with the semantic rule that zero observation weights stay zero.
                //
                // Along a direction d eta_ls,
                //
                //   dw_ls is the directional derivative of that piecewise
                //   definition. On the active clamp branch or active MIN_WEIGHT
                //   floor branch, the returned derivative is zero to match the
                //   selected local piece of the evaluated weight.
                //
                // This is the exact directional derivative needed by the REML
                // trace term
                //
                //   0.5 tr(J^{-1} D_beta J[u])
                //   = 0.5 sum_i (x_i^T J^{-1} x_i) dw_i
                //
                // for diagonal working-set blocks.
                use rayon::iter::{IntoParallelIterator, ParallelIterator};
                let dw_vec: Vec<f64> = (0..n)
                    .into_par_iter()
                    .map(|i| {
                        let d1 = crate::families::sigma_link::logb_sigma_jet1_scalar(eta_ls[i]).d1;
                        gaussian_log_sigma_irlsinfo_directional_derivative(
                            self.weights[i],
                            sigma[i],
                            d1,
                            d_eta[i],
                        )
                    })
                    .collect();
                for (i, v) in dw_vec.into_iter().enumerate() {
                    dw[i] = v;
                }
                Ok(Some(dw))
            }
            _ => Ok(None),
        }
    }

    /// Joint exact-Newton Hessian given the realised block specs: forwards to
    /// `exact_newton_joint_hessian_for_specs` with `Some(specs)`.
    fn exact_newton_joint_hessian_with_specs(
        &self,
        block_states: &[ParameterBlockState],
        specs: &[ParameterBlockSpec],
    ) -> Result<Option<Array2<f64>>, String> {
        self.exact_newton_joint_hessian_for_specs(block_states, Some(specs))
    }

    /// Directional derivative of the joint Hessian along `d_beta_flat`, given
    /// the realised block specs: forwards to
    /// `exact_newton_joint_hessian_directional_derivative_for_specs` with
    /// `Some(specs)`.
    fn exact_newton_joint_hessian_directional_derivative_with_specs(
        &self,
        block_states: &[ParameterBlockState],
        specs: &[ParameterBlockSpec],
        d_beta_flat: &Array1<f64>,
    ) -> Result<Option<Array2<f64>>, String> {
        self.exact_newton_joint_hessian_directional_derivative_for_specs(
            block_states,
            Some(specs),
            d_beta_flat,
        )
    }

    /// Second directional derivative of the joint Hessian along
    /// `d_beta_u_flat` and `d_betav_flat`, given the realised block specs:
    /// forwards to
    /// `exact_newton_joint_hessian_second_directional_derivative_for_specs`
    /// with `Some(specs)`.
    fn exact_newton_joint_hessian_second_directional_derivative_with_specs(
        &self,
        block_states: &[ParameterBlockState],
        specs: &[ParameterBlockSpec],
        d_beta_u_flat: &Array1<f64>,
        d_betav_flat: &Array1<f64>,
    ) -> Result<Option<Array2<f64>>, String> {
        self.exact_newton_joint_hessian_second_directional_derivative_for_specs(
            block_states,
            Some(specs),
            d_beta_u_flat,
            d_betav_flat,
        )
    }

    /// First-order joint ψ terms for the hyperparameter at `psi_index`:
    /// forwards all arguments to `exact_newton_joint_psi_terms_for_specs` and
    /// returns its result unchanged.
    fn exact_newton_joint_psi_terms(
        &self,
        block_states: &[ParameterBlockState],
        specs: &[ParameterBlockSpec],
        derivative_blocks: &[Vec<crate::custom_family::CustomFamilyBlockPsiDerivative>],
        psi_index: usize,
    ) -> Result<Option<crate::custom_family::ExactNewtonJointPsiTerms>, String> {
        self.exact_newton_joint_psi_terms_for_specs(
            block_states,
            specs,
            derivative_blocks,
            psi_index,
        )
    }

    /// Second-order joint ψ terms for the hyperparameter pair
    /// `(psi_i, psi_j)`: forwards all arguments to
    /// `exact_newton_joint_psisecond_order_terms_for_specs` and returns its
    /// result unchanged.
    fn exact_newton_joint_psisecond_order_terms(
        &self,
        block_states: &[ParameterBlockState],
        specs: &[ParameterBlockSpec],
        derivative_blocks: &[Vec<crate::custom_family::CustomFamilyBlockPsiDerivative>],
        psi_i: usize,
        psi_j: usize,
    ) -> Result<Option<crate::custom_family::ExactNewtonJointPsiSecondOrderTerms>, String> {
        self.exact_newton_joint_psisecond_order_terms_for_specs(
            block_states,
            specs,
            derivative_blocks,
            psi_i,
            psi_j,
        )
    }

    /// Directional derivative, along the flattened coefficient direction
    /// `d_beta_flat`, of the ψ-Hessian term for the hyperparameter at
    /// `psi_index`: forwards all arguments to
    /// `exact_newton_joint_psihessian_directional_derivative_for_specs` and
    /// returns its result unchanged.
    fn exact_newton_joint_psihessian_directional_derivative(
        &self,
        block_states: &[ParameterBlockState],
        specs: &[ParameterBlockSpec],
        derivative_blocks: &[Vec<crate::custom_family::CustomFamilyBlockPsiDerivative>],
        psi_index: usize,
        d_beta_flat: &Array1<f64>,
    ) -> Result<Option<Array2<f64>>, String> {
        self.exact_newton_joint_psihessian_directional_derivative_for_specs(
            block_states,
            specs,
            derivative_blocks,
            psi_index,
            d_beta_flat,
        )
    }

    /// Build the full-data joint ψ workspace for this family.
    ///
    /// The workspace owns a clone of the family, a copy of the block states
    /// and a copy of the per-block ψ derivative lists.
    ///
    /// # Errors
    ///
    /// Returns a `GamlssError::DimensionMismatch` message unless exactly two
    /// block states, two specs and two derivative block lists are supplied;
    /// errors from the workspace constructor are propagated.
    fn exact_newton_joint_psi_workspace(
        &self,
        block_states: &[ParameterBlockState],
        specs: &[ParameterBlockSpec],
        derivative_blocks: &[Vec<crate::custom_family::CustomFamilyBlockPsiDerivative>],
    ) -> Result<Option<Arc<dyn ExactNewtonJointPsiWorkspace>>, String> {
        let counts = (block_states.len(), specs.len(), derivative_blocks.len());
        if counts != (2, 2, 2) {
            let (n_states, n_specs, n_derivative_lists) = counts;
            return Err(GamlssError::DimensionMismatch { reason: format!(
                "GaussianLocationScaleFamily joint psi workspace expects 2 states, 2 specs, and 2 derivative block lists, got {} / {} / {}",
                n_states,
                n_specs,
                n_derivative_lists
            ) }.into());
        }
        let workspace = GaussianLocationScaleExactNewtonJointPsiWorkspace::new(
            self.clone(),
            block_states.to_vec(),
            specs,
            derivative_blocks.to_vec(),
        )?;
        Ok(Some(Arc::new(workspace)))
    }

    /// Joint ψ workspace for the outer loop, optionally carrying a row
    /// subsample.
    ///
    /// With `options.outer_score_subsample == None` the result is
    /// byte-identical to `exact_newton_joint_psi_workspace`. With `Some`, the
    /// subsample is stored in the workspace and forwarded into every per-row
    /// weight array produced by `gaussian_joint_psi_firstweights`,
    /// `gaussian_joint_psisecondweights` and
    /// `gaussian_joint_psi_mixed_driftweights`: each sampled row's
    /// contribution is scaled by `WeightedOuterRow.weight = 1/π_i` and
    /// non-sampled rows are zeroed. Every downstream assembly
    /// (`gaussian_joint_psi*_fromweights`, `weighted_crossprod_psi_maps`,
    /// `xt_diag_*_dense`,
    /// `build_two_block_custom_family_joint_psi_operator_from_actions`) is
    /// row-linear in those arrays via `Xᵀ diag(W) Y`, so the resulting
    /// second-order ψ Hessian and ψ-Hessian directional derivative are
    /// unbiased Horvitz–Thompson estimators of the full-data quantities.
    /// Inner-PIRLS and final-covariance paths never install the option.
    ///
    /// # Errors
    ///
    /// Returns a `GamlssError::DimensionMismatch` message unless exactly two
    /// block states, two specs and two derivative block lists are supplied;
    /// errors from the workspace constructor are propagated.
    fn exact_newton_joint_psi_workspace_with_options(
        &self,
        block_states: &[ParameterBlockState],
        specs: &[ParameterBlockSpec],
        derivative_blocks: &[Vec<crate::custom_family::CustomFamilyBlockPsiDerivative>],
        options: &BlockwiseFitOptions,
    ) -> Result<Option<Arc<dyn ExactNewtonJointPsiWorkspace>>, String> {
        let counts = (block_states.len(), specs.len(), derivative_blocks.len());
        if counts != (2, 2, 2) {
            let (n_states, n_specs, n_derivative_lists) = counts;
            return Err(GamlssError::DimensionMismatch { reason: format!(
                "GaussianLocationScaleFamily joint psi workspace expects 2 states, 2 specs, and 2 derivative block lists, got {} / {} / {}",
                n_states,
                n_specs,
                n_derivative_lists
            ) }.into());
        }
        let workspace = GaussianLocationScaleExactNewtonJointPsiWorkspace::new_with_subsample(
            self.clone(),
            block_states.to_vec(),
            specs,
            derivative_blocks.to_vec(),
            options.outer_score_subsample.clone(),
        )?;
        Ok(Some(Arc::new(workspace)))
    }

    /// Build the full-data joint-Hessian workspace.
    ///
    /// Returns `Ok(None)` when `exact_joint_dense_block_designs` yields no
    /// dense block designs for `specs`. Otherwise the workspace is built from
    /// a clone of the family, a copy of the block states and owned copies of
    /// the two designs; errors from either step are propagated.
    fn exact_newton_joint_hessian_workspace(
        &self,
        block_states: &[ParameterBlockState],
        specs: &[ParameterBlockSpec],
    ) -> Result<Option<Arc<dyn ExactNewtonJointHessianWorkspace>>, String> {
        let (xmu, x_ls) = match self.exact_joint_dense_block_designs(Some(specs))? {
            Some(designs) => designs,
            None => return Ok(None),
        };
        let family = self.clone();
        let states = block_states.to_vec();
        let mu_design = xmu.into_owned();
        let log_sigma_design = x_ls.into_owned();
        let workspace = GaussianLocationScaleHessianWorkspace::new(
            family,
            states,
            mu_design,
            log_sigma_design,
        )?;
        Ok(Some(Arc::new(workspace)))
    }

    /// Outer-aware joint-Hessian workspace with optional row subsample.
    ///
    /// When `options.outer_score_subsample` is `None`, this is byte-identical
    /// to `exact_newton_joint_hessian_workspace`. When `Some`, the precomputed
    /// per-row coefficient arrays (`coeff_mm`, `coeff_ml`, `coeff_ll`) — which
    /// every downstream assembly (`hessian_dense`, `hessian_matvec`,
    /// `hessian_diagonal`) consumes row-linearly via `Xᵀ diag(W) X` — are
    /// replaced by a Horvitz–Thompson mask: each sampled row's coefficient is
    /// multiplied by `WeightedOuterRow.weight` (the inverse-inclusion factor
    /// 1/π_i; uniform or stratified sampling both supported), and non-sampled
    /// rows are zeroed. The resulting joint Hessian is an unbiased estimator
    /// of the full-data joint Hessian. Inner PIRLS never installs the option,
    /// so the inner solve continues to consume the exact full-data Hessian.
    fn exact_newton_joint_hessian_workspace_with_options(
        &self,
        block_states: &[ParameterBlockState],
        specs: &[ParameterBlockSpec],
        options: &BlockwiseFitOptions,
    ) -> Result<Option<Arc<dyn ExactNewtonJointHessianWorkspace>>, String> {
        // Without dense block designs there is nothing to build.
        let (xmu, x_ls) = match self.exact_joint_dense_block_designs(Some(specs))? {
            Some(designs) => designs,
            None => return Ok(None),
        };
        let mut workspace = GaussianLocationScaleHessianWorkspace::new(
            self.clone(),
            block_states.to_vec(),
            xmu.into_owned(),
            x_ls.into_owned(),
        )?;
        // Only when a subsample is installed are the per-row coefficients
        // masked; otherwise the workspace is left exactly as constructed.
        if let Some(subsample) = &options.outer_score_subsample {
            workspace.apply_outer_subsample(subsample.rows.as_ref());
        }
        let shared: Arc<dyn ExactNewtonJointHessianWorkspace> = Arc::new(workspace);
        Ok(Some(shared))
    }

    fn inner_coefficient_hessian_hvp_available(&self, specs: &[ParameterBlockSpec]) -> bool {
        // This reports only the β-space operator capability; outer θθ
        // Hessian availability is declared separately.
        if !self.exact_joint_supported() {
            return false;
        }
        // `exact_newton_joint_hessian_workspace` yields a workspace exactly
        // when the dense block designs resolve to `Ok(Some(_))`; an error or
        // `None` both mean "not available".
        self.exact_joint_dense_block_designs(Some(specs))
            .ok()
            .flatten()
            .is_some()
    }

    /// Outer-derivative policy: declare HT-subsample capability.
    ///
    /// GaussianLocationScaleFamily overrides
    /// `log_likelihood_only_with_options`,
    /// `exact_newton_joint_hessian_workspace_with_options`, and
    /// `exact_newton_joint_psi_workspace_with_options` to consume
    /// `options.outer_score_subsample` with per-row Horvitz–Thompson weights
    /// (each sampled row's contribution is multiplied by
    /// `WeightedOuterRow.weight = 1/π_i`; non-sampled rows are zeroed),
    /// yielding unbiased estimators of the full-data log-likelihood, joint
    /// Hessian, and second-order ψ Hessian / ψ-Hessian directional
    /// derivative. The ψ-workspace masking happens inside
    /// `apply_ht_mask_first`, `apply_ht_mask_second`, and
    /// `apply_ht_mask_mixed` on the `GaussianJointPsi{First,Second,
    /// MixedDrift}Weights` per-row arrays, immediately after the row-scalar
    /// reductions and before the row-linear `weighted_crossprod_psi_maps` /
    /// `xt_diag_*_dense` assemblies, so the masked outputs remain unbiased.
    /// First-order ψ terms remain full-data exact (= trivially unbiased), so
    /// the total outer score is still unbiased. Inner-PIRLS and final-
    /// covariance paths never install the option, so they continue to
    /// consume the exact full-data quantities.
    fn outer_derivative_subsample_capable(&self) -> bool {
        // Unconditionally advertised: the `*_with_options` workspace
        // constructors of this impl read `options.outer_score_subsample`.
        true
    }
}

impl CustomFamilyGenerative for GaussianLocationScaleFamily {
    /// Builds the generative description from the fitted block states: the
    /// mean is the μ-block linear predictor and the Gaussian noise scale is
    /// obtained row by row from the log-σ block's linear predictor through
    /// `logb_sigma_from_eta_scalar`.
    ///
    /// # Errors
    /// Returns a `GamlssError::DimensionMismatch` message when
    /// `block_states` does not contain exactly two blocks.
    fn generativespec(
        &self,
        block_states: &[ParameterBlockState],
    ) -> Result<GenerativeSpec, String> {
        let n_blocks = block_states.len();
        if n_blocks != 2 {
            let reason = format!(
                "GaussianLocationScaleFamily expects 2 blocks, got {}",
                n_blocks
            );
            return Err(GamlssError::DimensionMismatch { reason }.into());
        }

        // σ_i from the log-σ predictor, one row at a time.
        let log_sigma_eta = &block_states[Self::BLOCK_LOG_SIGMA].eta;
        let sigma = gamlss_rowwise_map(log_sigma_eta.len(), |row| {
            logb_sigma_from_eta_scalar(log_sigma_eta[row])
        });

        // The mean is an owned copy of the μ predictor.
        let mean = block_states[Self::BLOCK_MU].eta.clone();
        Ok(GenerativeSpec {
            mean,
            noise: NoiseModel::Gaussian { sigma },
        })
    }
}

/// One channel of a `RowCoeffOperator`: a row-major `Arc<Array2<f64>>`
/// design matrix indexed by row coefficient pairs. Channels with the same
/// `block` value contribute their `X^T r` outputs into the same coefficient
/// block of the joint vector (e.g. wiggle's basis B and basis_d1 are two
/// channels that both contribute to the wiggle output block).
struct RowCoeffChannel {
    /// Index into the operator's `block_offsets` / `block_widths`; selects
    /// the slice of the joint vector this channel reads from and adds into.
    block: usize,
    /// Shared design matrix. `mul_vec` asserts that its column count equals
    /// the width of `block`.
    design: Arc<Array2<f64>>,
}

/// Symmetric pair coefficients `c_{ab}` for `a ≤ b`. The operator adds
/// `X_a^T diag(c_{ab}) X_b` to block `block_a`'s output and the transpose
/// contribution `X_b^T diag(c_{ab}) X_a` to block `block_b` when `a != b`.
struct RowCoeffPair {
    /// Channel index of the left factor (indexes the operator's channel
    /// list and the per-channel scratch buffers).
    a: usize,
    /// Channel index of the right factor; equal to `a` for a diagonal pair.
    b: usize,
    /// Per-row coefficient vector, indexed by row `0..nrows`. `mul_vec`
    /// requires it to be contiguous (`as_slice()` must succeed).
    coeff: Array1<f64>,
}

/// Pooled per-call scratch for `RowCoeffOperator::mul_vec`. Each call
/// pops a buffer set; if the pool is empty (parallel callers exhausted
/// it) we allocate fresh — the alloc is amortized as concurrent callers
/// recycle. The pool's `Mutex` is taken only for `pop`/`push` (constant
/// time), never during the matmul.
///
/// **Invariant**: every buffer in `pool[k].u[ch]` and `pool[k].r[ch]` has
/// length `nrows`. `mul_vec` overwrites `u` via `fast_av_into` and
/// zeroes-then-accumulates `r`, leaving both buffers in any state on
/// return — callers must not depend on residual content.
struct RowCoeffScratch {
    /// One buffer per channel holding `u_k = X_k · v[block_k]`.
    u: Vec<Array1<f64>>,
    /// One buffer per channel holding the coefficient-weighted row vector
    /// `r_k` that is later multiplied by `X_k^T`.
    r: Vec<Array1<f64>>,
}

/// Matrix-free operator for two-block-style joint-Hessian directional
/// derivatives that decompose as `H = sum_{a,b} X_a^T diag(c_{ab}) X_b`
/// with each `X_a` an `n × p_a` design and `c_{ab}` an `n` row coefficient
/// vector. `mul_vec` applies the operator in O(n · sum_a p_a) per call,
/// reusing pre-sized scratch buffers for `u`, `r` from a small lock-pool
/// so concurrent `mul_vec` callers do not serialize on the same scratch.
///
/// `block_offsets` gives the starting column of each output block; the
/// operator dimension is the sum of all block widths. Each channel's
/// `mul_vec` contribution is added into the slice for its output block.
struct RowCoeffOperator {
    /// Design channels; `RowCoeffPair::a` / `::b` index into this list.
    channels: Vec<RowCoeffChannel>,
    /// Starting column of each block in the joint vector (exclusive prefix
    /// sum of `block_widths`).
    block_offsets: Vec<usize>,
    /// Number of coefficients in each block.
    block_widths: Vec<usize>,
    /// Sum of `block_widths`; the length of `mul_vec` inputs and outputs.
    dim: usize,
    /// Row-coefficient vectors, one per (channel, channel) pair.
    pair_coeffs: Vec<RowCoeffPair>,
    /// Number of design rows; the length of every scratch buffer.
    nrows: usize,
    /// Pool of reusable scratch sets, popped and pushed by `mul_vec`.
    scratch_pool: std::sync::Mutex<Vec<RowCoeffScratch>>,
}

impl RowCoeffOperator {
    /// One-line constructor for the standard (channels, pair-coeffs)
    /// recipe used by every GAMLSS LS workspace: pass the block widths,
    /// the channel list as `(block_id, design)` tuples, and the pair
    /// list as `(a, b, coeff)` tuples. Pre-allocates one scratch in the
    /// pool so the first warm `mul_vec` call skips allocation.
    ///
    /// `nrows` is the common row count of the designs and coefficient
    /// vectors; it sizes every scratch buffer. No shape validation is done
    /// here — inconsistent inputs surface as panics in the methods that
    /// index or multiply with them.
    fn from_directions(
        block_widths: Vec<usize>,
        channels: Vec<(usize, Arc<Array2<f64>>)>,
        pairs: Vec<(usize, usize, Array1<f64>)>,
        nrows: usize,
    ) -> Self {
        let channels: Vec<RowCoeffChannel> = channels
            .into_iter()
            .map(|(block, design)| RowCoeffChannel { block, design })
            .collect();
        let pair_coeffs: Vec<RowCoeffPair> = pairs
            .into_iter()
            .map(|(a, b, coeff)| RowCoeffPair { a, b, coeff })
            .collect();
        // Exclusive prefix sum of the widths; the running total ends up as
        // the operator dimension.
        let mut block_offsets = Vec::with_capacity(block_widths.len());
        let mut acc = 0;
        for w in &block_widths {
            block_offsets.push(acc);
            acc += *w;
        }
        let initial = Self::fresh_scratch(channels.len(), nrows);
        Self {
            channels,
            block_offsets,
            block_widths,
            dim: acc,
            pair_coeffs,
            nrows,
            scratch_pool: std::sync::Mutex::new(vec![initial]),
        }
    }

    /// Allocates one zero-filled scratch set: `n_channels` buffers of length
    /// `nrows` for each of `u` and `r`. Single place where the
    /// `RowCoeffScratch` length invariant is established.
    fn fresh_scratch(n_channels: usize, nrows: usize) -> RowCoeffScratch {
        let zeroed = || -> Vec<Array1<f64>> {
            (0..n_channels)
                .map(|_| Array1::<f64>::zeros(nrows))
                .collect()
        };
        RowCoeffScratch {
            u: zeroed(),
            r: zeroed(),
        }
    }

    /// Pops a scratch set from the pool, or allocates a fresh one when the
    /// pool is empty.
    ///
    /// # Panics
    /// Panics if the pool mutex is poisoned.
    fn acquire_scratch(&self) -> RowCoeffScratch {
        // Bind the popped value in its own statement so the `MutexGuard`
        // temporary is dropped at the end of that statement. Chaining
        // `.pop().unwrap_or_else(..)` in one expression would keep the lock
        // held across the O(nrows · channels) fallback allocation and
        // serialize concurrent callers exactly when the pool is exhausted.
        let pooled = self
            .scratch_pool
            .lock()
            .expect("RowCoeffOperator scratch pool poisoned")
            .pop();
        pooled.unwrap_or_else(|| Self::fresh_scratch(self.channels.len(), self.nrows))
    }

    /// Returns a scratch set to the pool for reuse by later `mul_vec` calls.
    ///
    /// # Panics
    /// Panics if the pool mutex is poisoned.
    fn release_scratch(&self, scratch: RowCoeffScratch) {
        self.scratch_pool
            .lock()
            .expect("RowCoeffOperator scratch pool poisoned")
            .push(scratch);
    }

    /// Projected trace for `factor`: builds the per-row pair Gram table and
    /// contracts it with the pair coefficients.
    fn projected_trace(&self, factor: &Array2<f64>) -> f64 {
        let grams = self.projected_pair_gram_table(factor);
        self.trace_from_pair_gram_table(grams.view())
    }

    /// Cache identifier for `projected_pair_gram_table` results.
    ///
    /// Hashes the operator shape, each channel's design pointer / block /
    /// shape, and each pair's channel indices. The pair coefficient values
    /// are not hashed: the Gram table is computed without reading them.
    fn projected_pair_gram_cache_id(&self) -> usize {
        let mut hasher = DefaultHasher::new();
        "RowCoeffOperator::projected_pair_gram_table".hash(&mut hasher);
        self.nrows.hash(&mut hasher);
        self.dim.hash(&mut hasher);
        self.block_widths.hash(&mut hasher);
        self.block_offsets.hash(&mut hasher);
        self.channels.len().hash(&mut hasher);
        self.pair_coeffs.len().hash(&mut hasher);
        for (idx, ch) in self.channels.iter().enumerate() {
            idx.hash(&mut hasher);
            // Identity of the shared design allocation, not its contents.
            (Arc::as_ptr(&ch.design) as usize).hash(&mut hasher);
            ch.block.hash(&mut hasher);
            ch.design.nrows().hash(&mut hasher);
            ch.design.ncols().hash(&mut hasher);
            self.block_widths[ch.block].hash(&mut hasher);
        }
        for (idx, pair) in self.pair_coeffs.iter().enumerate() {
            idx.hash(&mut hasher);
            pair.a.hash(&mut hasher);
            pair.b.hash(&mut hasher);
        }
        // The 64-bit hash is narrowed to `usize`; it is only an identifier.
        hasher.finish() as usize
    }

    /// Computes the `nrows × pair_count` table whose `(i, p)` entry is the
    /// dot product over the factor columns of row `i` of `X_a · F_a` and
    /// row `i` of `X_b · F_b`, for pair `p = (a, b)`.
    ///
    /// Rows are processed in chunks; chunks are filled in parallel only when
    /// the caller is not already on a rayon worker thread and there is more
    /// than one chunk.
    ///
    /// # Panics
    /// Panics if `factor.nrows() != self.dim`.
    fn projected_pair_gram_table(&self, factor: &Array2<f64>) -> Array2<f64> {
        assert_eq!(
            factor.nrows(),
            self.dim,
            "row-coefficient cached projected trace factor row mismatch: factor rows={} but dim={}",
            factor.nrows(),
            self.dim
        );
        let rank = factor.ncols();
        let pair_count = self.pair_coeffs.len();
        let mut grams = Array2::<f64>::zeros((self.nrows, pair_count));
        if self.nrows == 0 || rank == 0 || pair_count == 0 {
            // Degenerate shapes: every Gram entry is an empty sum.
            return grams;
        }
        let rows_per_chunk =
            gamlss_projected_trace_chunk_rows(rank, self.channels.len(), pair_count)
                .min(self.nrows.max(1));
        let fill_chunk = |start: usize, mut out_chunk: ndarray::ArrayViewMut2<'_, f64>| {
            let end = (start + rows_per_chunk).min(self.nrows);
            let rows = start..end;
            // Project this row chunk of every channel onto its factor block.
            let mut projected: Vec<Array2<f64>> = Vec::with_capacity(self.channels.len());
            for ch in &self.channels {
                let block_start = self.block_offsets[ch.block];
                let width = self.block_widths[ch.block];
                let design_chunk = ch.design.slice(s![rows.clone(), ..]);
                let factor_block = factor.slice(s![block_start..block_start + width, ..]);
                projected.push(fast_ab(&design_chunk, &factor_block));
            }
            for (pair_idx, pair) in self.pair_coeffs.iter().enumerate() {
                let u_a = &projected[pair.a];
                let u_b = &projected[pair.b];
                for local_i in 0..u_a.nrows() {
                    let mut value = 0.0;
                    for col in 0..rank {
                        value += u_a[[local_i, col]] * u_b[[local_i, col]];
                    }
                    out_chunk[[local_i, pair_idx]] = value;
                }
            }
        };
        if rayon::current_thread_index().is_none() && self.nrows > rows_per_chunk {
            grams
                .axis_chunks_iter_mut(Axis(0), rows_per_chunk)
                .into_par_iter()
                .enumerate()
                .for_each(|(chunk_idx, out_chunk)| {
                    fill_chunk(chunk_idx * rows_per_chunk, out_chunk)
                });
        } else {
            for start in (0..self.nrows).step_by(rows_per_chunk) {
                let end = (start + rows_per_chunk).min(self.nrows);
                let out_chunk = grams.slice_mut(s![start..end, ..]);
                fill_chunk(start, out_chunk);
            }
        }
        grams
    }

    /// Contracts a pair Gram table with the pair coefficients:
    /// `Σ_i Σ_p m_p · c_p[i] · grams[i, p]`, where `m_p` is 1 for a diagonal
    /// pair (`a == b`) and 2 otherwise (the pair and its transpose).
    ///
    /// # Panics
    /// Panics if `grams` is not `nrows × pair_count`.
    fn trace_from_pair_gram_table(&self, grams: ArrayView2<'_, f64>) -> f64 {
        assert_eq!(grams.nrows(), self.nrows);
        assert_eq!(grams.ncols(), self.pair_coeffs.len());
        let mut trace = 0.0;
        for i in 0..self.nrows {
            for (pair_idx, pair) in self.pair_coeffs.iter().enumerate() {
                let multiplier = if pair.a == pair.b { 1.0 } else { 2.0 };
                trace += multiplier * pair.coeff[i] * grams[[i, pair_idx]];
            }
        }
        trace
    }
}

impl crate::solver::estimate::reml::unified::HyperOperator for RowCoeffOperator {
    fn dim(&self) -> usize {
        self.dim
    }

    fn mul_vec(&self, v: &Array1<f64>) -> Array1<f64> {
        assert_eq!(v.len(), self.dim);
        let mut scratch = self.acquire_scratch();
        let RowCoeffScratch { u, r } = &mut scratch;

        // 1) u_a = X_a · v[block_a slice]. `fast_av_into` writes directly
        //    into the pre-sized scratch buffer — no per-call n-sized
        //    allocation.
        for (k, ch) in self.channels.iter().enumerate() {
            let start = self.block_offsets[ch.block];
            let width = self.block_widths[ch.block];
            assert_eq!(ch.design.ncols(), width);
            let v_slice = v.slice(s![start..start + width]);
            crate::faer_ndarray::fast_av_into(ch.design.as_ref(), &v_slice, &mut u[k]);
        }

        // 2) r_a = sum_b c_{ab} ⊙ u_b. Zero-then-accumulate; pair coeffs
        //    contribute symmetrically when `a != b`.
        for slot in r.iter_mut() {
            slot.fill(0.0);
        }
        for pair in &self.pair_coeffs {
            let a = pair.a;
            let b = pair.b;
            let coeff = pair
                .coeff
                .as_slice()
                .expect("RowCoeffOperator pair coeff must be contiguous");
            // r[a] += coeff ⊙ u[b]; if a != b also r[b] += coeff ⊙ u[a].
            // Split the borrow so r[a] and r[b] (or u[a] and u[b]) can be
            // accessed simultaneously when a != b.
            if a == b {
                let u_a = u[a]
                    .as_slice()
                    .expect("RowCoeffOperator u must be contiguous");
                let r_a = r[a]
                    .as_slice_mut()
                    .expect("RowCoeffOperator r must be contiguous");
                use rayon::prelude::*;
                r_a.par_iter_mut()
                    .zip(coeff.par_iter())
                    .zip(u_a.par_iter())
                    .for_each(|((r, c), u)| *r += c * u);
            } else {
                let (r_a_slice, r_b_slice) = if a < b {
                    let (left, right) = r.split_at_mut(b);
                    (
                        left[a].as_slice_mut().expect("contiguous"),
                        right[0].as_slice_mut().expect("contiguous"),
                    )
                } else {
                    let (left, right) = r.split_at_mut(a);
                    (
                        right[0].as_slice_mut().expect("contiguous"),
                        left[b].as_slice_mut().expect("contiguous"),
                    )
                };
                let u_a = u[a].as_slice().expect("contiguous");
                let u_b = u[b].as_slice().expect("contiguous");
                use rayon::prelude::*;
                r_a_slice
                    .par_iter_mut()
                    .zip(r_b_slice.par_iter_mut())
                    .zip(coeff.par_iter())
                    .zip(u_a.par_iter())
                    .zip(u_b.par_iter())
                    .for_each(|((((ra, rb), c), ua), ub)| {
                        *ra += c * ub;
                        *rb += c * ua;
                    });
            }
        }

        // 3) Output[block] += X_a^T r_a per channel. Single output alloc.
        let mut out = Array1::<f64>::zeros(self.dim);
        for (k, ch) in self.channels.iter().enumerate() {
            let start = self.block_offsets[ch.block];
            let width = self.block_widths[ch.block];
            let mut block = out.slice_mut(s![start..start + width]);
            // Atv into a temporary, then accumulate; `fast_atv` allocates
            // a `width`-sized array, which is bounded and small relative
            // to the n-sized u/r buffers we already reuse.
            let contrib = fast_atv(ch.design.as_ref(), &r[k]);
            block += &contrib;
        }
        self.release_scratch(scratch);
        out
    }

    fn mul_basis_columns_into(&self, start: usize, mut out: ndarray::ArrayViewMut2<'_, f64>) {
        let cols = out.ncols();
        assert!(start + cols <= self.dim);
        let mut basis = Array1::<f64>::zeros(self.dim);
        for local_col in 0..cols {
            let global_col = start + local_col;
            basis[global_col] = 1.0;
            let col = self.mul_vec(&basis);
            out.column_mut(local_col).assign(&col);
            basis[global_col] = 0.0;
        }
    }

    fn to_dense(&self) -> Array2<f64> {
        // Build by basis-vector probing — small-K materialization path.
        let mut out = Array2::<f64>::zeros((self.dim, self.dim));
        self.mul_basis_columns_into(0, out.view_mut());
        out
    }

    fn trace_projected_factor(&self, factor: &Array2<f64>) -> f64 {
        self.projected_trace(factor)
    }

    fn trace_projected_factor_cached(
        &self,
        factor: &Array2<f64>,
        cache: &crate::solver::estimate::reml::unified::ProjectedFactorCache,
    ) -> f64 {
        let key = crate::solver::estimate::reml::unified::ProjectedFactorKey::from_factor_view(
            self.projected_pair_gram_cache_id(),
            factor.view(),
        );
        let grams = cache.get_or_insert_with(key, || self.projected_pair_gram_table(factor));
        self.trace_from_pair_gram_table(grams.view())
    }

    fn is_implicit(&self) -> bool {
        true
    }
}

/// Two-block row-coefficient operator backed by `DesignMatrix`.
///
/// This is the operator-form replacement for the old dense-array storage
/// (the original note names this same type as its own counterpart, so the
/// predecessor's name is not recoverable here). It must keep the realized
/// block designs lazy all the way through `Xv` and `X^T r`. Do not cache
/// `Array2` snapshots here; `NoDensifyOperator` regression tests rely on
/// this type to panic if a future change materializes spec-backed designs.
struct DesignTwoBlockRowCoeffOperator {
    /// Design of block `a`; acts on coefficients `0..pa` of the joint vector.
    x_a: DesignMatrix,
    /// Design of block `b`; acts on coefficients `pa..dim`.
    x_b: DesignMatrix,
    /// Per-row coefficients of the (a, a) block.
    c_aa: Arc<Array1<f64>>,
    /// Per-row coefficients of the (a, b) cross block, applied symmetrically.
    c_ab: Arc<Array1<f64>>,
    /// Per-row coefficients of the (b, b) block.
    c_bb: Arc<Array1<f64>>,
    /// Joint coefficient dimension (width of block `a` plus block `b`).
    dim: usize,
    /// Number of rows; `mul_vec` asserts both design products have this length.
    nrows: usize,
    /// Width of block `a`, i.e. the split point of the joint vector.
    pa: usize,
}

impl crate::solver::estimate::reml::unified::HyperOperator for DesignTwoBlockRowCoeffOperator {
    fn dim(&self) -> usize {
        self.dim
    }

    fn mul_vec(&self, v: &Array1<f64>) -> Array1<f64> {
        assert_eq!(v.len(), self.dim);
        let v_a = v.slice(s![0..self.pa]);
        let v_b = v.slice(s![self.pa..self.dim]);
        let u_a = self.x_a.matrixvectormultiply(&v_a.to_owned());
        let u_b = self.x_b.matrixvectormultiply(&v_b.to_owned());
        assert_eq!(u_a.len(), self.nrows);
        assert_eq!(u_b.len(), self.nrows);
        let r_a = self.c_aa.as_ref() * &u_a + self.c_ab.as_ref() * &u_b;
        let r_b = self.c_ab.as_ref() * &u_a + self.c_bb.as_ref() * &u_b;
        let out_a = self.x_a.transpose_vector_multiply(&r_a);
        let out_b = self.x_b.transpose_vector_multiply(&r_b);
        let mut out = Array1::<f64>::zeros(self.dim);
        out.slice_mut(s![0..self.pa]).assign(&out_a);
        out.slice_mut(s![self.pa..self.dim]).assign(&out_b);
        out
    }

    fn mul_basis_columns_into(&self, start: usize, mut out: ndarray::ArrayViewMut2<'_, f64>) {
        let cols = out.ncols();
        assert!(start + cols <= self.dim);
        let mut basis = Array1::<f64>::zeros(self.dim);
        for local_col in 0..cols {
            let global_col = start + local_col;
            basis[global_col] = 1.0;
            let col = self.mul_vec(&basis);
            out.column_mut(local_col).assign(&col);
            basis[global_col] = 0.0;
        }
    }

    fn to_dense(&self) -> Array2<f64> {
        let mut out = Array2::<f64>::zeros((self.dim, self.dim));
        self.mul_basis_columns_into(0, out.view_mut());
        out
    }

    fn trace_projected_factor(&self, factor: &Array2<f64>) -> f64 {
        // For the two-block row-coefficient operator
        //   B v = [X_a^T (c_aa·u_a + c_ab·u_b),  X_b^T (c_ab·u_a + c_bb·u_b)]
        // with u_a = X_a v_a, u_b = X_b v_b, the column-wise quadratic form is
        //   F[:,k]^T B F[:,k] = u_a^T r_a + u_b^T r_b
        //                    = Σ_i (c_aa[i] u_a[i]² + 2 c_ab[i] u_a[i] u_b[i]
        //                            + c_bb[i] u_b[i]²)
        // so the projected trace never needs the X^T r step that the default
        // mul_vec path computes, and the per-row coefficients fold the K
        // columns into a single weighted sum once U_a, U_b are formed.
        let grams = self.projected_row_gram_triples(factor);
        self.trace_from_row_gram_triples(grams.view())
    }

    fn trace_projected_factor_cached(
        &self,
        factor: &Array2<f64>,
        cache: &crate::solver::estimate::reml::unified::ProjectedFactorCache,
    ) -> f64 {
        // Validate the factor row count up front. Without this, a caller that
        // hands in a factor whose row count does not equal the joint p slips
        // into the per-column `mul_vec` slicing where a `assert_eq!`
        // panics with the generic `left/right` message — that loses the
        // operator identity and the (pa, pb) split which is the only useful
        // diagnostic when the trace caller's own dimension bookkeeping is
        // off. Validate at the operator boundary so the panic localises the
        // caller, and so this contract is enforced in release builds too
        // (the inner `assert_eq!` is a debug-only safety net).
        assert_eq!(
            factor.nrows(),
            self.dim,
            "two-block cached projected trace factor row mismatch: factor rows={} \
             but joint p={} (pa={}, pb={})",
            factor.nrows(),
            self.dim,
            self.pa,
            self.dim - self.pa,
        );
        let key = crate::solver::estimate::reml::unified::ProjectedFactorKey::from_factor_view(
            self.projected_row_gram_cache_id(),
            factor.view(),
        );
        let grams = cache.get_or_insert_with(key, || self.projected_row_gram_triples(factor));
        self.trace_from_row_gram_triples(grams.view())
    }

    fn is_implicit(&self) -> bool {
        true
    }
}

impl DesignTwoBlockRowCoeffOperator {
    /// Address-based identity token for a design, used only to build the
    /// Gram-table cache id. Dense designs use the address of their shared
    /// allocation; the sparse arm uses the address of the value reached
    /// through the `&DesignMatrix` reference it was handed.
    fn design_cache_token(design: &DesignMatrix) -> usize {
        match design {
            DesignMatrix::Dense(dense) => match dense {
                DenseDesignMatrix::Materialized(matrix) => Arc::as_ptr(matrix) as usize,
                DenseDesignMatrix::Lazy(op) => Arc::as_ptr(op) as *const () as usize,
            },
            DesignMatrix::Sparse(sparse) => sparse as *const _ as usize,
        }
    }

    /// Cache identifier for `projected_row_gram_triples` results: a hash of
    /// both design tokens and the operator shape. The row coefficients are
    /// not hashed; the Gram triples are computed without reading them.
    fn projected_row_gram_cache_id(&self) -> usize {
        let mut hasher = DefaultHasher::new();
        "DesignTwoBlockRowCoeffOperator::projected_row_gram_triples".hash(&mut hasher);
        for design in [&self.x_a, &self.x_b] {
            Self::design_cache_token(design).hash(&mut hasher);
        }
        for extent in [self.nrows, self.pa, self.dim] {
            extent.hash(&mut hasher);
        }
        // The 64-bit hash is narrowed to `usize`; it is only an identifier.
        hasher.finish() as usize
    }

    /// Computes the `nrows × 3` table of per-row sums over the factor
    /// columns: column 0 is `Σ u_a²`, column 1 is `Σ u_a·u_b`, column 2 is
    /// `Σ u_b²`, with `u_a = X_a F_a` and `u_b = X_b F_b`.
    ///
    /// Rows are processed in chunks obtained through `try_row_chunk`; chunks
    /// are filled in parallel only when the caller is not already on a rayon
    /// worker thread and there is more than one chunk.
    ///
    /// # Panics
    /// Panics if `factor.nrows() != self.dim` or if a row chunk cannot be
    /// produced by either design.
    fn projected_row_gram_triples(&self, factor: &Array2<f64>) -> Array2<f64> {
        assert_eq!(
            factor.nrows(),
            self.dim,
            "two-block cached projected trace factor row mismatch: factor rows={} \
             but joint p={} (pa={}, pb={})",
            factor.nrows(),
            self.dim,
            self.pa,
            self.dim - self.pa,
        );
        let rank = factor.ncols();
        let mut triples = Array2::<f64>::zeros((self.nrows, 3));
        if self.nrows == 0 || rank == 0 {
            // Degenerate shapes: every entry is an empty sum.
            return triples;
        }
        let chunk_len = gamlss_projected_trace_chunk_rows(rank, 2, 3).min(self.nrows.max(1));
        let factor_a = factor.slice(s![0..self.pa, ..]);
        let factor_b = factor.slice(s![self.pa..self.dim, ..]);
        let fill_chunk = |first_row: usize, mut target: ndarray::ArrayViewMut2<'_, f64>| {
            let last_row = (first_row + chunk_len).min(self.nrows);
            let row_range = first_row..last_row;
            let rows_a = self
                .x_a
                .try_row_chunk(row_range.clone())
                .expect("two-block projected trace x_a row chunk materialization failed");
            let rows_b = self
                .x_b
                .try_row_chunk(row_range.clone())
                .expect("two-block projected trace x_b row chunk materialization failed");
            let u_a = fast_ab(&rows_a, &factor_a);
            let u_b = fast_ab(&rows_b, &factor_b);
            for local_i in 0..u_a.nrows() {
                // Accumulate the three Gram sums left to right over columns.
                let (aa, ab, bb) = (0..rank).fold(
                    (0.0_f64, 0.0_f64, 0.0_f64),
                    |(aa, ab, bb), col| {
                        let a = u_a[[local_i, col]];
                        let b = u_b[[local_i, col]];
                        (aa + a * a, ab + a * b, bb + b * b)
                    },
                );
                target[[local_i, 0]] = aa;
                target[[local_i, 1]] = ab;
                target[[local_i, 2]] = bb;
            }
        };
        let on_rayon_worker = rayon::current_thread_index().is_some();
        if !on_rayon_worker && self.nrows > chunk_len {
            triples
                .axis_chunks_iter_mut(Axis(0), chunk_len)
                .into_par_iter()
                .enumerate()
                .for_each(|(chunk_idx, target)| fill_chunk(chunk_idx * chunk_len, target));
        } else {
            for first_row in (0..self.nrows).step_by(chunk_len) {
                let last_row = (first_row + chunk_len).min(self.nrows);
                let target = triples.slice_mut(s![first_row..last_row, ..]);
                fill_chunk(first_row, target);
            }
        }
        triples
    }

    /// Contracts the per-row Gram triples with the row coefficients:
    /// `Σ_i (c_aa[i]·g[i,0] + 2·c_ab[i]·g[i,1] + c_bb[i]·g[i,2])`.
    ///
    /// # Panics
    /// Panics if `grams` is not `nrows × 3` or a coefficient vector is not
    /// contiguous.
    fn trace_from_row_gram_triples(&self, grams: ArrayView2<'_, f64>) -> f64 {
        assert_eq!(grams.nrows(), self.nrows);
        assert_eq!(grams.ncols(), 3);
        let c_aa = self
            .c_aa
            .as_slice()
            .expect("c_aa is constructed contiguous");
        let c_ab = self
            .c_ab
            .as_slice()
            .expect("c_ab is constructed contiguous");
        let c_bb = self
            .c_bb
            .as_slice()
            .expect("c_bb is constructed contiguous");
        // Rows are summed in ascending order.
        (0..self.nrows).fold(0.0, |trace, i| {
            trace
                + (c_aa[i] * grams[[i, 0]] + 2.0 * c_ab[i] * grams[[i, 1]] + c_bb[i] * grams[[i, 2]])
        })
    }
}

/// Matrix-free joint-Hessian operator for the two-block Gaussian
/// location-scale family. The dense Hessian decomposes as
///
///   H = [[X_mu^T diag(w) X_mu,    X_mu^T diag(cross) X_ls],
///        [X_ls^T diag(cross) X_mu, X_ls^T diag(scale) X_ls]],
///
/// with `cross = 0` and `scale = 2κ²a` — the block-diagonal Gaussian Fisher
/// (expected) information (μ ⊥ σ, #684; residual-free (log σ, log σ) block,
/// #566). This MUST match the dense `exact_newton_joint_hessian_from_designs`
/// curvature object exactly: the observed cross term `2κm` (mean-zero noise)
/// over-smooths the scale, so it is replaced here by its Fisher expectation,
/// 0. The matvec applies each block by a single design-matrix multiply on
/// each side, so the cost is Θ(n (p_mu + p_ls)) per `Hv` rather than
/// Θ(n (p_mu + p_ls)²) to form the dense matrix.
struct GaussianLocationScaleHessianWorkspace {
    /// Family instance the workspace was built from.
    family: GaussianLocationScaleFamily,
    /// Block states captured at construction.
    block_states: Vec<ParameterBlockState>,
    /// Dense μ-block design.
    xmu: Arc<Array2<f64>>,
    /// Dense log-σ-block design.
    x_ls: Arc<Array2<f64>>,
    /// Per-row weights of the (μ, μ) block, `X_mu^T diag(·) X_mu`.
    coeff_mm: Array1<f64>,
    /// Per-row weights of the (μ, log σ) cross block.
    coeff_ml: Array1<f64>,
    /// Per-row weights of the (log σ, log σ) block, `X_ls^T diag(·) X_ls`.
    coeff_ll: Array1<f64>,
}

impl GaussianLocationScaleHessianWorkspace {
    fn new(
        family: GaussianLocationScaleFamily,
        block_states: Vec<ParameterBlockState>,
        xmu: Array2<f64>,
        x_ls: Array2<f64>,
    ) -> Result<Self, String> {
        let etamu = &block_states[GaussianLocationScaleFamily::BLOCK_MU].eta;
        let eta_ls = &block_states[GaussianLocationScaleFamily::BLOCK_LOG_SIGMA].eta;
        let rows = family.get_or_compute_row_scalars(etamu, eta_ls)?;
        // Single source of truth shared with the dense
        // `exact_newton_joint_hessian_from_designs`: μ ⊥ σ ⇒ cross = 0 (#684),
        // (ls,ls) = 2κ²a (#566). Reading the same coefficients as the dense path
        // makes the cross-block drift that caused #684 structurally impossible.
        let (coeff_mm, coeff_ml, coeff_ll) = gaussian_locscale_fisher_joint_row_coeffs(&rows);
        Ok(Self {
            family,
            block_states,
            xmu: Arc::new(xmu),
            x_ls: Arc::new(x_ls),
            coeff_mm,
            coeff_ml,
            coeff_ll,
        })
    }

    /// Apply a Horvitz–Thompson outer-row subsample mask to the precomputed
    /// per-row coefficient arrays in place.
    ///
    /// Each sampled row's `coeff_*[i]` is multiplied by its
    /// `WeightedOuterRow.weight` (the HT inverse-inclusion factor 1/π_i —
    /// uniform or stratified sampling both supported). All non-sampled rows
    /// are zeroed. Because every downstream assembly (`hessian_dense`,
    /// `hessian_matvec`, `hessian_diagonal`) is row-linear in these arrays
    /// via `Xᵀ diag(W) X`, the resulting joint-Hessian is an unbiased
    /// estimator of the full-data joint Hessian.
    fn apply_outer_subsample(
        &mut self,
        rows: &[crate::families::marginal_slope_shared::WeightedOuterRow],
    ) {
        let n = self.coeff_mm.len();
        let mut mask_mm = Array1::<f64>::zeros(n);
        let mut mask_ml = Array1::<f64>::zeros(n);
        let mut mask_ll = Array1::<f64>::zeros(n);
        for r in rows {
            let i = r.index;
            mask_mm[i] = self.coeff_mm[i] * r.weight;
            mask_ml[i] = self.coeff_ml[i] * r.weight;
            mask_ll[i] = self.coeff_ll[i] * r.weight;
        }
        self.coeff_mm = mask_mm;
        self.coeff_ml = mask_ml;
        self.coeff_ll = mask_ll;
    }
}

impl ExactNewtonJointHessianWorkspace for GaussianLocationScaleHessianWorkspace {
    /// Materialize the dense joint Hessian over the stacked (μ, log σ)
    /// coefficients.
    ///
    /// The matrix has the same structure as the product applied by
    /// `hessian_matvec`, but is built once from three weighted cross-products
    /// (`Xᵀ diag(W) X` per block) rather than by probing a matrix-free
    /// operator with one canonical-basis vector per column. Only the upper
    /// triangle blocks are written; `mirror_upper_to_lower` fills the rest.
    fn hessian_dense(&self) -> Result<Option<Array2<f64>>, String> {
        let x_mu: &Array2<f64> = self.xmu.as_ref();
        let x_sigma: &Array2<f64> = self.x_ls.as_ref();
        // `split` is where the log-σ coefficients start in the stacked layout.
        let split = x_mu.ncols();
        let dim = split + x_sigma.ncols();

        let block_mm = xt_diag_x_dense(x_mu, &self.coeff_mm)?;
        let block_ml = xt_diag_y_dense(x_mu, &self.coeff_ml, x_sigma)?;
        let block_ll = xt_diag_x_dense(x_sigma, &self.coeff_ll)?;

        let mut dense = Array2::<f64>::zeros((dim, dim));
        dense.slice_mut(s![..split, ..split]).assign(&block_mm);
        dense.slice_mut(s![..split, split..]).assign(&block_ml);
        dense.slice_mut(s![split.., split..]).assign(&block_ll);
        mirror_upper_to_lower(&mut dense);
        Ok(Some(dense))
    }

    /// Reports whether `hessian_matvec` is implemented for this workspace.
    /// Always `true`: the matrix-free product is provided below.
    fn hessian_matvec_available(&self) -> bool {
        true
    }

    /// Apply the joint Hessian to `v` without forming the matrix.
    ///
    /// `v` is the stacked (μ, log σ) coefficient vector. Both halves are pushed
    /// through their designs, combined row-wise with the cached
    /// `coeff_mm` / `coeff_ml` / `coeff_ll` weights, and pulled back through
    /// the transposed designs.
    ///
    /// # Errors
    /// Returns a dimension-mismatch error when `v.len()` differs from the total
    /// number of columns of the two designs.
    fn hessian_matvec(&self, v: &Array1<f64>) -> Result<Option<Array1<f64>>, String> {
        let split = self.xmu.ncols();
        let dim = split + self.x_ls.ncols();
        if v.len() != dim {
            let reason = format!(
                "GaussianLocationScale matvec dimension mismatch: got {}, expected {}",
                v.len(),
                dim
            );
            return Err(GamlssError::DimensionMismatch { reason }.into());
        }

        // Row-space images of the two coefficient halves.
        let row_dir_mu = fast_av(self.xmu.as_ref(), &v.slice(s![..split]));
        let row_dir_ls = fast_av(self.x_ls.as_ref(), &v.slice(s![split..]));

        // Per-row 2x2 action of the (mm, ml; ml, ll) coefficient blocks.
        let weighted_mu = &self.coeff_mm * &row_dir_mu + &self.coeff_ml * &row_dir_ls;
        let weighted_ls = &self.coeff_ml * &row_dir_mu + &self.coeff_ll * &row_dir_ls;

        let product_mu = fast_atv(self.xmu.as_ref(), &weighted_mu);
        let product_ls = fast_atv(self.x_ls.as_ref(), &weighted_ls);

        let mut product = Array1::<f64>::zeros(dim);
        product.slice_mut(s![..split]).assign(&product_mu);
        product.slice_mut(s![split..]).assign(&product_ls);
        Ok(Some(product))
    }

    fn hessian_diagonal(&self) -> Result<Option<Array1<f64>>, String> {
        use rayon::iter::{IntoParallelIterator, ParallelIterator};
        let pmu = self.xmu.ncols();
        let p_ls = self.x_ls.ncols();
        let total = pmu + p_ls;
        // Per-column reduction is independent; parallelize across columns.
        let diag_mu: Vec<f64> = (0..pmu)
            .into_par_iter()
            .map(|j| {
                let col = self.xmu.column(j);
                col.iter()
                    .zip(self.coeff_mm.iter())
                    .map(|(&v, &c)| c * v * v)
                    .sum()
            })
            .collect();
        let diag_ls: Vec<f64> = (0..p_ls)
            .into_par_iter()
            .map(|j| {
                let col = self.x_ls.column(j);
                col.iter()
                    .zip(self.coeff_ll.iter())
                    .map(|(&v, &c)| c * v * v)
                    .sum()
            })
            .collect();
        let mut diag = Array1::<f64>::zeros(total);
        for (j, v) in diag_mu.into_iter().enumerate() {
            diag[j] = v;
        }
        for (j, v) in diag_ls.into_iter().enumerate() {
            diag[pmu + j] = v;
        }
        Ok(Some(diag))
    }

    /// Dense first directional derivative of the joint Hessian along
    /// `d_beta_flat`.
    ///
    /// Pure delegation: forwards the workspace's block states and both designs
    /// (wrapped as borrowed dense matrices) to the family's
    /// `exact_newton_joint_hessian_directional_derivative_from_designs`.
    fn directional_derivative(
        &self,
        d_beta_flat: &Array1<f64>,
    ) -> Result<Option<Array2<f64>>, String> {
        self.family
            .exact_newton_joint_hessian_directional_derivative_from_designs(
                &self.block_states,
                &DenseOrOperator::Borrowed(self.xmu.as_ref()),
                &DenseOrOperator::Borrowed(self.x_ls.as_ref()),
                d_beta_flat,
            )
    }

    /// Matrix-free first directional derivative of the joint Hessian along
    /// `d_beta_flat`, returned as a two-block row-coefficient operator.
    ///
    /// The direction is resolved to row space through both designs, the
    /// per-row directional weights are evaluated, and only the (μ, μ) and
    /// (log σ, log σ) weights are kept.
    ///
    /// # Errors
    /// Returns an invalid-input error when `d_beta_flat` does not have one
    /// entry per column of the two designs.
    fn directional_derivative_operator(
        &self,
        d_beta_flat: &Array1<f64>,
    ) -> Result<Option<Arc<dyn crate::solver::estimate::reml::unified::HyperOperator>>, String>
    {
        let split = self.xmu.ncols();
        let dim = split + self.x_ls.ncols();
        if d_beta_flat.len() != dim {
            let reason = format!(
                "GaussianLocationScale dH operator: d_beta length {} != {}",
                d_beta_flat.len(),
                dim
            );
            return Err(GamlssError::InvalidInput { reason }.into());
        }

        let row_scalars = self.family.get_or_compute_row_scalars(
            &self.block_states[GaussianLocationScaleFamily::BLOCK_MU].eta,
            &self.block_states[GaussianLocationScaleFamily::BLOCK_LOG_SIGMA].eta,
        )?;
        let row_dir_mu = fast_av(self.xmu.as_ref(), &d_beta_flat.slice(s![..split]));
        let row_dir_ls = fast_av(self.x_ls.as_ref(), &d_beta_flat.slice(s![split..]));
        let weights = gaussian_joint_first_directionalweights(&row_scalars, &row_dir_mu, &row_dir_ls);
        let weight_mm = weights.0;
        let weight_ll = weights.2;
        // The Fisher cross block is identically 0 (μ ⊥ σ; #684), hence so is
        // its directional derivative. This mirrors the dense
        // `exact_newton_joint_hessian_directional_derivative_from_designs`,
        // which also leaves `weights.1` unassembled.
        let weight_ml = Array1::<f64>::zeros(weight_mm.len());

        let operator = make_two_block_row_coeff_operator(
            self.xmu.clone(),
            self.x_ls.clone(),
            weight_mm,
            weight_ml,
            weight_ll,
            self.xmu.nrows(),
        );
        Ok(Some(Arc::new(operator)))
    }

    /// Dense second directional derivative of the joint Hessian along the pair
    /// of directions `d_beta_u_flat` and `d_beta_v_flat`.
    ///
    /// Pure delegation: forwards the workspace's block states and both designs
    /// (wrapped as borrowed dense matrices) to the family's
    /// `exact_newton_joint_hessiansecond_directional_derivative_from_designs`.
    fn second_directional_derivative(
        &self,
        d_beta_u_flat: &Array1<f64>,
        d_beta_v_flat: &Array1<f64>,
    ) -> Result<Option<Array2<f64>>, String> {
        self.family
            .exact_newton_joint_hessiansecond_directional_derivative_from_designs(
                &self.block_states,
                &DenseOrOperator::Borrowed(self.xmu.as_ref()),
                &DenseOrOperator::Borrowed(self.x_ls.as_ref()),
                d_beta_u_flat,
                d_beta_v_flat,
            )
    }

    /// Matrix-free second directional derivative of the joint Hessian along
    /// `d_beta_u` and `d_beta_v`, returned as a two-block row-coefficient
    /// operator.
    ///
    /// Both directions are resolved to row space through the two designs, the
    /// per-row second-order directional weights are evaluated, and only the
    /// (μ, μ) and (log σ, log σ) weights are kept.
    ///
    /// # Errors
    /// Returns an invalid-input error when either direction does not have one
    /// entry per column of the two designs.
    fn second_directional_derivative_operator(
        &self,
        d_beta_u: &Array1<f64>,
        d_beta_v: &Array1<f64>,
    ) -> Result<Option<Arc<dyn crate::solver::estimate::reml::unified::HyperOperator>>, String>
    {
        let split = self.xmu.ncols();
        let dim = split + self.x_ls.ncols();
        if d_beta_u.len() != dim || d_beta_v.len() != dim {
            let reason = format!(
                "GaussianLocationScale d2H operator: d_beta_{{u,v}} length {}/{} != {}",
                d_beta_u.len(),
                d_beta_v.len(),
                dim
            );
            return Err(GamlssError::InvalidInput { reason }.into());
        }

        let row_scalars = self.family.get_or_compute_row_scalars(
            &self.block_states[GaussianLocationScaleFamily::BLOCK_MU].eta,
            &self.block_states[GaussianLocationScaleFamily::BLOCK_LOG_SIGMA].eta,
        )?;
        let row_mu_u = fast_av(self.xmu.as_ref(), &d_beta_u.slice(s![..split]));
        let row_ls_u = fast_av(self.x_ls.as_ref(), &d_beta_u.slice(s![split..]));
        let row_mu_v = fast_av(self.xmu.as_ref(), &d_beta_v.slice(s![..split]));
        let row_ls_v = fast_av(self.x_ls.as_ref(), &d_beta_v.slice(s![split..]));
        let weights = gaussian_jointsecond_directionalweights(
            &row_scalars,
            &row_mu_u,
            &row_ls_u,
            &row_mu_v,
            &row_ls_v,
        );
        let weight_mm = weights.0;
        let weight_ll = weights.2;
        // The Fisher cross block is identically 0 (μ ⊥ σ; #684), so its second
        // directional derivative vanishes as well; the dense path likewise
        // does not assemble `weights.1`.
        let weight_ml = Array1::<f64>::zeros(weight_mm.len());

        let operator = make_two_block_row_coeff_operator(
            self.xmu.clone(),
            self.x_ls.clone(),
            weight_mm,
            weight_ml,
            weight_ll,
            self.xmu.nrows(),
        );
        Ok(Some(Arc::new(operator)))
    }
}

/// Construct the `RowCoeffOperator` for the standard two-block GAMLSS layout:
/// one design per block (`x_a` for block 0, `x_b` for block 1) and three
/// per-row pair coefficients for the (a,a), (a,b) and (b,b) pairs.
///
/// The operator mirrors the dense assembly
/// `X_a^T diag(c_aa) X_a + X_a^T diag(c_ab) X_b + X_b^T diag(c_ab) X_a + X_b^T diag(c_bb) X_b`
/// emitted by `gaussian_joint_hessian_from_designs` (Gaussian path) and the
/// `xt_diag_*` block writers (binomial path).
fn make_two_block_row_coeff_operator(
    x_a: Arc<Array2<f64>>,
    x_b: Arc<Array2<f64>>,
    c_aa: Array1<f64>,
    c_ab: Array1<f64>,
    c_bb: Array1<f64>,
    nrows: usize,
) -> RowCoeffOperator {
    // Block widths are read before the designs are moved into the operator.
    let block_widths = vec![x_a.ncols(), x_b.ncols()];
    let block_designs = vec![(0, x_a), (1, x_b)];
    let pair_coefficients = vec![(0, 0, c_aa), (0, 1, c_ab), (1, 1, c_bb)];
    RowCoeffOperator::from_directions(block_widths, block_designs, pair_coefficients, nrows)
}

/// Construct a `DesignTwoBlockRowCoeffOperator` from two `DesignMatrix`
/// blocks and shared per-row pair coefficients.
///
/// # Errors
/// Returns a dimension-mismatch error unless `x_b` and all three coefficient
/// arrays have exactly as many rows/entries as `x_a`.
fn make_two_block_design_row_coeff_operator(
    x_a: DesignMatrix,
    x_b: DesignMatrix,
    c_aa: Arc<Array1<f64>>,
    c_ab: Arc<Array1<f64>>,
    c_bb: Arc<Array1<f64>>,
) -> Result<DesignTwoBlockRowCoeffOperator, String> {
    let nrows = x_a.nrows();
    let rows_agree = x_b.nrows() == nrows
        && c_aa.len() == nrows
        && c_ab.len() == nrows
        && c_bb.len() == nrows;
    if !rows_agree {
        let reason = format!(
            "two-block row coefficient operator dimension mismatch: rows a={}, b={}, coeffs={}/{}/{}",
            nrows,
            x_b.nrows(),
            c_aa.len(),
            c_ab.len(),
            c_bb.len()
        );
        return Err(GamlssError::DimensionMismatch { reason }.into());
    }

    // Widths are captured before the designs are moved into the operator.
    let pa = x_a.ncols();
    let dim = pa + x_b.ncols();
    Ok(DesignTwoBlockRowCoeffOperator {
        x_a,
        x_b,
        c_aa,
        c_ab,
        c_bb,
        dim,
        nrows,
        pa,
    })
}

/// Wiggle basis evaluations at the base predictor `q0`, together with the
/// derivatives of the wiggled predictor with respect to `q0`, as assembled by
/// `GaussianLocationScaleWiggleFamily::wiggle_geometry`.
struct GaussianLocationScaleWiggleGeometry {
    /// Wiggle basis at `q0` (value, no derivative).
    basis: Array2<f64>,
    /// First derivative of the wiggle basis with respect to `q0`.
    basis_d1: Array2<f64>,
    /// Second derivative of the wiggle basis with respect to `q0`.
    basis_d2: Array2<f64>,
    /// Third derivative of the wiggle basis with respect to `q0`.
    basis_d3: Array2<f64>,
    /// `basis_d1 · beta + 1`, one entry per row.
    dq_dq0: Array1<f64>,
    /// `basis_d2 · beta`, one entry per row.
    d2q_dq02: Array1<f64>,
    /// `basis_d3 · beta`, one entry per row.
    d3q_dq03: Array1<f64>,
    /// Fourth-derivative basis contracted with `beta`, one entry per row.
    d4q_dq04: Array1<f64>,
}

/// Per-row pieces of the 3-block Gaussian location-scale-wiggle joint
/// Hessian. Both the dense path and the matrix-free workspace share these
/// row coefficients; only the assembly differs.
///
/// The field notes below describe how `assemble_dense` consumes each piece.
struct GaussianLocationScaleWiggleHessianRowPieces {
    /// Row weights of the (μ, μ) block: `Xμᵀ diag(·) Xμ`.
    coeff_mm: Array1<f64>,
    /// Row weights of the (μ, log σ) block: `Xμᵀ diag(·) X_ls`.
    coeff_ml: Array1<f64>,
    /// Row weights of the (log σ, log σ) block: `X_lsᵀ diag(·) X_ls`.
    coeff_ll: Array1<f64>,
    /// Row weights pairing `Xμ` with the wiggle basis in the (μ, wiggle) block.
    coeff_mw_b: Array1<f64>,
    /// Row weights pairing `Xμ` with the first-derivative wiggle basis; added
    /// to the `coeff_mw_b` term to form the (μ, wiggle) block.
    coeff_mw_d: Array1<f64>,
    /// Row weights of the (log σ, wiggle) block: `X_lsᵀ diag(·) basis`.
    coeff_lw_b: Array1<f64>,
    /// Row weights of the (wiggle, wiggle) block: `basisᵀ diag(·) basis`.
    coeff_ww: Array1<f64>,
    /// Wiggle basis used as the wiggle-block design.
    basis: Array2<f64>,
    /// First-derivative wiggle basis, used only in the (μ, wiggle) block.
    basis_d1: Array2<f64>,
}

impl GaussianLocationScaleWiggleHessianRowPieces {
    /// Assemble the dense symmetric 3-block joint Hessian from the stored row
    /// coefficients and the supplied μ / log-σ designs.
    ///
    /// Each upper-triangle block is a weighted cross-product of the relevant
    /// designs; the (μ, wiggle) block is the sum of a basis term and a
    /// first-derivative-basis term. Packing and symmetrization are done by
    /// `gaussian_pack_wiggle_joint_symmetrichessian`.
    fn assemble_dense(&self, xmu: &Array2<f64>, x_ls: &Array2<f64>) -> Result<Array2<f64>, String> {
        let wiggle = &self.basis;
        let block_mm = xt_diag_x_dense(xmu, &self.coeff_mm)?;
        let block_ml = xt_diag_y_dense(xmu, &self.coeff_ml, x_ls)?;
        let block_ll = xt_diag_x_dense(x_ls, &self.coeff_ll)?;
        let mw_value_term = xt_diag_y_dense(xmu, &self.coeff_mw_b, wiggle)?;
        let mw_slope_term = xt_diag_y_dense(xmu, &self.coeff_mw_d, &self.basis_d1)?;
        let block_mw = mw_value_term + &mw_slope_term;
        let block_lw = xt_diag_y_dense(x_ls, &self.coeff_lw_b, wiggle)?;
        let block_ww = xt_diag_x_dense(wiggle, &self.coeff_ww)?;
        let packed = gaussian_pack_wiggle_joint_symmetrichessian(
            &block_mm, &block_ml, &block_mw, &block_ll, &block_lw, &block_ww,
        );
        Ok(packed)
    }
}

/// Return a copy of `mat` in which row `i` is multiplied by `coeffs[i]`.
///
/// # Errors
/// Returns a dimension-mismatch error when `coeffs` does not have exactly one
/// entry per row of `mat`.
fn scale_matrix_rows(mat: &Array2<f64>, coeffs: &Array1<f64>) -> Result<Array2<f64>, String> {
    let (n_rows, n_coeffs) = (mat.nrows(), coeffs.len());
    if n_rows != n_coeffs {
        let reason = format!(
            "row scaling dimension mismatch: matrix has {} rows but coeffs have {} entries",
            n_rows, n_coeffs
        );
        return Err(GamlssError::DimensionMismatch { reason }.into());
    }
    let mut scaled = Array2::<f64>::zeros(mat.dim());
    for ((row, col), slot) in scaled.indexed_iter_mut() {
        *slot = mat[[row, col]] * coeffs[row];
    }
    Ok(scaled)
}

/// Concatenate the μ, log-σ and wiggle score blocks, in that order, into one
/// flat joint score vector.
fn gaussian_pack_wiggle_joint_score(
    score_mu: &Array1<f64>,
    score_ls: &Array1<f64>,
    score_w: &Array1<f64>,
) -> Array1<f64> {
    let stacked = score_mu
        .iter()
        .chain(score_ls.iter())
        .chain(score_w.iter())
        .copied();
    Array1::from_iter(stacked)
}

/// Pack the six upper-triangle blocks of the (μ, log σ, wiggle) joint Hessian
/// into one dense matrix and mirror them into the lower triangle.
///
/// Block sizes are taken from the row counts of the three diagonal blocks
/// `h_mm`, `h_ll` and `h_ww`.
fn gaussian_pack_wiggle_joint_symmetrichessian(
    h_mm: &Array2<f64>,
    h_ml: &Array2<f64>,
    h_mw: &Array2<f64>,
    h_ll: &Array2<f64>,
    h_lw: &Array2<f64>,
    h_ww: &Array2<f64>,
) -> Array2<f64> {
    // Offsets at which the log-σ and wiggle blocks start in the stacked layout.
    let ls_start = h_mm.nrows();
    let w_start = ls_start + h_ll.nrows();
    let dim = w_start + h_ww.nrows();

    let mut packed = Array2::<f64>::zeros((dim, dim));
    // First block row: (μ,μ), (μ,ls), (μ,w).
    packed.slice_mut(s![..ls_start, ..ls_start]).assign(h_mm);
    packed
        .slice_mut(s![..ls_start, ls_start..w_start])
        .assign(h_ml);
    packed.slice_mut(s![..ls_start, w_start..dim]).assign(h_mw);
    // Second block row: (ls,ls), (ls,w).
    packed
        .slice_mut(s![ls_start..w_start, ls_start..w_start])
        .assign(h_ll);
    packed
        .slice_mut(s![ls_start..w_start, w_start..dim])
        .assign(h_lw);
    // Third block row: (w,w).
    packed
        .slice_mut(s![w_start..dim, w_start..dim])
        .assign(h_ww);
    mirror_upper_to_lower(&mut packed);
    packed
}

/// Gaussian location-scale family with an additional wiggle block.
///
/// The three parameter blocks are `mu`, `log_sigma` and `wiggle` (see the
/// `BLOCK_*` constants and `parameternames`).
pub struct GaussianLocationScaleWiggleFamily {
    /// Response values; passed to the per-row scalar computation.
    pub y: Array1<f64>,
    /// Observation weights; passed to the per-row scalar computation.
    pub weights: Array1<f64>,
    /// Design of the `mu` block, when the family carries it itself. Both this
    /// and `log_sigma_design` must be present for `exact_joint_supported`.
    pub mu_design: Option<DesignMatrix>,
    /// Design of the `log_sigma` block, when the family carries it itself.
    pub log_sigma_design: Option<DesignMatrix>,
    /// Knot vector handed to the monotone wiggle basis builder.
    pub wiggle_knots: Array1<f64>,
    /// Spline degree handed to the monotone wiggle basis builder.
    pub wiggle_degree: usize,
    /// Resource policy threaded into PsiDesignMap construction (and any other
    /// per-call materialization decision) made during exact-Newton joint psi
    /// derivative evaluation. Defaults to `ResourcePolicy::default_library()`
    /// when the family is built without an explicit policy.
    pub policy: crate::resource::ResourcePolicy,
    /// Optional cached row scalars keyed by six `f64` values.
    /// NOTE(review): the meaning of the six key values is not visible in this
    /// part of the file — confirm against the code that populates the cache.
    cached_row_scalars:
        std::sync::RwLock<Option<(f64, f64, f64, f64, f64, f64, Arc<GaussianJointRowScalars>)>>,
}

impl Clone for GaussianLocationScaleWiggleFamily {
    /// Deep-copy the family, including a snapshot of the row-scalar cache.
    ///
    /// The cache lives behind an `RwLock`, so it cannot be derived; the clone
    /// receives its own lock holding a copy of the currently cached entry.
    ///
    /// Cloning never panics on a poisoned cache lock: the cache only holds
    /// recomputable values, so if another thread panicked while holding the
    /// lock the clone simply starts with an empty cache.
    fn clone(&self) -> Self {
        let cache_snapshot = self
            .cached_row_scalars
            .read()
            .map(|guard| (*guard).clone())
            // Poisoned lock: drop the possibly half-written entry rather than
            // propagating the panic through `Clone`.
            .unwrap_or(None);
        Self {
            y: self.y.clone(),
            weights: self.weights.clone(),
            mu_design: self.mu_design.clone(),
            log_sigma_design: self.log_sigma_design.clone(),
            wiggle_knots: self.wiggle_knots.clone(),
            wiggle_degree: self.wiggle_degree,
            policy: self.policy.clone(),
            cached_row_scalars: std::sync::RwLock::new(cache_snapshot),
        }
    }
}

impl GaussianLocationScaleWiggleFamily {
    /// Index of the `mu` parameter block (first entry of `parameternames`).
    pub const BLOCK_MU: usize = 0;
    /// Index of the `log_sigma` parameter block (second entry of `parameternames`).
    pub const BLOCK_LOG_SIGMA: usize = 1;
    /// Index of the `wiggle` parameter block (third entry of `parameternames`).
    pub const BLOCK_WIGGLE: usize = 2;

    /// Names of the three parameter blocks, listed in `BLOCK_*` index order.
    pub fn parameternames() -> &'static [&'static str] {
        &["mu", "log_sigma", "wiggle"]
    }

    /// Link attached to each parameter block, in the same order as
    /// `parameternames`: identity for `mu`, log for `log_sigma`, and the
    /// wiggle link for `wiggle`.
    pub fn parameter_links() -> &'static [ParameterLink] {
        &[
            ParameterLink::Identity,
            ParameterLink::Log,
            ParameterLink::Wiggle,
        ]
    }

    /// Static family descriptor: the family name plus the block names and
    /// links returned by `parameternames` and `parameter_links`.
    pub fn metadata() -> FamilyMetadata {
        FamilyMetadata {
            name: "gaussian_location_scalewiggle",
            parameternames: Self::parameternames(),
            parameter_links: Self::parameter_links(),
        }
    }

    /// `true` only when the family carries both its `mu` and `log_sigma`
    /// designs itself.
    fn exact_joint_supported(&self) -> bool {
        matches!(
            (&self.mu_design, &self.log_sigma_design),
            (Some(_), Some(_))
        )
    }

    /// Evaluate the monotone wiggle basis at `q0` for the derivative order
    /// requested in `options`, using the family's knots and degree.
    fn wiggle_basiswith_options(
        &self,
        q0: ArrayView1<'_, f64>,
        options: BasisOptions,
    ) -> Result<Array2<f64>, String> {
        let order = options.derivative_order;
        let (knots, degree) = (&self.wiggle_knots, self.wiggle_degree);
        monotone_wiggle_basis_with_derivative_order(q0, knots, degree, order)
    }

    /// Wiggle basis evaluated at `q0` with the value (non-derivative) basis
    /// options; this is the design matrix of the wiggle block.
    fn wiggle_design(&self, q0: ArrayView1<'_, f64>) -> Result<Array2<f64>, String> {
        self.wiggle_basiswith_options(q0, BasisOptions::value())
    }

    /// First derivative of the wiggled predictor with respect to `q0`:
    /// the first-derivative basis contracted with `beta_link_wiggle`, plus 1.
    ///
    /// # Errors
    /// Fails when the basis cannot be built or when its column count differs
    /// from the number of wiggle coefficients.
    fn wiggle_dq_dq0(
        &self,
        q0: ArrayView1<'_, f64>,
        beta_link_wiggle: ArrayView1<'_, f64>,
    ) -> Result<Array1<f64>, String> {
        let slope_basis = self.wiggle_basiswith_options(q0, BasisOptions::first_derivative())?;
        let (n_cols, n_coeffs) = (slope_basis.ncols(), beta_link_wiggle.len());
        if n_cols == n_coeffs {
            return Ok(slope_basis.dot(&beta_link_wiggle) + 1.0);
        }
        Err(GamlssError::DimensionMismatch { reason: format!(
            "wiggle derivative/beta mismatch: basis has {} columns but beta_link_wiggle has {} coefficients",
            n_cols,
            n_coeffs
        ) }.into())
    }

    /// Second derivative of the wiggled predictor with respect to `q0`:
    /// the second-derivative basis contracted with `beta_link_wiggle`.
    ///
    /// # Errors
    /// Fails when the basis cannot be built or when its column count differs
    /// from the number of wiggle coefficients.
    fn wiggle_d2q_dq02(
        &self,
        q0: ArrayView1<'_, f64>,
        beta_link_wiggle: ArrayView1<'_, f64>,
    ) -> Result<Array1<f64>, String> {
        let curvature_basis =
            self.wiggle_basiswith_options(q0, BasisOptions::second_derivative())?;
        let (n_cols, n_coeffs) = (curvature_basis.ncols(), beta_link_wiggle.len());
        if n_cols == n_coeffs {
            return Ok(curvature_basis.dot(&beta_link_wiggle));
        }
        Err(GamlssError::DimensionMismatch { reason: format!(
            "wiggle second-derivative/beta mismatch: basis has {} columns but beta_link_wiggle has {} coefficients",
            n_cols,
            n_coeffs
        ) }.into())
    }

    /// Third-derivative (order 3) monotone wiggle basis at `q0`, built
    /// directly from the family's knots and degree.
    fn wiggle_d3basis_constrained(&self, q0: ArrayView1<'_, f64>) -> Result<Array2<f64>, String> {
        monotone_wiggle_basis_with_derivative_order(q0, &self.wiggle_knots, self.wiggle_degree, 3)
    }

    /// Third derivative of the wiggled predictor with respect to `q0`:
    /// the third-derivative basis contracted with `beta_link_wiggle`.
    ///
    /// # Errors
    /// Fails when the basis cannot be built or when its column count differs
    /// from the number of wiggle coefficients.
    fn wiggle_d3q_dq03(
        &self,
        q0: ArrayView1<'_, f64>,
        beta_link_wiggle: ArrayView1<'_, f64>,
    ) -> Result<Array1<f64>, String> {
        let third_basis = self.wiggle_d3basis_constrained(q0)?;
        let (n_cols, n_coeffs) = (third_basis.ncols(), beta_link_wiggle.len());
        if n_cols == n_coeffs {
            return Ok(third_basis.dot(&beta_link_wiggle));
        }
        Err(GamlssError::DimensionMismatch { reason: format!(
            "wiggle third-derivative/beta mismatch: basis has {} columns but beta_link_wiggle has {} coefficients",
            n_cols,
            n_coeffs
        ) }.into())
    }

    /// Fourth derivative of the wiggled predictor with respect to `q0`:
    /// the order-4 basis contracted with `beta_link_wiggle`.
    ///
    /// # Errors
    /// Fails when the basis cannot be built or when its column count differs
    /// from the number of wiggle coefficients.
    fn wiggle_d4q_dq04(
        &self,
        q0: ArrayView1<'_, f64>,
        beta_link_wiggle: ArrayView1<'_, f64>,
    ) -> Result<Array1<f64>, String> {
        let (knots, degree) = (&self.wiggle_knots, self.wiggle_degree);
        let fourth_basis = monotone_wiggle_basis_with_derivative_order(q0, knots, degree, 4)?;
        let (n_cols, n_coeffs) = (fourth_basis.ncols(), beta_link_wiggle.len());
        if n_cols == n_coeffs {
            return Ok(fourth_basis.dot(&beta_link_wiggle));
        }
        Err(GamlssError::DimensionMismatch { reason: format!(
            "wiggle fourth-derivative/beta mismatch: basis has {} columns but beta_link_wiggle has {} coefficients",
            n_cols,
            n_coeffs
        ) }.into())
    }

    fn wiggle_geometry(
        &self,
        q0: ArrayView1<'_, f64>,
        beta_link_wiggle: ArrayView1<'_, f64>,
    ) -> Result<GaussianLocationScaleWiggleGeometry, String> {
        let basis = self.wiggle_design(q0)?;
        let basis_d1 = self.wiggle_basiswith_options(q0, BasisOptions::first_derivative())?;
        let basis_d2 = self.wiggle_basiswith_options(q0, BasisOptions::second_derivative())?;
        let basis_d3 = self.wiggle_d3basis_constrained(q0)?;
        let dq_dq0 = self.wiggle_dq_dq0(q0, beta_link_wiggle)?;
        let d2q_dq02 = self.wiggle_d2q_dq02(q0, beta_link_wiggle)?;
        let d3q_dq03 = self.wiggle_d3q_dq03(q0, beta_link_wiggle)?;
        let d4q_dq04 = self.wiggle_d4q_dq04(q0, beta_link_wiggle)?;
        Ok(GaussianLocationScaleWiggleGeometry {
            basis,
            basis_d1,
            basis_d2,
            basis_d3,
            dq_dq0,
            d2q_dq02,
            d3q_dq03,
            d4q_dq04,
        })
    }

    /// Compute the per-row joint scalars for the predictor `q` and the log-σ
    /// predictor `eta_ls` from the family's `y` and `weights`, and return
    /// them in a fresh `Arc`.
    ///
    /// NOTE(review): despite the `get_or_compute` name, this body neither
    /// reads nor writes `cached_row_scalars`; every call recomputes. Confirm
    /// whether bypassing the cache is intentional.
    fn get_or_compute_row_scalars(
        &self,
        q: &Array1<f64>,
        eta_ls: &Array1<f64>,
    ) -> Result<Arc<GaussianJointRowScalars>, String> {
        Ok(Arc::new(gaussian_jointrow_scalars(
            &self.y,
            q,
            eta_ls,
            &self.weights,
        )?))
    }

    /// Dense `(mu, log_sigma)` designs obtained from the designs stored on the
    /// family, via `dense_locscale_block_designs_cached` and the family's
    /// material policy.
    ///
    /// The string arguments are labels forwarded to the helper (presumably
    /// used in its error messages — confirm against the helper).
    fn dense_block_designs(&self) -> Result<(Cow<'_, Array2<f64>>, Cow<'_, Array2<f64>>), String> {
        dense_locscale_block_designs_cached(
            self.mu_design.as_ref(),
            self.log_sigma_design.as_ref(),
            "GaussianLocationScaleWiggleFamily",
            "GaussianLocationScaleWiggle",
            "mu",
            &self.policy.material_policy(),
        )
    }
    /// Dense `(mu, log_sigma)` designs obtained from the caller's block
    /// `specs` rather than from designs stored on the family, via
    /// `dense_locscale_block_designs_fromspecs`.
    ///
    /// The helper is given the `mu` / `log_sigma` block indices and the
    /// family's material policy. The literal `3` is presumably the expected
    /// number of parameter blocks, and the strings presumably label error
    /// messages — confirm both against the helper.
    fn dense_block_designs_fromspecs<'a>(
        &self,
        specs: &'a [ParameterBlockSpec],
    ) -> Result<(Cow<'a, Array2<f64>>, Cow<'a, Array2<f64>>), String> {
        dense_locscale_block_designs_fromspecs(
            specs,
            3,
            "GaussianLocationScaleWiggleFamily",
            "GaussianLocationScaleWiggle",
            Self::BLOCK_MU,
            Self::BLOCK_LOG_SIGMA,
            "mu",
            &self.policy.material_policy(),
        )
    }

    /// Resolve the dense `(mu, log_sigma)` designs for the exact joint path.
    ///
    /// Designs stored on the family take precedence; otherwise they are taken
    /// from `specs` when provided. Returns `Ok(None)` when neither source is
    /// available.
    fn exact_joint_dense_block_designs<'a>(
        &'a self,
        specs: Option<&'a [ParameterBlockSpec]>,
    ) -> Result<Option<(Cow<'a, Array2<f64>>, Cow<'a, Array2<f64>>)>, String> {
        if self.exact_joint_supported() {
            self.dense_block_designs().map(Some)
        } else {
            match specs {
                Some(block_specs) => self.dense_block_designs_fromspecs(block_specs).map(Some),
                None => Ok(None),
            }
        }
    }

    /// Build the [`BlockEffectiveJacobian`] for block `block_idx`.
    ///
    /// The family has two output predictors and is described to the shared
    /// layout helper as two additive blocks plus one wiggle block:
    ///
    /// - block 0 (mu):        output 0 = design rows, output 1 = zeros
    /// - block 1 (log_sigma): output 0 = zeros, output 1 = design rows
    /// - block 2 (wiggle):    all zeros (nonlinear link modulation)
    ///
    /// The wiggle block modulates the inverse link nonlinearly and adds no
    /// linear term to either output η, so its Jacobian is an
    /// `(2 * n, p_wiggle)` zero matrix.
    pub fn block_effective_jacobian(
        specs: &[ParameterBlockSpec],
        block_idx: usize,
    ) -> Result<Box<dyn BlockEffectiveJacobian>, String> {
        let layout = crate::util::block_jacobian::AdditiveWiggleBlockLayout {
            family: "GaussianLocationScaleWiggleFamily",
            n_outputs: 2,
            additive_blocks: &[Self::BLOCK_MU, Self::BLOCK_LOG_SIGMA],
            wiggle_block: Some(Self::BLOCK_WIGGLE),
        };
        layout.block_effective_jacobian(specs, block_idx)
    }
}

/// Row-coefficient bundle for the GLS Wiggle joint second directional
/// derivative, shared by the matrix-free operator and the dense
/// `_from_designs` assemblies. Holds exactly the quantities both consumers
/// read downstream of the (identical) coefficient computation.
///
/// Suffixes: `_u` / `_v` are first-order pieces along each probe direction,
/// `_uv` the mixed second-order piece. The field notes describe how
/// `gls_wiggle_second_directional_coeffs` fills each entry.
struct GlsWiggleSecondDirCoeffs {
    /// Second directional derivative of the (μ, μ) row coefficient.
    coeff_mm_uv: Array1<f64>,
    /// (μ, log σ) row coefficient; filled with zeros.
    coeff_ml_uv: Array1<f64>,
    /// Second directional derivative of the (log σ, log σ) row coefficient.
    coeff_ll_uv: Array1<f64>,
    /// `dw_u · dq_dq0 + w · s1_u`.
    a_u: Array1<f64>,
    /// `dw_v · dq_dq0 + w · s1_v`.
    a_v: Array1<f64>,
    /// `dw_uv · dq_dq0 + dw_u · s1_v + dw_v · s1_u + w · s1_uv`.
    a_uv: Array1<f64>,
    /// Negated `dm_u`.
    c_u: Array1<f64>,
    /// Negated `dm_v`.
    c_v: Array1<f64>,
    /// Negated `dm_uv`.
    c_uv: Array1<f64>,
    /// (log σ, wiggle) piece along `u`; filled with zeros.
    l_u: Array1<f64>,
    /// (log σ, wiggle) piece along `v`; filled with zeros.
    l_v: Array1<f64>,
    /// (log σ, wiggle) mixed piece; filled with zeros.
    l_uv: Array1<f64>,
    /// `-2 · w · κ · zeta_u`.
    dw_u: Array1<f64>,
    /// `-2 · w · κ · zeta_v`.
    dw_v: Array1<f64>,
    /// `4 · w · κ² · zeta_u·zeta_v − 2 · w · κ' · zeta_u·zeta_v`.
    dw_uv: Array1<f64>,
}

/// The two probe directions resolved to row space for the GLS Wiggle joint
/// second directional derivative.
///
/// `zeta_u` / `zeta_v` are the X_ls contractions of the two directions, and
/// `q` / `s1` / `g2` are the mixed first/second-derivative wiggle pieces
/// (`_u`, `_v` per direction, `_uv` mixed). This struct carries no separate
/// X_mu contraction (`xi`) member.
struct GlsWiggleDirPieces<'a> {
    /// Direction `u` on the log-σ predictor.
    zeta_u: &'a Array1<f64>,
    /// Direction `v` on the log-σ predictor.
    zeta_v: &'a Array1<f64>,
    /// `q` piece along `u`; multiplies the row scalar `w` in `dm_u`.
    q_u: &'a Array1<f64>,
    /// `q` piece along `v`; multiplies the row scalar `w` in `dm_v`.
    q_v: &'a Array1<f64>,
    /// Mixed `q` piece; enters `dm_uv`.
    q_uv: &'a Array1<f64>,
    /// `s1` piece along `u`; paired with `dq_dq0` in the (μ, μ) coefficient.
    s1_u: &'a Array1<f64>,
    /// `s1` piece along `v`.
    s1_v: &'a Array1<f64>,
    /// Mixed `s1` piece.
    s1_uv: &'a Array1<f64>,
    /// `g2` piece along `u`; paired with `d2q_dq02` in the (μ, μ) coefficient.
    g2_u: &'a Array1<f64>,
    /// `g2` piece along `v`.
    g2_v: &'a Array1<f64>,
    /// Mixed `g2` piece.
    g2_uv: &'a Array1<f64>,
}

/// Compute the shared GLS Wiggle second-directional row coefficients from the
/// per-row scalars, wiggle geometry, and the resolved probe directions.
///
/// All outputs are per-row arrays. `dw_*` and `dm_*` follow the pattern of
/// first (`_u`, `_v`) and mixed second (`_uv`) directional derivatives of the
/// row scalars `rows.w` and `rows.m`. NOTE(review): that reading is inferred
/// from the algebra below (product/chain rule with `κ`, `κ'`) — confirm
/// against the derivation before relying on it.
fn gls_wiggle_second_directional_coeffs(
    rows: &GaussianJointRowScalars,
    geom: &GaussianLocationScaleWiggleGeometry,
    dir: &GlsWiggleDirPieces<'_>,
) -> GlsWiggleSecondDirCoeffs {
    // Copy the borrowed per-row direction arrays out of the bundle.
    let GlsWiggleDirPieces {
        zeta_u,
        zeta_v,
        q_u,
        q_v,
        q_uv,
        s1_u,
        s1_v,
        s1_uv,
        g2_u,
        g2_v,
        g2_uv,
    } = *dir;
    // κ-scaled log-σ directions and their row-wise product, reused below.
    let szeta_u = &rows.kappa * zeta_u;
    let szeta_v = &rows.kappa * zeta_v;
    let zeta_u_zeta_v = zeta_u * zeta_v;
    let dw_u = -2.0 * &rows.w * &szeta_u;
    let dw_v = -2.0 * &rows.w * &szeta_v;
    let dw_uv =
        4.0 * &rows.w * &(&szeta_u * &szeta_v) - 2.0 * &rows.w * &rows.kappa_prime * &zeta_u_zeta_v;
    let dm_u = -(&rows.w * q_u) - &(2.0 * &rows.m * &szeta_u);
    let dm_v = -(&rows.w * q_v) - &(2.0 * &rows.m * &szeta_v);
    let dm_uv = &(2.0 * &rows.w * &(q_u * &szeta_v + q_v * &szeta_u)) - &(&rows.w * q_uv)
        + &(4.0 * &rows.m * &(&szeta_u * &szeta_v))
        - 2.0 * &rows.m * &rows.kappa_prime * &zeta_u_zeta_v;
    // Mixed second-order expansion of `w·dq_dq0² − m·d2q_dq02`, with
    // `s1_*` / `g2_*` standing in for the direction pieces of
    // `dq_dq0` / `d2q_dq02`.
    let coeff_mm_uv = &(&dw_uv * &geom.dq_dq0.mapv(|v| v * v))
        + &(2.0 * &dw_u * &geom.dq_dq0 * s1_v)
        + &(2.0 * &dw_v * &geom.dq_dq0 * s1_u)
        + &(2.0 * &rows.w * s1_u * s1_v)
        + &(2.0 * &rows.w * &geom.dq_dq0 * s1_uv)
        - &(&dm_uv * &geom.d2q_dq02)
        - &(&dm_u * g2_v)
        - &(&dm_v * g2_u)
        - &(&rows.m * g2_uv);
    let n = rows.m.len();
    // H_{μ,ls} ≡ Fisher 0 (mean⊥scale orthogonality; the wiggle and μ both
    // enter the mean, log σ is the only scale block), so every β-directional
    // derivative — including this second-order one — is identically 0.
    let coeff_ml_uv = Array1::<f64>::zeros(n);
    // Second directional derivative of the Fisher (log σ, log σ) block
    // coeff_ll = 2κ²a (#566). η_ls is linear in β (no zeta_uv), so the only
    // surviving term is ∂²(2κ²a)/∂η² · zeta_u·zeta_v = 4a(κ'²+κκ'')·zeta_u·zeta_v
    // — matching the dense helper `d_uv` (gaussian_jointsecond_directionalweights).
    let coeff_ll_uv = 4.0
        * &rows.obs_weight
        * &(&rows.kappa_prime * &rows.kappa_prime + &rows.kappa * &rows.kappa_dprime)
        * &zeta_u_zeta_v;

    // `a_*`: first- and second-order pieces of the product `w · dq_dq0`.
    let a_u = &dw_u * &geom.dq_dq0 + &rows.w * s1_u;
    let a_v = &dw_v * &geom.dq_dq0 + &rows.w * s1_v;
    let a_uv = &dw_uv * &geom.dq_dq0 + &dw_u * s1_v + &dw_v * s1_u + &rows.w * s1_uv;
    // `c_*`: the negated `dm_*` terms.
    let c_u = -&dm_u;
    let c_v = -&dm_v;
    let c_uv = -&dm_uv;
    // H_{ls,w} ≡ Fisher 0 (wiggle is mean-side; mean⊥scale), so all of its
    // β-directional derivatives are 0.
    let l_u = Array1::<f64>::zeros(n);
    let l_v = Array1::<f64>::zeros(n);
    let l_uv = Array1::<f64>::zeros(n);

    GlsWiggleSecondDirCoeffs {
        coeff_mm_uv,
        coeff_ml_uv,
        coeff_ll_uv,
        a_u,
        a_v,
        a_uv,
        c_u,
        c_v,
        c_uv,
        l_u,
        l_v,
        l_uv,
        dw_u,
        dw_v,
        dw_uv,
    }
}

impl GaussianLocationScaleWiggleFamily {
    /// Dense joint Hessian, keyed by the optional block specs.
    ///
    /// Asks `exact_joint_dense_block_designs` for the `(xmu, x_ls)` design
    /// pair. When that lookup yields `None` this returns `Ok(None)`;
    /// otherwise the work is handed to
    /// `exact_newton_joint_hessian_from_designs`. Errors from either call are
    /// propagated unchanged.
    fn exact_newton_joint_hessian_for_specs(
        &self,
        block_states: &[ParameterBlockState],
        specs: Option<&[ParameterBlockSpec]>,
    ) -> Result<Option<Array2<f64>>, String> {
        match self.exact_joint_dense_block_designs(specs)? {
            None => Ok(None),
            Some((xmu, x_ls)) => {
                self.exact_newton_joint_hessian_from_designs(block_states, &xmu, &x_ls)
            }
        }
    }

    /// First directional derivative of the dense joint Hessian along
    /// `d_beta_flat`, keyed by the optional block specs.
    ///
    /// Returns `Ok(None)` when `exact_joint_dense_block_designs` has no dense
    /// `(xmu, x_ls)` pair to offer; otherwise forwards to the `_from_designs`
    /// variant and returns its result as-is.
    fn exact_newton_joint_hessian_directional_derivative_for_specs(
        &self,
        block_states: &[ParameterBlockState],
        specs: Option<&[ParameterBlockSpec]>,
        d_beta_flat: &Array1<f64>,
    ) -> Result<Option<Array2<f64>>, String> {
        match self.exact_joint_dense_block_designs(specs)? {
            None => Ok(None),
            Some((xmu, x_ls)) => self
                .exact_newton_joint_hessian_directional_derivative_from_designs(
                    block_states,
                    &xmu,
                    &x_ls,
                    d_beta_flat,
                ),
        }
    }

    /// Second directional derivative of the dense joint Hessian along the
    /// pair `(d_beta_u_flat, d_beta_v_flat)`, keyed by the optional block
    /// specs.
    ///
    /// Returns `Ok(None)` when `exact_joint_dense_block_designs` has no dense
    /// `(xmu, x_ls)` pair to offer; otherwise forwards to the `_from_designs`
    /// variant and returns its result as-is.
    fn exact_newton_joint_hessian_second_directional_derivative_for_specs(
        &self,
        block_states: &[ParameterBlockState],
        specs: Option<&[ParameterBlockSpec]>,
        d_beta_u_flat: &Array1<f64>,
        d_beta_v_flat: &Array1<f64>,
    ) -> Result<Option<Array2<f64>>, String> {
        match self.exact_joint_dense_block_designs(specs)? {
            None => Ok(None),
            Some((xmu, x_ls)) => self
                .exact_newton_joint_hessiansecond_directional_derivative_from_designs(
                    block_states,
                    &xmu,
                    &x_ls,
                    d_beta_u_flat,
                    d_beta_v_flat,
                ),
        }
    }

    /// Resolve the ψ-direction with index `psi_index` for this family.
    ///
    /// Delegates to `locscale_joint_psi_direction_parts`, passing the
    /// observation count, the column counts of `xmu` / `x_ls`, the mu and
    /// log-sigma block indices, the expected block count (3) and the labels
    /// used in that helper's diagnostics. A `None` from the helper is passed
    /// through; a `Some` is repackaged field-by-field into a
    /// `LocationScaleJointPsiDirection`.
    fn exact_newton_joint_psi_direction(
        &self,
        block_states: &[ParameterBlockState],
        derivative_blocks: &[Vec<crate::custom_family::CustomFamilyBlockPsiDerivative>],
        psi_index: usize,
        xmu: &Array2<f64>,
        x_ls: &Array2<f64>,
        policy: &crate::resource::ResourcePolicy,
    ) -> Result<Option<LocationScaleJointPsiDirection>, String> {
        let resolved = locscale_joint_psi_direction_parts(
            block_states,
            derivative_blocks,
            psi_index,
            self.y.len(),
            xmu.ncols(),
            x_ls.ncols(),
            Self::BLOCK_MU,
            Self::BLOCK_LOG_SIGMA,
            3,
            "GaussianLocationScaleWiggleFamily",
            "mu",
            policy,
        )?;
        Ok(resolved.map(|parts| LocationScaleJointPsiDirection {
            block_idx: parts.block_idx,
            local_idx: parts.local_idx,
            z_primary_psi: parts.primary_z,
            z_ls_psi: parts.log_sigma_z,
            x_primary_psi: parts.primary_psi,
            x_ls_psi: parts.log_sigma_psi,
        }))
    }

    /// Second-order ψ design drifts for the direction pair `(psi_a, psi_b)`.
    ///
    /// Thin adapter over `locscale_joint_psisecond_design_drifts`: it fills in
    /// a `LocScalePsiDriftConfig` with this family's observation count, the
    /// column counts of `xmu` / `x_ls`, the mu and log-sigma block indices,
    /// the diagnostic labels and the family's own resource policy, then
    /// returns the helper's result unchanged.
    fn exact_newton_joint_psisecond_design_drifts(
        &self,
        block_states: &[ParameterBlockState],
        derivative_blocks: &[Vec<crate::custom_family::CustomFamilyBlockPsiDerivative>],
        psi_a: &LocationScaleJointPsiDirection,
        psi_b: &LocationScaleJointPsiDirection,
        xmu: &Array2<f64>,
        x_ls: &Array2<f64>,
    ) -> Result<LocationScaleJointPsiSecondDrifts, String> {
        let drift_config = LocScalePsiDriftConfig {
            n: self.y.len(),
            p_primary: xmu.ncols(),
            p_log_sigma: x_ls.ncols(),
            primary_block_idx: Self::BLOCK_MU,
            log_sigma_block_idx: Self::BLOCK_LOG_SIGMA,
            family_name: "GaussianLocationScaleWiggleFamily",
            primary_label: "mu",
            policy: &self.policy,
        };
        locscale_joint_psisecond_design_drifts(
            block_states,
            derivative_blocks,
            psi_a,
            psi_b,
            drift_config,
        )
    }

    /// Per-row Hessian coefficients used by both the dense assembly and the
    /// matrix-free workspace operator: one set of row weights either rebuilds
    /// the dense p×p matrix or applies `Hv` without materialising it.
    ///
    /// # Errors
    /// Returns a `GamlssError::DimensionMismatch` (converted into the `String`
    /// error type) when `block_states` does not hold exactly 3 blocks, when
    /// any linear predictor or the weight vector disagrees with `self.y` in
    /// length, or when the wiggle basis column count differs from the wiggle
    /// coefficient count. Failures of `wiggle_geometry` and
    /// `get_or_compute_row_scalars` are propagated.
    fn wiggle_hessian_row_pieces(
        &self,
        block_states: &[ParameterBlockState],
    ) -> Result<GaussianLocationScaleWiggleHessianRowPieces, String> {
        let n_blocks = block_states.len();
        if n_blocks != 3 {
            return Err(GamlssError::DimensionMismatch {
                reason: format!(
                    "GaussianLocationScaleWiggleFamily expects 3 blocks, got {}",
                    n_blocks
                ),
            }
            .into());
        }
        let mean_state = &block_states[Self::BLOCK_MU];
        let scale_state = &block_states[Self::BLOCK_LOG_SIGMA];
        let wiggle_state = &block_states[Self::BLOCK_WIGGLE];
        let n = self.y.len();
        let lengths_agree = mean_state.eta.len() == n
            && scale_state.eta.len() == n
            && wiggle_state.eta.len() == n
            && self.weights.len() == n;
        if !lengths_agree {
            return Err(GamlssError::DimensionMismatch {
                reason: "GaussianLocationScaleWiggleFamily input size mismatch".to_string(),
            }
            .into());
        }
        // Full mean predictor: base index plus the wiggle contribution.
        let q = &mean_state.eta + &wiggle_state.eta;
        let geom = self.wiggle_geometry(mean_state.eta.view(), wiggle_state.beta.view())?;
        let basis_cols = geom.basis.ncols();
        let n_wiggle_coeffs = wiggle_state.beta.len();
        if basis_cols != n_wiggle_coeffs {
            return Err(GamlssError::DimensionMismatch { reason: format!(
                "GaussianLocationScaleWiggleFamily wiggle basis/beta mismatch: basis has {} columns but beta has {} entries",
                basis_cols,
                n_wiggle_coeffs
            ) }.into());
        }
        let rows = self.get_or_compute_row_scalars(&q, &scale_state.eta)?;

        // Mean-side blocks keep their full row weights.
        let dq_squared = geom.dq_dq0.mapv(|d| d * d);
        let mean_mean = &rows.w * &dq_squared - &rows.m * &geom.d2q_dq02;
        let mean_wiggle_basis = &rows.w * &geom.dq_dq0;
        let mean_wiggle_deriv = -&rows.m;
        let wiggle_wiggle = rows.w.clone();

        // Mean⊥scale Fisher orthogonality for the Gaussian: μ and the wiggle
        // both feed the mean q = q0 + B(q0)·βw, while log σ is the only
        // scale-side block. The observed crosses H_{μ,ls} = 2κm·dq_dq0 and
        // H_{ls,w} = 2κm both carry m = r·w = (y−q)·weight/σ², whose
        // expectation is 0, so the expected cross information vanishes. These
        // row pieces are shared by the dense and matrix-free paths, hence
        // zeroing the cross weights here replaces the observed 2κm curvature
        // in both at once.
        let mean_scale_cross = Array1::<f64>::zeros(n);
        let scale_wiggle_cross = Array1::<f64>::zeros(n);

        // Expected (log σ, log σ) information E[H_{ls,ls}] = 2κ²a (#566). The
        // observed form 2κ²n + κ'(a−n) collapses at small residuals and
        // over-smooths the scale; with E[n] = a it becomes the residual-free
        // 2κ²a.
        let scale_scale = 2.0 * &rows.kappa * &rows.kappa * &rows.obs_weight;

        Ok(GaussianLocationScaleWiggleHessianRowPieces {
            coeff_mm: mean_mean,
            coeff_ml: mean_scale_cross,
            coeff_ll: scale_scale,
            coeff_mw_b: mean_wiggle_basis,
            coeff_mw_d: mean_wiggle_deriv,
            coeff_lw_b: scale_wiggle_cross,
            coeff_ww: wiggle_wiggle,
            basis: geom.basis,
            basis_d1: geom.basis_d1,
        })
    }

    /// Dense joint Hessian from already-resolved designs.
    ///
    /// Builds the shared row pieces with `wiggle_hessian_row_pieces` and lets
    /// them assemble the dense matrix against `xmu` / `x_ls`. On success the
    /// result is always `Some`; errors from either step are propagated.
    fn exact_newton_joint_hessian_from_designs(
        &self,
        block_states: &[ParameterBlockState],
        xmu: &Array2<f64>,
        x_ls: &Array2<f64>,
    ) -> Result<Option<Array2<f64>>, String> {
        let dense = self
            .wiggle_hessian_row_pieces(block_states)?
            .assemble_dense(xmu, x_ls)?;
        Ok(Some(dense))
    }

    /// Dense first directional derivative of the joint Hessian along
    /// `d_beta_flat`, assembled block-by-block from the supplied designs.
    ///
    /// `d_beta_flat` is split into its (mu, log σ, wiggle) pieces using the
    /// column counts of `xmu`, `x_ls` and the wiggle coefficient length. The
    /// six blocks (mm, ml, ll, mw, lw, ww) are then formed with the
    /// `xt_diag_*` helpers and packed by
    /// `gaussian_pack_wiggle_joint_symmetrichessian`.
    ///
    /// The matrix-free twin of this routine is
    /// `gls_wiggle_directional_operator`; its pair list is annotated with the
    /// dense terms built here.
    ///
    /// # Errors
    /// Returns a `GamlssError::DimensionMismatch` (as `String`) when
    /// `block_states` does not hold exactly 3 blocks or when a predictor /
    /// weight length differs from `self.y.len()`. Errors reported by
    /// `split_three`, `wiggle_geometry`, `get_or_compute_row_scalars`,
    /// `scale_matrix_rows` and the `xt_diag_*` helpers are propagated.
    ///
    /// On success the result is always `Some`.
    fn exact_newton_joint_hessian_directional_derivative_from_designs(
        &self,
        block_states: &[ParameterBlockState],
        xmu: &Array2<f64>,
        x_ls: &Array2<f64>,
        d_beta_flat: &Array1<f64>,
    ) -> Result<Option<Array2<f64>>, String> {
        if block_states.len() != 3 {
            return Err(GamlssError::DimensionMismatch {
                reason: format!(
                    "GaussianLocationScaleWiggleFamily expects 3 blocks, got {}",
                    block_states.len()
                ),
            }
            .into());
        }
        let pmu = xmu.ncols();
        let p_ls = x_ls.ncols();
        let q0 = &block_states[Self::BLOCK_MU].eta;
        let eta_ls = &block_states[Self::BLOCK_LOG_SIGMA].eta;
        let etaw = &block_states[Self::BLOCK_WIGGLE].eta;
        let betaw = &block_states[Self::BLOCK_WIGGLE].beta;
        let n = self.y.len();
        // Split the flat direction into its mu / log σ / wiggle sub-vectors.
        let layout = GamlssBetaLayout::withwiggle(pmu, p_ls, betaw.len());
        let (umu, u_ls, uw) = layout.split_three(
            d_beta_flat,
            "GaussianLocationScaleWiggleFamily exact joint directional Hessian",
        )?;
        if q0.len() != n || eta_ls.len() != n || etaw.len() != n || self.weights.len() != n {
            return Err(GamlssError::DimensionMismatch {
                reason: "GaussianLocationScaleWiggleFamily input size mismatch".to_string(),
            }
            .into());
        }
        let q = q0 + etaw;
        let geom = self.wiggle_geometry(q0.view(), betaw.view())?;
        let rows = self.get_or_compute_row_scalars(&q, eta_ls)?;
        // Row-wise images of the direction under each design:
        // xi = X_mu·u_mu, zeta = X_ls·u_ls, phi = B·u_w.
        let xi = fast_av(xmu, &umu);
        let zeta = fast_av(x_ls, &u_ls);
        // logb κ-scaled η_ls direction; κ' = dκ/dη_ls = κ(1−κ).
        let szeta = &rows.kappa * &zeta;
        let phi = fast_av(&geom.basis, &uw);
        // q_u / s1_u / g2_u pair each q0-derivative of q (orders 1, 2, 3)
        // scaled by xi with the matching basis (B, B', B'') applied to u_w.
        let mut q_u = &geom.dq_dq0 * &xi;
        q_u += &phi;
        let mut s1_u = &geom.d2q_dq02 * &xi;
        s1_u += &fast_av(&geom.basis_d1, &uw);
        let mut g2_u = &geom.d3q_dq03 * &xi;
        g2_u += &fast_av(&geom.basis_d2, &uw);
        // basis_u = diag(xi)·B' and basis1_u = diag(xi)·B'': the basis and its
        // first derivative shifted along the mu direction.
        let basis_u = scale_matrix_rows(&geom.basis_d1, &xi)?;
        let basis1_u = scale_matrix_rows(&geom.basis_d2, &xi)?;
        // Directional changes of the row scalars w and m.
        let dw_u = -2.0 * &rows.w * &szeta;
        let dm_u = -(&rows.w * &q_u) - &(2.0 * &rows.m * &szeta);

        // Derivative of the static (mu, mu) weight w·dq_dq0² − m·d2q_dq02.
        let coeff_mm_u = &(&dw_u * &geom.dq_dq0.mapv(|v| v * v))
            + &(2.0 * &rows.w * &geom.dq_dq0 * &s1_u)
            - &(&dm_u * &geom.d2q_dq02)
            - &(&rows.m * &g2_u);
        // Static blocks: H_{μ,ls} = Fisher 0 (mean⊥scale); H_{ls,ls} = Fisher
        // 2κ²a (#566). H_{μ,ls} ≡ 0 for all β, so its directional derivative is
        // also identically 0. The Fisher (ls,ls) block 2κ²a depends only on
        // η_ls (a is the constant prior weight), so its directional derivative
        // is 4κκ'a·zeta.
        let coeff_ml_u = Array1::<f64>::zeros(n);
        let coeff_ll_u = 4.0 * &rows.kappa * &rows.kappa_prime * &(&zeta * &rows.obs_weight);
        // Row weights for the (mu, wiggle) block: a_u multiplies B, c_u
        // multiplies B'.
        let a_u = &dw_u * &geom.dq_dq0 + &rows.w * &s1_u;
        let c_u = -&dm_u;
        // ls↔wiggle cross block: Fisher 0 (wiggle is mean-side), so its
        // directional derivative is 0 as well.
        let l_u = Array1::<f64>::zeros(n);
        let zeros_ls_b1 = Array1::<f64>::zeros(n);

        let h_mm = xt_diag_x_dense(xmu, &coeff_mm_u)?;
        let h_ml = xt_diag_y_dense(xmu, &coeff_ml_u, x_ls)?;
        let h_ll = xt_diag_x_dense(x_ls, &coeff_ll_u)?;
        // (mu, wiggle): product-rule terms over the row weights (a_u, c_u) and
        // over the shifted bases (basis_u, basis1_u).
        let h_mw = xt_diag_y_dense(xmu, &a_u, &geom.basis)?
            + &xt_diag_y_dense(xmu, &(&rows.w * &geom.dq_dq0), &basis_u)?
            + &xt_diag_y_dense(xmu, &c_u, &geom.basis_d1)?
            + &xt_diag_y_dense(xmu, &(-&rows.m), &basis1_u)?;
        // (log σ, wiggle): both row weights are zero vectors; the block is
        // still assembled through the same helpers as the other blocks.
        let h_lw = xt_diag_y_dense(x_ls, &l_u, &geom.basis)?
            + &xt_diag_y_dense(x_ls, &zeros_ls_b1, &basis_u)?;
        // (wiggle, wiggle): symmetrised basis-shift term plus the weight
        // change B^T diag(dw_u) B.
        let a_ww = xt_diag_y_dense(&basis_u, &rows.w, &geom.basis)?;
        let h_ww = &a_ww + &a_ww.t() + &xt_diag_x_dense(&geom.basis, &dw_u)?;
        Ok(Some(gaussian_pack_wiggle_joint_symmetrichessian(
            &h_mm, &h_ml, &h_mw, &h_ll, &h_lw, &h_ww,
        )))
    }

    /// Build a matrix-free `RowCoeffOperator` for the GLS Wiggle joint
    /// directional derivative `D_β H_L[u]`. Output dimension is
    /// `pmu + p_ls + pw`. Channels (in order): X_mu, X_ls, B, B', B''.
    ///
    /// The row coefficients are the same quantities the dense
    /// `exact_newton_joint_hessian_directional_derivative_from_designs`
    /// builds. Wherever the dense path multiplies by a row-scaled basis
    /// (`diag(ξ)·B'`, `diag(ξ)·B''`), the `ξ` factor is folded into the pair
    /// coefficient here, so only the unscaled bases are stored as channels.
    ///
    /// # Errors
    /// Returns a `GamlssError::DimensionMismatch` (as `String`) when
    /// `block_states` does not hold exactly 3 blocks or when a predictor /
    /// weight length differs from `self.y.len()`. Errors from `split_three`,
    /// `wiggle_geometry` and `get_or_compute_row_scalars` are propagated.
    ///
    /// On success the result is always `Some`.
    fn gls_wiggle_directional_operator(
        &self,
        block_states: &[ParameterBlockState],
        xmu_arc: Arc<Array2<f64>>,
        x_ls_arc: Arc<Array2<f64>>,
        d_beta_flat: &Array1<f64>,
    ) -> Result<Option<Arc<dyn crate::solver::estimate::reml::unified::HyperOperator>>, String>
    {
        if block_states.len() != 3 {
            return Err(GamlssError::DimensionMismatch {
                reason: format!(
                    "GaussianLocationScaleWiggleFamily expects 3 blocks, got {}",
                    block_states.len()
                ),
            }
            .into());
        }
        let pmu = xmu_arc.ncols();
        let p_ls = x_ls_arc.ncols();
        let q0_eta = &block_states[Self::BLOCK_MU].eta;
        let eta_ls = &block_states[Self::BLOCK_LOG_SIGMA].eta;
        let etaw = &block_states[Self::BLOCK_WIGGLE].eta;
        let betaw = &block_states[Self::BLOCK_WIGGLE].beta;
        let n = self.y.len();
        // Split the flat direction into its mu / log σ / wiggle sub-vectors.
        let layout = GamlssBetaLayout::withwiggle(pmu, p_ls, betaw.len());
        let (umu, u_ls, uw) =
            layout.split_three(d_beta_flat, "GLS Wiggle joint dH operator d_beta")?;
        if q0_eta.len() != n || eta_ls.len() != n || etaw.len() != n || self.weights.len() != n {
            return Err(GamlssError::DimensionMismatch {
                reason: "GaussianLocationScaleWiggleFamily input size mismatch".to_string(),
            }
            .into());
        }
        let q = q0_eta + etaw;
        let geom = self.wiggle_geometry(q0_eta.view(), betaw.view())?;
        let rows = self.get_or_compute_row_scalars(&q, eta_ls)?;
        // Row-wise images of the direction: xi = X_mu·u_mu, zeta = X_ls·u_ls,
        // phi = B·u_w; szeta is zeta scaled row-wise by κ.
        let xi = fast_av(xmu_arc.as_ref(), &umu);
        let zeta = fast_av(x_ls_arc.as_ref(), &u_ls);
        let szeta = &rows.kappa * &zeta;
        let phi = fast_av(&geom.basis, &uw);
        // q_u / s1_u / g2_u pair each q0-derivative of q (orders 1, 2, 3)
        // scaled by xi with the matching basis (B, B', B'') applied to u_w.
        let mut q_u = &geom.dq_dq0 * &xi;
        q_u += &phi;
        let mut s1_u = &geom.d2q_dq02 * &xi;
        s1_u += &fast_av(&geom.basis_d1, &uw);
        let mut g2_u = &geom.d3q_dq03 * &xi;
        g2_u += &fast_av(&geom.basis_d2, &uw);
        // Directional changes of the row scalars w and m.
        let dw_u = -2.0 * &rows.w * &szeta;
        let dm_u = -(&rows.w * &q_u) - &(2.0 * &rows.m * &szeta);

        // Derivative of the static (mu, mu) weight w·dq_dq0² − m·d2q_dq02.
        let coeff_mm_u = &(&dw_u * &geom.dq_dq0.mapv(|v| v * v))
            + &(2.0 * &rows.w * &geom.dq_dq0 * &s1_u)
            - &(&dm_u * &geom.d2q_dq02)
            - &(&rows.m * &g2_u);
        // H_{μ,ls} ≡ Fisher 0 (mean⊥scale); its directional derivative is 0.
        let coeff_ml_u = Array1::<f64>::zeros(n);
        // Fisher (ls,ls) 2κ²a directional derivative: 4κκ'a·zeta (#566).
        let coeff_ll_u = 4.0 * &rows.kappa * &rows.kappa_prime * &(&zeta * &rows.obs_weight);
        let a_u = &dw_u * &geom.dq_dq0 + &rows.w * &s1_u;
        let c_u = -&dm_u;
        // H_{ls,w} ≡ Fisher 0 (wiggle is mean-side); its derivative is 0 in
        // both the B channel (l_u) and the B' channel (coeff_ls_b1).
        let l_u = Array1::<f64>::zeros(n);

        // Pair-coefficient bundles. For (0=X_mu, 3=B'): combine
        // `xt_diag_y_dense(xmu, &(w·dq_dq0), &basis_u=diag(xi)·B')`
        // (giving coeff `w·dq_dq0·xi`) with `xt_diag_y_dense(xmu, &c_u, &B')`
        // (coeff `c_u`).
        let coeff_m_b1 = &(&rows.w * &geom.dq_dq0 * &xi) + &c_u;
        // (0=X_mu, 4=B''): from `xt_diag_y_dense(xmu, &(-m), &basis1_u=diag(xi)·B'')`.
        let coeff_m_b2 = -(&rows.m * &xi);
        // (1=X_ls, 3=B'): ls↔wiggle Fisher-0 cross → zero.
        let coeff_ls_b1 = Array1::<f64>::zeros(n);
        // (2=B, 3=B'): a_ww + a_ww^T where a_ww = (diag(xi)·B')^T diag(w) B
        // = B'^T diag(w·xi) B. The symmetric pair contribution in
        // `RowCoeffOperator` reproduces a_ww + a_ww^T with c = w·xi.
        let coeff_b_b1 = &rows.w * &xi;

        // The operator takes shared ownership of the basis matrices, so each
        // one is cloned into its own `Arc`.
        let basis: Arc<Array2<f64>> = Arc::new(geom.basis.clone());
        let basis_d1: Arc<Array2<f64>> = Arc::new(geom.basis_d1.clone());
        let basis_d2: Arc<Array2<f64>> = Arc::new(geom.basis_d2.clone());
        let pw = basis.ncols();

        // Arguments: block widths, channel list, pair list, row count.
        // NOTE(review): the first element of each channel tuple looks like the
        // index of the block the channel belongs to (all wiggle bases share
        // block 2) — confirm against `RowCoeffOperator::from_directions`.
        Ok(Some(Arc::new(RowCoeffOperator::from_directions(
            vec![pmu, p_ls, pw],
            vec![
                (0, xmu_arc),
                (1, x_ls_arc),
                (2, basis),
                (2, basis_d1),
                (2, basis_d2),
            ],
            vec![
                // (X_mu, X_mu) ← `xt_diag_x_dense(xmu, &coeff_mm_u)`
                (0, 0, coeff_mm_u),
                // (X_mu, X_ls) ← `xt_diag_y_dense(xmu, &coeff_ml_u, x_ls)`
                (0, 1, coeff_ml_u),
                // (X_ls, X_ls) ← `xt_diag_x_dense(x_ls, &coeff_ll_u)`
                (1, 1, coeff_ll_u),
                // (X_mu, B) ← `xt_diag_y_dense(xmu, &a_u, &geom.basis)`
                (0, 2, a_u),
                // (X_mu, B') ← `xt_diag_y_dense(xmu, w·dq_dq0, basis_u=diag(ξ)·B') + xt_diag_y_dense(xmu, c_u, B')`
                (0, 3, coeff_m_b1),
                // (X_mu, B'') ← `xt_diag_y_dense(xmu, -m, basis1_u=diag(ξ)·B'')`
                (0, 4, coeff_m_b2),
                // (X_ls, B) ← `xt_diag_y_dense(x_ls, &l_u, &geom.basis)`
                (1, 2, l_u),
                // (X_ls, B') ← ls↔wiggle is mean⊥scale Fisher 0, so coeff_ls_b1 = 0
                (1, 3, coeff_ls_b1),
                // (B, B) ← `xt_diag_x_dense(&geom.basis, &dw_u)`
                (2, 2, dw_u),
                // (B, B') ← a_ww + a_ww^T = B^T diag(w·ξ) B' + B'^T diag(w·ξ) B
                (2, 3, coeff_b_b1),
            ],
            n,
        ))))
    }

    /// Build a matrix-free `RowCoeffOperator` for the GLS Wiggle joint
    /// second directional derivative `D²_β H_L[u, v]`. Channels: X_mu,
    /// X_ls, B, B', B'', B'''. Pair list mirrors the 8-term `xt_diag_*`
    /// assembly in `_from_designs`, with row-coefficient bundles that
    /// absorb the `ξ_u, ξ_v, ξ_u·ξ_v` row factors arising from
    /// `basis_u = diag(ξ_u)·B'`, `basis_uv = diag(ξ_u·ξ_v)·B''`, etc.
    ///
    /// The underlying row coefficients come from
    /// `gls_wiggle_second_directional_coeffs`, the same helper the dense
    /// second-order routine calls.
    ///
    /// # Errors
    /// Returns a `GamlssError::DimensionMismatch` (as `String`) when
    /// `block_states` does not hold exactly 3 blocks or when a predictor /
    /// weight length differs from `self.y.len()`. Errors from `split_three`
    /// (for either direction), `wiggle_geometry` and
    /// `get_or_compute_row_scalars` are propagated.
    ///
    /// On success the result is always `Some`.
    fn gls_wiggle_second_directional_operator(
        &self,
        block_states: &[ParameterBlockState],
        xmu_arc: Arc<Array2<f64>>,
        x_ls_arc: Arc<Array2<f64>>,
        d_beta_u: &Array1<f64>,
        d_beta_v: &Array1<f64>,
    ) -> Result<Option<Arc<dyn crate::solver::estimate::reml::unified::HyperOperator>>, String>
    {
        if block_states.len() != 3 {
            return Err(GamlssError::DimensionMismatch {
                reason: format!(
                    "GaussianLocationScaleWiggleFamily expects 3 blocks, got {}",
                    block_states.len()
                ),
            }
            .into());
        }
        let pmu = xmu_arc.ncols();
        let p_ls = x_ls_arc.ncols();
        let q0_eta = &block_states[Self::BLOCK_MU].eta;
        let eta_ls = &block_states[Self::BLOCK_LOG_SIGMA].eta;
        let etaw = &block_states[Self::BLOCK_WIGGLE].eta;
        let betaw = &block_states[Self::BLOCK_WIGGLE].beta;
        let n = self.y.len();
        // Both directions are split with the same (mu, log σ, wiggle) layout.
        let layout = GamlssBetaLayout::withwiggle(pmu, p_ls, betaw.len());
        let (umu, u_ls, uw) = layout.split_three(d_beta_u, "GLS Wiggle d2H operator (u)")?;
        let (vmu, v_ls, vw) = layout.split_three(d_beta_v, "GLS Wiggle d2H operator (v)")?;
        if q0_eta.len() != n || eta_ls.len() != n || etaw.len() != n || self.weights.len() != n {
            return Err(GamlssError::DimensionMismatch {
                reason: "GaussianLocationScaleWiggleFamily input size mismatch".to_string(),
            }
            .into());
        }
        let q = q0_eta + etaw;
        let geom = self.wiggle_geometry(q0_eta.view(), betaw.view())?;
        let rows = self.get_or_compute_row_scalars(&q, eta_ls)?;

        // Row-wise images of each direction under the designs and under the
        // wiggle basis and its first three derivatives.
        let xi_u = fast_av(xmu_arc.as_ref(), &umu);
        let xi_v = fast_av(xmu_arc.as_ref(), &vmu);
        let zeta_u = fast_av(x_ls_arc.as_ref(), &u_ls);
        let zeta_v = fast_av(x_ls_arc.as_ref(), &v_ls);
        let phi_u = fast_av(&geom.basis, &uw);
        let phi_v = fast_av(&geom.basis, &vw);
        let b1u = fast_av(&geom.basis_d1, &uw);
        let b1v = fast_av(&geom.basis_d1, &vw);
        let b2u = fast_av(&geom.basis_d2, &uw);
        let b2v = fast_av(&geom.basis_d2, &vw);
        let b3u = fast_av(&geom.basis_d3, &uw);
        let b3v = fast_av(&geom.basis_d3, &vw);

        // First-order pieces per direction: q0-derivative of order k times ξ
        // plus the order-(k−1) basis derivative applied to the wiggle part.
        let mut q_u = &geom.dq_dq0 * &xi_u;
        q_u += &phi_u;
        let mut q_v = &geom.dq_dq0 * &xi_v;
        q_v += &phi_v;
        let mut s1_u = &geom.d2q_dq02 * &xi_u;
        s1_u += &b1u;
        let mut s1_v = &geom.d2q_dq02 * &xi_v;
        s1_v += &b1v;
        let mut g2_u = &geom.d3q_dq03 * &xi_u;
        g2_u += &b2u;
        let mut g2_v = &geom.d3q_dq03 * &xi_v;
        g2_v += &b2v;
        // Mixed second-order pieces: next q0-derivative times ξ_u·ξ_v plus
        // the two cross terms (basis derivative on one direction, ξ of the
        // other).
        let q_uv = &(&geom.d2q_dq02 * &(&xi_u * &xi_v)) + &(&b1u * &xi_v) + &(&b1v * &xi_u);
        let s1_uv = &(&geom.d3q_dq03 * &(&xi_u * &xi_v)) + &(&b2u * &xi_v) + &(&b2v * &xi_u);
        let g2_uv = &(&geom.d4q_dq04 * &(&xi_u * &xi_v)) + &(&b3u * &xi_v) + &(&b3v * &xi_u);

        // Shared second-directional row coefficients (same helper as the
        // dense path).
        let GlsWiggleSecondDirCoeffs {
            coeff_mm_uv,
            coeff_ml_uv,
            coeff_ll_uv,
            a_u,
            a_v,
            a_uv,
            c_u,
            c_v,
            c_uv,
            l_u,
            l_v,
            l_uv,
            dw_u,
            dw_v,
            dw_uv,
        } = gls_wiggle_second_directional_coeffs(
            &rows,
            &geom,
            &GlsWiggleDirPieces {
                zeta_u: &zeta_u,
                zeta_v: &zeta_v,
                q_u: &q_u,
                q_v: &q_v,
                q_uv: &q_uv,
                s1_u: &s1_u,
                s1_v: &s1_v,
                s1_uv: &s1_uv,
                g2_u: &g2_u,
                g2_v: &g2_v,
                g2_uv: &g2_uv,
            },
        );

        // Pair-coefficient bundles. Cross-block (mu, B'/B'') absorb basis_u/v/uv row scaling.
        let xi_u_xi_v = &xi_u * &xi_v;
        let coeff_m_b1 = &(&a_u * &xi_v) + &(&a_v * &xi_u) + &c_uv;
        let coeff_m_b2 = &(&rows.w * &geom.dq_dq0 * &xi_u_xi_v) + &(&c_u * &xi_v) + &(&c_v * &xi_u);
        let coeff_m_b3 = -(&rows.m * &xi_u_xi_v);
        // ls↔wiggle is Fisher-0 (mean⊥scale): the B' (coeff_ls_b1) and B''
        // (coeff_ls_b2) channels of its second directional derivative vanish.
        let coeff_ls_b1 = &(&l_u * &xi_v) + &(&l_v * &xi_u);
        let coeff_ls_b2 = Array1::<f64>::zeros(n);
        // Wiggle-wiggle from a_ab + a_ab^T + a_ij + a_ij^T + a_iwj + a_iwj^T + a_jwi + a_jwi^T:
        //   a_ab = B''^T diag(w·ξ_uξ_v) B    → pair (B, B'', w·ξ_uξ_v)
        //   a_ij = B'^T diag(w·ξ_uξ_v) B'   → pair (B', B', 2·w·ξ_uξ_v)  (a_ij + a_ij^T)
        //   a_iwj+a_jwi = B'^T diag(dw_v·ξ_u + dw_u·ξ_v) B → pair (B, B', sum)
        let coeff_b_b1 = &(&dw_u * &xi_v) + &(&dw_v * &xi_u);
        let coeff_b_b2 = &rows.w * &xi_u_xi_v;
        let coeff_b1_b1 = 2.0 * &(&rows.w * &xi_u_xi_v);

        // The operator takes shared ownership of the basis matrices, so each
        // one is cloned into its own `Arc`.
        let basis: Arc<Array2<f64>> = Arc::new(geom.basis.clone());
        let basis_d1: Arc<Array2<f64>> = Arc::new(geom.basis_d1.clone());
        let basis_d2: Arc<Array2<f64>> = Arc::new(geom.basis_d2.clone());
        let basis_d3: Arc<Array2<f64>> = Arc::new(geom.basis_d3.clone());
        let pw = basis.ncols();

        // Arguments: block widths, channel list, pair list, row count.
        Ok(Some(Arc::new(RowCoeffOperator::from_directions(
            vec![pmu, p_ls, pw],
            vec![
                (0, xmu_arc),
                (1, x_ls_arc),
                (2, basis),
                (2, basis_d1),
                (2, basis_d2),
                (2, basis_d3),
            ],
            vec![
                // (X_mu, X_mu) ← `xt_diag_x_dense(xmu, &coeff_mm_uv)`
                (0, 0, coeff_mm_uv),
                // (X_mu, X_ls) ← `xt_diag_y_dense(xmu, &coeff_ml_uv, x_ls)`
                (0, 1, coeff_ml_uv),
                // (X_ls, X_ls) ← `xt_diag_x_dense(x_ls, &coeff_ll_uv)`
                (1, 1, coeff_ll_uv),
                // (X_mu, B) ← `xt_diag_y_dense(xmu, &a_uv, &geom.basis)`
                (0, 2, a_uv),
                // (X_mu, B') ← combined `a_u·ξ_v + a_v·ξ_u + c_uv` from
                // `xt_diag_y_dense(xmu, a_u, basis_v) + xt_diag_y_dense(xmu,
                // a_v, basis_u) + xt_diag_y_dense(xmu, c_uv, B')`
                (0, 3, coeff_m_b1),
                // (X_mu, B'') ← `xt_diag_y_dense(xmu, w·dq_dq0, basis_uv) +
                // xt_diag_y_dense(xmu, c_u, basis1_v) + xt_diag_y_dense(xmu,
                // c_v, basis1_u)` (basis_uv = diag(ξ_uξ_v)·B'';
                // basis1_{u,v} = diag(ξ_{u,v})·B'')
                (0, 4, coeff_m_b2),
                // (X_mu, B''') ← `xt_diag_y_dense(xmu, -m, basis1_uv)`
                // with basis1_uv = diag(ξ_uξ_v)·B'''
                (0, 5, coeff_m_b3),
                // (X_ls, B) ← `xt_diag_y_dense(x_ls, &l_uv, &geom.basis)`
                (1, 2, l_uv),
                // (X_ls, B') ← combined from `xt_diag_y_dense(x_ls, l_u,
                // basis_v) + xt_diag_y_dense(x_ls, l_v, basis_u)` =
                // `l_u·ξ_v + l_v·ξ_u`
                (1, 3, coeff_ls_b1),
                // (X_ls, B'') ← ls↔wiggle is mean⊥scale Fisher 0, so coeff_ls_b2 = 0
                (1, 4, coeff_ls_b2),
                // (B, B) ← `xt_diag_x_dense(&geom.basis, &dw_uv)`
                (2, 2, dw_uv),
                // (B, B') ← combined `a_iwj + a_iwj^T + a_jwi + a_jwi^T` =
                // B^T diag(dw_u·ξ_v + dw_v·ξ_u) B' + B'^T diag(...) B
                (2, 3, coeff_b_b1),
                // (B, B'') ← `a_ab + a_ab^T` with a_ab = B''^T diag(w·ξ_uξ_v) B
                (2, 4, coeff_b_b2),
                // (B', B') ← `a_ij + a_ij^T = 2·B'^T diag(w·ξ_uξ_v) B'`;
                // diagonal pair coeff doubles to absorb the factor of 2
                (3, 3, coeff_b1_b1),
            ],
            n,
        ))))
    }

    /// Dense second directional derivative of the joint Hessian along the
    /// pair `(d_beta_u_flat, d_beta_v_flat)`, assembled block-by-block from
    /// the supplied designs.
    ///
    /// Both directions are split into (mu, log σ, wiggle) pieces with the
    /// same layout. The row coefficients come from
    /// `gls_wiggle_second_directional_coeffs`, shared with the matrix-free
    /// `gls_wiggle_second_directional_operator`. The six blocks are then
    /// formed with the `xt_diag_*` helpers and packed by
    /// `gaussian_pack_wiggle_joint_symmetrichessian`.
    ///
    /// # Errors
    /// Returns a `GamlssError::DimensionMismatch` (as `String`) when
    /// `block_states` does not hold exactly 3 blocks or when a predictor /
    /// weight length differs from `self.y.len()`. Errors reported by
    /// `split_three`, `wiggle_geometry`, `get_or_compute_row_scalars`,
    /// `scale_matrix_rows` and the `xt_diag_*` helpers are propagated.
    ///
    /// On success the result is always `Some`.
    fn exact_newton_joint_hessiansecond_directional_derivative_from_designs(
        &self,
        block_states: &[ParameterBlockState],
        xmu: &Array2<f64>,
        x_ls: &Array2<f64>,
        d_beta_u_flat: &Array1<f64>,
        d_beta_v_flat: &Array1<f64>,
    ) -> Result<Option<Array2<f64>>, String> {
        if block_states.len() != 3 {
            return Err(GamlssError::DimensionMismatch {
                reason: format!(
                    "GaussianLocationScaleWiggleFamily expects 3 blocks, got {}",
                    block_states.len()
                ),
            }
            .into());
        }
        let pmu = xmu.ncols();
        let p_ls = x_ls.ncols();
        let q0 = &block_states[Self::BLOCK_MU].eta;
        let eta_ls = &block_states[Self::BLOCK_LOG_SIGMA].eta;
        let etaw = &block_states[Self::BLOCK_WIGGLE].eta;
        let betaw = &block_states[Self::BLOCK_WIGGLE].beta;
        let n = self.y.len();
        // Both directions are split with the same (mu, log σ, wiggle) layout.
        let layout = GamlssBetaLayout::withwiggle(pmu, p_ls, betaw.len());
        let (umu, u_ls, uw) = layout.split_three(
            d_beta_u_flat,
            "GaussianLocationScaleWiggleFamily exact joint second directional Hessian (u)",
        )?;
        let (vmu, v_ls, vw) = layout.split_three(
            d_beta_v_flat,
            "GaussianLocationScaleWiggleFamily exact joint second directional Hessian (v)",
        )?;
        if q0.len() != n || eta_ls.len() != n || etaw.len() != n || self.weights.len() != n {
            return Err(GamlssError::DimensionMismatch {
                reason: "GaussianLocationScaleWiggleFamily input size mismatch".to_string(),
            }
            .into());
        }
        let q = q0 + etaw;
        let geom = self.wiggle_geometry(q0.view(), betaw.view())?;
        let rows = self.get_or_compute_row_scalars(&q, eta_ls)?;

        // Row-wise images of each direction under the designs and under the
        // wiggle basis and its first three derivatives.
        let xi_u = fast_av(xmu, &umu);
        let xi_v = fast_av(xmu, &vmu);
        let zeta_u = fast_av(x_ls, &u_ls);
        let zeta_v = fast_av(x_ls, &v_ls);
        let phi_u = fast_av(&geom.basis, &uw);
        let phi_v = fast_av(&geom.basis, &vw);
        let b1u = fast_av(&geom.basis_d1, &uw);
        let b1v = fast_av(&geom.basis_d1, &vw);
        let b2u = fast_av(&geom.basis_d2, &uw);
        let b2v = fast_av(&geom.basis_d2, &vw);
        let b3u = fast_av(&geom.basis_d3, &uw);
        let b3v = fast_av(&geom.basis_d3, &vw);

        // First-order pieces per direction: q0-derivative of order k times ξ
        // plus the order-(k−1) basis derivative applied to the wiggle part.
        let mut q_u = &geom.dq_dq0 * &xi_u;
        q_u += &phi_u;
        let mut q_v = &geom.dq_dq0 * &xi_v;
        q_v += &phi_v;
        let mut s1_u = &geom.d2q_dq02 * &xi_u;
        s1_u += &b1u;
        let mut s1_v = &geom.d2q_dq02 * &xi_v;
        s1_v += &b1v;
        let mut g2_u = &geom.d3q_dq03 * &xi_u;
        g2_u += &b2u;
        let mut g2_v = &geom.d3q_dq03 * &xi_v;
        g2_v += &b2v;
        // Mixed second-order pieces: next q0-derivative times ξ_u·ξ_v plus
        // the two cross terms (basis derivative on one direction, ξ of the
        // other).
        let q_uv = &(&geom.d2q_dq02 * &(&xi_u * &xi_v)) + &(&b1u * &xi_v) + &(&b1v * &xi_u);
        let s1_uv = &(&geom.d3q_dq03 * &(&xi_u * &xi_v)) + &(&b2u * &xi_v) + &(&b2v * &xi_u);
        let g2_uv = &(&geom.d4q_dq04 * &(&xi_u * &xi_v)) + &(&b3u * &xi_v) + &(&b3v * &xi_u);

        // Row-scaled bases: basis_{u,v} = diag(ξ_{u,v})·B',
        // basis_uv = diag(ξ_u·ξ_v)·B'', basis1_{u,v} = diag(ξ_{u,v})·B'',
        // basis1_uv = diag(ξ_u·ξ_v)·B'''.
        let basis_u = scale_matrix_rows(&geom.basis_d1, &xi_u)?;
        let basis_v = scale_matrix_rows(&geom.basis_d1, &xi_v)?;
        let basis_uv = scale_matrix_rows(&geom.basis_d2, &(&xi_u * &xi_v))?;
        let basis1_u = scale_matrix_rows(&geom.basis_d2, &xi_u)?;
        let basis1_v = scale_matrix_rows(&geom.basis_d2, &xi_v)?;
        let basis1_uv = scale_matrix_rows(&geom.basis_d3, &(&xi_u * &xi_v))?;

        // Shared κ-aware second-directional row coefficients (κ' = κ(1−κ),
        // κ'' = κ(1−κ)(1−2κ), κ''' = κ''(1−2κ) − 2(κ')²): identical to the
        // matrix-free operator path, factored into one helper.
        let GlsWiggleSecondDirCoeffs {
            coeff_mm_uv,
            coeff_ml_uv,
            coeff_ll_uv,
            a_u,
            a_v,
            a_uv,
            c_u,
            c_v,
            c_uv,
            l_u,
            l_v,
            l_uv,
            dw_u,
            dw_v,
            dw_uv,
        } = gls_wiggle_second_directional_coeffs(
            &rows,
            &geom,
            &GlsWiggleDirPieces {
                zeta_u: &zeta_u,
                zeta_v: &zeta_v,
                q_u: &q_u,
                q_v: &q_v,
                q_uv: &q_uv,
                s1_u: &s1_u,
                s1_v: &s1_v,
                s1_uv: &s1_uv,
                g2_u: &g2_u,
                g2_v: &g2_v,
                g2_uv: &g2_uv,
            },
        );

        let h_mm = xt_diag_x_dense(xmu, &coeff_mm_uv)?;
        let h_ml = xt_diag_y_dense(xmu, &coeff_ml_uv, x_ls)?;
        let h_ll = xt_diag_x_dense(x_ls, &coeff_ll_uv)?;
        // (mu, wiggle): the 8-term product-rule expansion over the row
        // weights (a_*, c_*, w·dq_dq0, −m) and the row-scaled bases.
        let h_mw = xt_diag_y_dense(xmu, &a_uv, &geom.basis)?
            + &xt_diag_y_dense(xmu, &a_u, &basis_v)?
            + &xt_diag_y_dense(xmu, &a_v, &basis_u)?
            + &xt_diag_y_dense(xmu, &(&rows.w * &geom.dq_dq0), &basis_uv)?
            + &xt_diag_y_dense(xmu, &c_uv, &geom.basis_d1)?
            + &xt_diag_y_dense(xmu, &c_u, &basis1_v)?
            + &xt_diag_y_dense(xmu, &c_v, &basis1_u)?
            + &xt_diag_y_dense(xmu, &(-&rows.m), &basis1_uv)?;
        // H_{ls,w} ≡ Fisher 0 (mean⊥scale): l_uv/l_u/l_v are 0 (shared helper)
        // and the 2κm·B'' channel vanishes too.
        let zeros_ls_b2 = Array1::<f64>::zeros(n);
        let h_lw = xt_diag_y_dense(x_ls, &l_uv, &geom.basis)?
            + &xt_diag_y_dense(x_ls, &l_u, &basis_v)?
            + &xt_diag_y_dense(x_ls, &l_v, &basis_u)?
            + &xt_diag_y_dense(x_ls, &zeros_ls_b2, &basis_uv)?;
        // (wiggle, wiggle): each a_* term is added together with its
        // transpose, then the weight-change term B^T diag(dw_uv) B.
        let a_ab = xt_diag_y_dense(&basis_uv, &rows.w, &geom.basis)?;
        let a_ij = xt_diag_y_dense(&basis_u, &rows.w, &basis_v)?;
        let a_iwj = xt_diag_y_dense(&basis_u, &dw_v, &geom.basis)?;
        let a_jwi = xt_diag_y_dense(&basis_v, &dw_u, &geom.basis)?;
        let h_ww = &a_ab
            + &a_ab.t()
            + &a_ij
            + a_ij.t()
            + &a_iwj
            + a_iwj.t()
            + &a_jwi
            + a_jwi.t()
            + &xt_diag_x_dense(&geom.basis, &dw_uv)?;
        Ok(Some(gaussian_pack_wiggle_joint_symmetrichessian(
            &h_mm, &h_ml, &h_mw, &h_ll, &h_lw, &h_ww,
        )))
    }

    /// Builds the first-order ψ-derivative terms of the joint (μ, log σ, wiggle)
    /// objective for the hyperparameter selected by `psi_index`.
    ///
    /// The ψ-direction (design-derivative maps `x_primary_psi` / `x_ls_psi` plus
    /// the row-wise drifts `z_primary_psi` / `z_ls_psi`) is obtained from
    /// `exact_newton_joint_psi_direction`; when that lookup yields `None`, this
    /// method returns `Ok(None)` too.
    ///
    /// On success the returned `ExactNewtonJointPsiTerms` carries:
    /// * `objective_psi` – the row-summed ψ-derivative of the objective,
    /// * `score_psi` – the ψ-derivative of the joint score, assembled by
    ///   `gaussian_pack_wiggle_joint_score` from the μ, log σ and wiggle pieces,
    /// * `hessian_psi` – the ψ-derivative of the joint curvature, assembled by
    ///   `gaussian_pack_wiggle_joint_symmetrichessian` from the six blocks,
    /// * `hessian_psi_operator` – always `None` in this method.
    ///
    /// # Errors
    /// Propagates any `Err(String)` raised by the direction lookup, the wiggle
    /// geometry, the row-scalar computation, or the dense cross-product helpers.
    fn exact_newton_joint_psi_terms_from_designs(
        &self,
        block_states: &[ParameterBlockState],
        derivative_blocks: &[Vec<crate::custom_family::CustomFamilyBlockPsiDerivative>],
        psi_index: usize,
        xmu: &Array2<f64>,
        x_ls: &Array2<f64>,
    ) -> Result<Option<crate::custom_family::ExactNewtonJointPsiTerms>, String> {
        let Some(dir_a) = self.exact_newton_joint_psi_direction(
            block_states,
            derivative_blocks,
            psi_index,
            xmu,
            x_ls,
            &self.policy,
        )?
        else {
            return Ok(None);
        };
        let q0 = &block_states[Self::BLOCK_MU].eta;
        let eta_ls = &block_states[Self::BLOCK_LOG_SIGMA].eta;
        let etaw = &block_states[Self::BLOCK_WIGGLE].eta;
        let betaw = &block_states[Self::BLOCK_WIGGLE].beta;
        // Full mean-side predictor: the μ block predictor plus the wiggle block
        // predictor.
        let q = q0 + etaw;
        let geom = self.wiggle_geometry(q0.view(), betaw.view())?;
        let rows = self.get_or_compute_row_scalars(&q, eta_ls)?;
        let xmu_map = dir_a.x_primary_psi.as_linear_map_ref();
        let x_ls_map = dir_a.x_ls_psi.as_linear_map_ref();

        // Chain rule through q0: the ψ-drift of q0 (`z_primary_psi`) scales each
        // q0-derivative of the geometry and each q0-derivative of the wiggle
        // basis row-wise.
        let q_a = &geom.dq_dq0 * &dir_a.z_primary_psi;
        let s1_a = &geom.d2q_dq02 * &dir_a.z_primary_psi;
        let g2_a = &geom.d3q_dq03 * &dir_a.z_primary_psi;
        let basis_a = scale_matrix_rows(&geom.basis_d1, &dir_a.z_primary_psi)?;
        let basis1_a = scale_matrix_rows(&geom.basis_d2, &dir_a.z_primary_psi)?;
        // logb κ-chain on η_ls; e_a = ∂η_ls/∂ψ_a row-direction.
        let e_a = &dir_a.z_ls_psi;
        let amn = &rows.obs_weight - &rows.n;
        // ψ-drifts of the row scalars w, m, n (q-direction via q_a, η_ls-direction
        // via κ·e_a).
        let dw_a = -2.0 * &rows.w * &rows.kappa * e_a;
        let dm_a = -(&rows.w * &q_a) - &(2.0 * &rows.m * &rows.kappa * e_a);
        let dn_a = -(2.0 * &rows.m * &q_a) - &(2.0 * &rows.n * &rows.kappa * e_a);
        // Row-wise score multipliers per block (μ, log σ, wiggle) and their
        // ψ-drifts.
        let s_mu = -&rows.m * &geom.dq_dq0;
        let s_mu_a = -(&dm_a * &geom.dq_dq0) - &(&rows.m * &s1_a);
        let s_ls = &rows.kappa * &amn;
        let s_ls_a = &rows.kappa_prime * &(e_a * &amn) - &rows.kappa * &dn_a;
        let s_w = -&rows.m;
        let s_w_a = -&dm_a;

        let objective_psi = (-&rows.m * &q_a + &s_ls * e_a).sum();
        // Product rule on (design)ᵀ·(row score): drifted design applied to the
        // static multiplier plus static design applied to the drifted multiplier.
        let score_psi = gaussian_pack_wiggle_joint_score(
            &(xmu_map.transpose_mul(s_mu.view()) + fast_atv(xmu, &s_mu_a)),
            &(x_ls_map.transpose_mul(s_ls.view()) + fast_atv(x_ls, &s_ls_a)),
            &(fast_atv(&basis_a, &s_w) + fast_atv(&geom.basis, &s_w_a)),
        );

        // Static blocks under logb. Gaussian mean⊥scale Fisher orthogonality:
        // μ AND the wiggle both enter the MEAN q = q0 + B(q0)·βw, so log σ is
        // the only scale-side block. The Fisher (expected) cross between any
        // mean-side parameter and log σ is exactly 0 because it carries
        // m = r·weight/σ² and E[m] = E[r]·weight/σ² = 0:
        //   coeff_ml = E[H_{μ,ls}] = 0  (observed 2κmD)
        //   l        = E[H_{ls,w}] = 0  (observed 2κm)
        // A function identically 0 has 0 ψ-derivatives, so coeff_ml_a and l_a
        // vanish too. This mirrors the non-wiggle psi path
        // (gaussian_joint_psi_firstweights: hmu_ls = dhmu_ls = 0) and the
        // wiggle Newton/REML Hessian path (wiggle_hessian_row_pieces:
        // coeff_ml = coeff_lw_b = 0). The observed SCORE (s_mu/s_ls/s_w above)
        // stays exact so Fisher scoring still hits the joint MLE; only the
        // curvature feeding the REML determinant / IFT correction is the
        // (orthogonal) expectation. coeff_ll is the residual-free Fisher
        // 2κ²a (#566); its ψ-derivative coeff_ll_a = 4κκ'a·e_a depends only on
        // η_ls. Same-side blocks (coeff_mm within mean, a/c the μ↔wiggle
        // within-mean cross, coeff_ww within mean) are untouched.
        let n = rows.m.len();
        let coeff_mm = &rows.w * &geom.dq_dq0.mapv(|v| v * v) - &rows.m * &geom.d2q_dq02;
        let coeff_mm_a = &(&dw_a * &geom.dq_dq0.mapv(|v| v * v))
            + &(2.0 * &rows.w * &geom.dq_dq0 * &s1_a)
            - &(&dm_a * &geom.d2q_dq02)
            - &(&rows.m * &g2_a);
        let coeff_ml = Array1::<f64>::zeros(n);
        let coeff_ml_a = Array1::<f64>::zeros(n);
        let coeff_ll = 2.0 * &rows.kappa * &rows.kappa * &rows.obs_weight;
        let coeff_ll_a = 4.0 * &rows.kappa * &rows.kappa_prime * &rows.obs_weight * e_a;
        let a = &rows.w * &geom.dq_dq0;
        let a_a = &dw_a * &geom.dq_dq0 + &rows.w * &s1_a;
        let c = -&rows.m;
        let c_a = -&dm_a;
        let l = Array1::<f64>::zeros(n);
        let l_a = Array1::<f64>::zeros(n);
        // Every Hessian block below is the ψ-derivative of Xᵀ diag(coeff) Y:
        // design-drift terms (through the ψ linear maps) plus the
        // coefficient-drift term on the static designs.
        // (μ,μ): the one-sided design-drift term is symmetrized with its transpose.
        let h_mm_a1 = weighted_crossprod_psi_maps(
            xmu_map,
            coeff_mm.view(),
            CustomFamilyPsiLinearMapRef::Dense(xmu),
        )?;
        let h_mm = &h_mm_a1 + &h_mm_a1.t() + &xt_diag_x_dense(xmu, &coeff_mm_a)?;
        // (μ,ls): all three product-rule terms are evaluated although their
        // coefficients are the zero vectors defined above.
        let h_ml = weighted_crossprod_psi_maps(
            xmu_map,
            coeff_ml.view(),
            CustomFamilyPsiLinearMapRef::Dense(x_ls),
        )? + &weighted_crossprod_psi_maps(
            CustomFamilyPsiLinearMapRef::Dense(xmu),
            coeff_ml.view(),
            x_ls_map,
        )? + &xt_diag_y_dense(xmu, &coeff_ml_a, x_ls)?;
        // (ls,ls): same pattern as (μ,μ) with the Fisher coefficient.
        let h_ll_a1 = weighted_crossprod_psi_maps(
            x_ls_map,
            coeff_ll.view(),
            CustomFamilyPsiLinearMapRef::Dense(x_ls),
        )?;
        let h_ll = &h_ll_a1 + &h_ll_a1.t() + &xt_diag_x_dense(x_ls, &coeff_ll_a)?;
        // (μ,w): two channels — `a` paired with the basis and `c` paired with
        // the first basis derivative — each contributing a design-drift,
        // a coefficient-drift and a basis-drift term.
        let h_mw = weighted_crossprod_psi_maps(
            xmu_map,
            a.view(),
            CustomFamilyPsiLinearMapRef::Dense(&geom.basis),
        )? + &xt_diag_y_dense(xmu, &a_a, &geom.basis)?
            + &xt_diag_y_dense(xmu, &a, &basis_a)?
            + &weighted_crossprod_psi_maps(
                xmu_map,
                c.view(),
                CustomFamilyPsiLinearMapRef::Dense(&geom.basis_d1),
            )?
            + &xt_diag_y_dense(xmu, &c_a, &geom.basis_d1)?
            + &xt_diag_y_dense(xmu, &c, &basis1_a)?;
        // (ls,w): evaluated with the zero coefficients `l` / `l_a`.
        let h_lw = weighted_crossprod_psi_maps(
            x_ls_map,
            l.view(),
            CustomFamilyPsiLinearMapRef::Dense(&geom.basis),
        )? + &xt_diag_y_dense(x_ls, &l_a, &geom.basis)?
            + &xt_diag_y_dense(x_ls, &l, &basis_a)?;
        // (w,w): basis-drift term (symmetrized) plus the weight-drift term.
        let h_ww_a1 = xt_diag_y_dense(&basis_a, &rows.w, &geom.basis)?;
        let h_ww = &h_ww_a1 + &h_ww_a1.t() + &xt_diag_x_dense(&geom.basis, &dw_a)?;

        Ok(Some(crate::custom_family::ExactNewtonJointPsiTerms {
            objective_psi,
            score_psi,
            hessian_psi: gaussian_pack_wiggle_joint_symmetrichessian(
                &h_mm, &h_ml, &h_mw, &h_ll, &h_lw, &h_ww,
            ),
            hessian_psi_operator: None,
        }))
    }

    /// Resolves the ψ-directions for `psi_i` and `psi_j` and delegates to
    /// `exact_newton_joint_psisecond_order_terms_from_parts`.
    ///
    /// Returns `Ok(None)` as soon as either direction is unavailable; the
    /// direction for `psi_j` is only requested once `psi_i` has resolved.
    ///
    /// # Errors
    /// Propagates any `Err(String)` from the direction lookups or from the
    /// second-order assembly.
    fn exact_newton_joint_psisecond_order_terms_from_designs(
        &self,
        block_states: &[ParameterBlockState],
        derivative_blocks: &[Vec<crate::custom_family::CustomFamilyBlockPsiDerivative>],
        psi_i: usize,
        psi_j: usize,
        xmu: &Array2<f64>,
        x_ls: &Array2<f64>,
    ) -> Result<Option<crate::custom_family::ExactNewtonJointPsiSecondOrderTerms>, String> {
        // First hyperparameter: no direction means no second-order terms.
        let first_direction = match self.exact_newton_joint_psi_direction(
            block_states,
            derivative_blocks,
            psi_i,
            xmu,
            x_ls,
            &self.policy,
        )? {
            Some(direction) => direction,
            None => return Ok(None),
        };
        // Second hyperparameter, resolved only after the first one succeeded.
        let second_direction = match self.exact_newton_joint_psi_direction(
            block_states,
            derivative_blocks,
            psi_j,
            xmu,
            x_ls,
            &self.policy,
        )? {
            Some(direction) => direction,
            None => return Ok(None),
        };
        self.exact_newton_joint_psisecond_order_terms_from_parts(
            block_states,
            derivative_blocks,
            &first_direction,
            &second_direction,
            xmu,
            x_ls,
        )
        .map(Some)
    }

    /// Assembles the second-order (ψ_a, ψ_b) derivative terms of the joint
    /// (μ, log σ, wiggle) objective from two already-resolved ψ-directions.
    ///
    /// `dir_a` / `dir_b` supply the first-order design maps (`x_primary_psi`,
    /// `x_ls_psi`) and row drifts (`z_primary_psi`, `z_ls_psi`). The mixed
    /// second-order design drifts are obtained from
    /// `exact_newton_joint_psisecond_design_drifts`.
    ///
    /// The returned `ExactNewtonJointPsiSecondOrderTerms` carries the row-summed
    /// objective second derivative, the score second derivative assembled by
    /// `gaussian_pack_wiggle_joint_score`, and the Hessian second derivative
    /// assembled by `gaussian_pack_wiggle_joint_symmetrichessian`;
    /// `hessian_psi_psi_operator` is always `None` in this method.
    ///
    /// # Errors
    /// Propagates any `Err(String)` raised by the second-order drift builder, the
    /// wiggle geometry, the row-scalar computation, or the dense cross-product
    /// helpers.
    fn exact_newton_joint_psisecond_order_terms_from_parts(
        &self,
        block_states: &[ParameterBlockState],
        derivative_blocks: &[Vec<crate::custom_family::CustomFamilyBlockPsiDerivative>],
        dir_a: &LocationScaleJointPsiDirection,
        dir_b: &LocationScaleJointPsiDirection,
        xmu: &Array2<f64>,
        x_ls: &Array2<f64>,
    ) -> Result<crate::custom_family::ExactNewtonJointPsiSecondOrderTerms, String> {
        let second_drifts = self.exact_newton_joint_psisecond_design_drifts(
            block_states,
            derivative_blocks,
            dir_a,
            dir_b,
            xmu,
            x_ls,
        )?;
        // Length of the response vector; forwarded with the column counts to
        // `second_psi_linear_map` below.
        let n = self.y.len();
        let xmu_a_map = dir_a.x_primary_psi.as_linear_map_ref();
        let x_ls_a_map = dir_a.x_ls_psi.as_linear_map_ref();
        let xmu_b_map = dir_b.x_primary_psi.as_linear_map_ref();
        let x_ls_b_map = dir_b.x_ls_psi.as_linear_map_ref();
        // Second-order design maps, built from the optional operator action and
        // the optional materialized matrix held in `second_drifts`
        // (NOTE(review): presumably whichever is present is used — confirm in
        // `second_psi_linear_map`).
        let xmu_ab_map = second_psi_linear_map(
            second_drifts.x_primary_ab_action.as_ref(),
            second_drifts.x_primary_ab.as_ref(),
            n,
            xmu.ncols(),
        );
        let x_ls_ab_map = second_psi_linear_map(
            second_drifts.x_ls_ab_action.as_ref(),
            second_drifts.x_ls_ab.as_ref(),
            n,
            x_ls.ncols(),
        );
        let q0 = &block_states[Self::BLOCK_MU].eta;
        let eta_ls = &block_states[Self::BLOCK_LOG_SIGMA].eta;
        let etaw = &block_states[Self::BLOCK_WIGGLE].eta;
        let betaw = &block_states[Self::BLOCK_WIGGLE].beta;
        // Full mean-side predictor: the μ block predictor plus the wiggle block
        // predictor.
        let q = q0 + etaw;
        let geom = self.wiggle_geometry(q0.view(), betaw.view())?;
        let rows = self.get_or_compute_row_scalars(&q, eta_ls)?;

        // First- and second-order ψ-drifts of q and of its q0-derivatives. Each
        // `_ab` quantity is (next-higher q0-derivative)·z_a·z_b plus
        // (same-order q0-derivative)·z_ab.
        let q_a = &geom.dq_dq0 * &dir_a.z_primary_psi;
        let q_b = &geom.dq_dq0 * &dir_b.z_primary_psi;
        let q_ab = &(&geom.dq_dq0 * &second_drifts.z_primary_ab)
            + &(&geom.d2q_dq02 * &(&dir_a.z_primary_psi * &dir_b.z_primary_psi));
        let s1_a = &geom.d2q_dq02 * &dir_a.z_primary_psi;
        let s1_b = &geom.d2q_dq02 * &dir_b.z_primary_psi;
        let s1_ab = &(&geom.d3q_dq03 * &(&dir_a.z_primary_psi * &dir_b.z_primary_psi))
            + &(&geom.d2q_dq02 * &second_drifts.z_primary_ab);
        let g2_a = &geom.d3q_dq03 * &dir_a.z_primary_psi;
        let g2_b = &geom.d3q_dq03 * &dir_b.z_primary_psi;
        let g2_ab = &(&geom.d4q_dq04 * &(&dir_a.z_primary_psi * &dir_b.z_primary_psi))
            + &(&geom.d3q_dq03 * &second_drifts.z_primary_ab);
        // The same row-wise chain rule applied to the wiggle basis (`basis_*`)
        // and to its first q0-derivative (`basis1_*`).
        let basis_a = scale_matrix_rows(&geom.basis_d1, &dir_a.z_primary_psi)?;
        let basis_b = scale_matrix_rows(&geom.basis_d1, &dir_b.z_primary_psi)?;
        let basis_ab = scale_matrix_rows(&geom.basis_d1, &second_drifts.z_primary_ab)?
            + &scale_matrix_rows(
                &geom.basis_d2,
                &(&dir_a.z_primary_psi * &dir_b.z_primary_psi),
            )?;
        let basis1_a = scale_matrix_rows(&geom.basis_d2, &dir_a.z_primary_psi)?;
        let basis1_b = scale_matrix_rows(&geom.basis_d2, &dir_b.z_primary_psi)?;
        let basis1_ab = scale_matrix_rows(&geom.basis_d2, &second_drifts.z_primary_ab)?
            + &scale_matrix_rows(
                &geom.basis_d3,
                &(&dir_a.z_primary_psi * &dir_b.z_primary_psi),
            )?;

        // logb κ-chain on η_ls; κ' = κ(1−κ), κ'' = κ(1−κ)(1−2κ),
        // κ''' = κ''(1−2κ) − 2(κ')².
        let e_a = &dir_a.z_ls_psi;
        let e_b = &dir_b.z_ls_psi;
        let e_ab = &second_drifts.z_ls_ab;
        let amn = &rows.obs_weight - &rows.n;
        // 4κ² − 2κ' (∂²w/∂η² style coefficient when both directions hit η_ls).
        let four_k2_minus_2kpi = 4.0 * &rows.kappa * &rows.kappa - 2.0 * &rows.kappa_prime;

        // Row drifts under logb. The η_ls direction picks up a κ on each step,
        // and η_ls·η_ls picks up (4κ²−2κ') from differentiating κ on the
        // second leg. The η_ab (z_ls_ab) leg uses just one κ from the chain.
        let dw_a = -2.0 * &rows.w * &rows.kappa * e_a;
        let dw_b = -2.0 * &rows.w * &rows.kappa * e_b;
        let dw_ab =
            &four_k2_minus_2kpi * &rows.w * &(e_a * e_b) - &(2.0 * &rows.w * &rows.kappa * e_ab);
        let dm_a = -(&rows.w * &q_a) - &(2.0 * &rows.m * &rows.kappa * e_a);
        let dm_b = -(&rows.w * &q_b) - &(2.0 * &rows.m * &rows.kappa * e_b);
        let dm_ab = &(2.0 * &rows.w * &rows.kappa * &(&q_a * e_b + &q_b * e_a))
            - &(&rows.w * &q_ab)
            + &(&four_k2_minus_2kpi * &rows.m * &(e_a * e_b))
            - &(2.0 * &rows.m * &rows.kappa * e_ab);
        let dn_a = -(2.0 * &rows.m * &q_a) - &(2.0 * &rows.n * &rows.kappa * e_a);
        let dn_b = -(2.0 * &rows.m * &q_b) - &(2.0 * &rows.n * &rows.kappa * e_b);
        let dn_ab = &(2.0 * &rows.w * &(&q_a * &q_b))
            + &(4.0 * &rows.m * &rows.kappa * &(&q_a * e_b + &q_b * e_a))
            - &(2.0 * &rows.m * &q_ab)
            + &(&four_k2_minus_2kpi * &rows.n * &(e_a * e_b))
            - &(2.0 * &rows.n * &rows.kappa * e_ab);

        // Row-wise score multipliers per block and their first/second ψ-drifts.
        let s_mu = -&rows.m * &geom.dq_dq0;
        let s_mu_a = -(&dm_a * &geom.dq_dq0) - &(&rows.m * &s1_a);
        let s_mu_b = -(&dm_b * &geom.dq_dq0) - &(&rows.m * &s1_b);
        let s_mu_ab =
            -(&dm_ab * &geom.dq_dq0) - &(&dm_a * &s1_b) - &(&dm_b * &s1_a) - &(&rows.m * &s1_ab);
        // score_ls = κ(a−n); ψ derivatives carry κ' / κ'' from chain on κ.
        let s_ls = &rows.kappa * &amn;
        let s_ls_a = &rows.kappa_prime * &(e_a * &amn) - &rows.kappa * &dn_a;
        let s_ls_b = &rows.kappa_prime * &(e_b * &amn) - &rows.kappa * &dn_b;
        // s_ls_ab = κ''·e_a·e_b·(a−n) + κ'·e_ab·(a−n)
        //         − κ'·(e_a·n_b + e_b·n_a) − κ·n_ab
        let s_ls_ab = &rows.kappa_dprime * &(e_a * e_b) * &amn + &rows.kappa_prime * e_ab * &amn
            - &rows.kappa_prime * &(e_a * &dn_b + e_b * &dn_a)
            - &rows.kappa * &dn_ab;
        let s_w = -&rows.m;
        let s_w_a = -&dm_a;
        let s_w_b = -&dm_b;
        let s_w_ab = -&dm_ab;

        // Row-summed mixed second derivative of the objective: the b-derivative
        // of the first-order row term (−m·q_a + κ(a−n)·e_a).
        let objective_psi_psi = (&rows.w * &(&q_a * &q_b)
            + &(2.0 * &rows.m * &rows.kappa * &(&q_a * e_b + &q_b * e_a))
            + &((2.0 * &rows.kappa * &rows.kappa * &rows.n + &rows.kappa_prime * &amn)
                * &(e_a * e_b))
            - &(&rows.m * &q_ab)
            + &(&rows.kappa * &amn * e_ab))
            .sum();

        // Product rule on (design)ᵀ·(row score) for each block: the four pairings
        // (X_ab, s), (X_a, s_b), (X_b, s_a), (X, s_ab).
        let score_psi_psi = gaussian_pack_wiggle_joint_score(
            &(xmu_ab_map.transpose_mul(s_mu.view())
                + xmu_a_map.transpose_mul(s_mu_b.view())
                + xmu_b_map.transpose_mul(s_mu_a.view())
                + fast_atv(xmu, &s_mu_ab)),
            &(x_ls_ab_map.transpose_mul(s_ls.view())
                + x_ls_a_map.transpose_mul(s_ls_b.view())
                + x_ls_b_map.transpose_mul(s_ls_a.view())
                + fast_atv(x_ls, &s_ls_ab)),
            &(fast_atv(&basis_ab, &s_w)
                + fast_atv(&basis_a, &s_w_b)
                + fast_atv(&basis_b, &s_w_a)
                + fast_atv(&geom.basis, &s_w_ab)),
        );

        // Static blocks under logb. coeff_mm has no κ; coeff_ll = Fisher 2κ²a
        // (#566). Gaussian mean⊥scale Fisher orthogonality: the wiggle and μ
        // both enter the mean (q = q0 + B·βw), log σ is the only scale block,
        // so coeff_ml = E[H_{μ,ls}] = 0 and l = E[H_{ls,w}] = 0 (observed 2κm,
        // E[m]=0). All of their ψ-directional derivatives (a/b/ab) are 0 since
        // a function identically 0 has 0 derivatives. The Fisher (ls,ls) block
        // depends only on η_ls so its derivatives carry only κ.
        let n = rows.m.len();
        let coeff_mm = &rows.w * &geom.dq_dq0.mapv(|v| v * v) - &rows.m * &geom.d2q_dq02;
        let coeff_ml = Array1::<f64>::zeros(n);
        let coeff_ll = 2.0 * &rows.kappa * &rows.kappa * &rows.obs_weight;
        // coeff_mm_a/b/ab: structurally κ-free; correctness now follows from
        // dw_a/_b/_ab and dm_a/_b/_ab carrying the κ chain on η_ls (above).
        let coeff_mm_a = &(&dw_a * &geom.dq_dq0.mapv(|v| v * v))
            + &(2.0 * &rows.w * &geom.dq_dq0 * &s1_a)
            - &(&dm_a * &geom.d2q_dq02)
            - &(&rows.m * &g2_a);
        let coeff_mm_b = &(&dw_b * &geom.dq_dq0.mapv(|v| v * v))
            + &(2.0 * &rows.w * &geom.dq_dq0 * &s1_b)
            - &(&dm_b * &geom.d2q_dq02)
            - &(&rows.m * &g2_b);
        let coeff_mm_ab = &(&dw_ab * &geom.dq_dq0.mapv(|v| v * v))
            + &(2.0 * &dw_a * &geom.dq_dq0 * &s1_b)
            + &(2.0 * &dw_b * &geom.dq_dq0 * &s1_a)
            + &(2.0 * &rows.w * &s1_a * &s1_b)
            + &(2.0 * &rows.w * &geom.dq_dq0 * &s1_ab)
            - &(&dm_ab * &geom.d2q_dq02)
            - &(&dm_a * &g2_b)
            - &(&dm_b * &g2_a)
            - &(&rows.m * &g2_ab);
        // coeff_ml (μ↔logσ) is Fisher 0; its 1st/2nd ψ-directional derivatives
        // are 0 as well.
        let coeff_ml_a = Array1::<f64>::zeros(n);
        let coeff_ml_b = Array1::<f64>::zeros(n);
        let coeff_ml_ab = Array1::<f64>::zeros(n);
        // Fisher (ls,ls) coeff_ll = 2κ²a (a constant prior weight) depends only
        // on η_ls (#566): ∂(2κ²a)/∂η = 4κκ'a, so the ψ-first derivatives are
        // 4κκ'a·e_a / e_b. The η_ab leg carries one κ on top.
        let coeff_ll_a = 4.0 * &rows.kappa * &rows.kappa_prime * &rows.obs_weight * e_a;
        let coeff_ll_b = 4.0 * &rows.kappa * &rows.kappa_prime * &rows.obs_weight * e_b;
        // coeff_ll_ab = ∂²(2κ²a)/∂a∂b = 4a(κ'²+κκ'')·e_a·e_b + 4κκ'a·e_ab
        // (mirrors the dense helper `d2h_ls_ls`).
        let coeff_ll_ab = 4.0
            * &rows.obs_weight
            * &(&rows.kappa_prime * &rows.kappa_prime + &rows.kappa * &rows.kappa_dprime)
            * &(e_a * e_b)
            + 4.0 * &rows.kappa * &rows.kappa_prime * &rows.obs_weight * e_ab;
        // μ↔wiggle cross coefficients: `a` pairs with the basis, `c` pairs with
        // the first basis derivative (see h_mw below).
        let a = &rows.w * &geom.dq_dq0;
        let a_a = &dw_a * &geom.dq_dq0 + &rows.w * &s1_a;
        let a_b = &dw_b * &geom.dq_dq0 + &rows.w * &s1_b;
        let a_ab = &dw_ab * &geom.dq_dq0 + &dw_a * &s1_b + &dw_b * &s1_a + &rows.w * &s1_ab;
        let c = -&rows.m;
        let c_a = -&dm_a;
        let c_b = -&dm_b;
        let c_ab = -&dm_ab;
        // l (logσ↔wiggle) is Fisher 0 (wiggle is mean-side; mean⊥scale), so all
        // of its 1st/2nd ψ-directional derivatives vanish.
        let l = Array1::<f64>::zeros(n);
        let l_a = Array1::<f64>::zeros(n);
        let l_b = Array1::<f64>::zeros(n);
        let l_ab = Array1::<f64>::zeros(n);

        // (μ,μ): second derivative of Xᵀ diag(coeff_mm) X. The four one-sided
        // pieces are symmetrized by adding their transposes; the pure
        // coefficient-drift term is already symmetric.
        let hmm_ab = weighted_crossprod_psi_maps(
            xmu_ab_map,
            coeff_mm.view(),
            CustomFamilyPsiLinearMapRef::Dense(xmu),
        )?;
        let hmm_ij = weighted_crossprod_psi_maps(xmu_a_map, coeff_mm.view(), xmu_b_map)?;
        let hmm_iwj = weighted_crossprod_psi_maps(
            xmu_a_map,
            coeff_mm_b.view(),
            CustomFamilyPsiLinearMapRef::Dense(xmu),
        )?;
        let hmm_jwi = weighted_crossprod_psi_maps(
            xmu_b_map,
            coeff_mm_a.view(),
            CustomFamilyPsiLinearMapRef::Dense(xmu),
        )?;
        let h_mm = &hmm_ab
            + &hmm_ab.t()
            + &hmm_ij
            + hmm_ij.t()
            + &hmm_iwj
            + hmm_iwj.t()
            + &hmm_jwi
            + hmm_jwi.t()
            + &xt_diag_x_dense(xmu, &coeff_mm_ab)?;
        // (μ,ls): all nine product-rule terms are evaluated although every
        // coefficient vector involved is zero.
        let h_ml = weighted_crossprod_psi_maps(
            xmu_ab_map,
            coeff_ml.view(),
            CustomFamilyPsiLinearMapRef::Dense(x_ls),
        )? + &weighted_crossprod_psi_maps(xmu_a_map, coeff_ml.view(), x_ls_b_map)?
            + &weighted_crossprod_psi_maps(xmu_b_map, coeff_ml.view(), x_ls_a_map)?
            + &weighted_crossprod_psi_maps(
                xmu_a_map,
                coeff_ml_b.view(),
                CustomFamilyPsiLinearMapRef::Dense(x_ls),
            )?
            + &weighted_crossprod_psi_maps(
                xmu_b_map,
                coeff_ml_a.view(),
                CustomFamilyPsiLinearMapRef::Dense(x_ls),
            )?
            + &weighted_crossprod_psi_maps(
                CustomFamilyPsiLinearMapRef::Dense(xmu),
                coeff_ml_a.view(),
                x_ls_b_map,
            )?
            + &weighted_crossprod_psi_maps(
                CustomFamilyPsiLinearMapRef::Dense(xmu),
                coeff_ml_b.view(),
                x_ls_a_map,
            )?
            + &xt_diag_y_dense(xmu, &coeff_ml_ab, x_ls)?
            + &weighted_crossprod_psi_maps(
                CustomFamilyPsiLinearMapRef::Dense(xmu),
                coeff_ml.view(),
                x_ls_ab_map,
            )?;
        // (ls,ls): same symmetrized pattern as (μ,μ) with the Fisher coefficient.
        let hll_ab = weighted_crossprod_psi_maps(
            x_ls_ab_map,
            coeff_ll.view(),
            CustomFamilyPsiLinearMapRef::Dense(x_ls),
        )?;
        let hll_ij = weighted_crossprod_psi_maps(x_ls_a_map, coeff_ll.view(), x_ls_b_map)?;
        let hll_iwj = weighted_crossprod_psi_maps(
            x_ls_a_map,
            coeff_ll_b.view(),
            CustomFamilyPsiLinearMapRef::Dense(x_ls),
        )?;
        let hll_jwi = weighted_crossprod_psi_maps(
            x_ls_b_map,
            coeff_ll_a.view(),
            CustomFamilyPsiLinearMapRef::Dense(x_ls),
        )?;
        let h_ll = &hll_ab
            + &hll_ab.t()
            + &hll_ij
            + hll_ij.t()
            + &hll_iwj
            + hll_iwj.t()
            + &hll_jwi
            + hll_jwi.t()
            + &xt_diag_x_dense(x_ls, &coeff_ll_ab)?;
        // (μ,w): nine product-rule terms for the `a`·basis channel, followed by
        // nine for the `c`·basis_d1 channel (design, coefficient and basis each
        // taking the a / b / ab drift).
        let h_mw = weighted_crossprod_psi_maps(
            xmu_ab_map,
            a.view(),
            CustomFamilyPsiLinearMapRef::Dense(&geom.basis),
        )? + &weighted_crossprod_psi_maps(
            xmu_a_map,
            a_b.view(),
            CustomFamilyPsiLinearMapRef::Dense(&geom.basis),
        )? + &weighted_crossprod_psi_maps(
            xmu_a_map,
            a.view(),
            CustomFamilyPsiLinearMapRef::Dense(&basis_b),
        )? + &weighted_crossprod_psi_maps(
            xmu_b_map,
            a_a.view(),
            CustomFamilyPsiLinearMapRef::Dense(&geom.basis),
        )? + &xt_diag_y_dense(xmu, &a_ab, &geom.basis)?
            + &xt_diag_y_dense(xmu, &a_a, &basis_b)?
            + &weighted_crossprod_psi_maps(
                xmu_b_map,
                a.view(),
                CustomFamilyPsiLinearMapRef::Dense(&basis_a),
            )?
            + &xt_diag_y_dense(xmu, &a_b, &basis_a)?
            + &xt_diag_y_dense(xmu, &a, &basis_ab)?
            + &weighted_crossprod_psi_maps(
                xmu_ab_map,
                c.view(),
                CustomFamilyPsiLinearMapRef::Dense(&geom.basis_d1),
            )?
            + &weighted_crossprod_psi_maps(
                xmu_a_map,
                c_b.view(),
                CustomFamilyPsiLinearMapRef::Dense(&geom.basis_d1),
            )?
            + &weighted_crossprod_psi_maps(
                xmu_a_map,
                c.view(),
                CustomFamilyPsiLinearMapRef::Dense(&basis1_b),
            )?
            + &weighted_crossprod_psi_maps(
                xmu_b_map,
                c_a.view(),
                CustomFamilyPsiLinearMapRef::Dense(&geom.basis_d1),
            )?
            + &xt_diag_y_dense(xmu, &c_ab, &geom.basis_d1)?
            + &xt_diag_y_dense(xmu, &c_a, &basis1_b)?
            + &weighted_crossprod_psi_maps(
                xmu_b_map,
                c.view(),
                CustomFamilyPsiLinearMapRef::Dense(&basis1_a),
            )?
            + &xt_diag_y_dense(xmu, &c_b, &basis1_a)?
            + &xt_diag_y_dense(xmu, &c, &basis1_ab)?;
        // (ls,w): nine product-rule terms, all driven by the zero vectors
        // `l`, `l_a`, `l_b`, `l_ab`.
        let h_lw = weighted_crossprod_psi_maps(
            x_ls_ab_map,
            l.view(),
            CustomFamilyPsiLinearMapRef::Dense(&geom.basis),
        )? + &weighted_crossprod_psi_maps(
            x_ls_a_map,
            l_b.view(),
            CustomFamilyPsiLinearMapRef::Dense(&geom.basis),
        )? + &weighted_crossprod_psi_maps(
            x_ls_a_map,
            l.view(),
            CustomFamilyPsiLinearMapRef::Dense(&basis_b),
        )? + &weighted_crossprod_psi_maps(
            x_ls_b_map,
            l_a.view(),
            CustomFamilyPsiLinearMapRef::Dense(&geom.basis),
        )? + &xt_diag_y_dense(x_ls, &l_ab, &geom.basis)?
            + &xt_diag_y_dense(x_ls, &l_a, &basis_b)?
            + &weighted_crossprod_psi_maps(
                x_ls_b_map,
                l.view(),
                CustomFamilyPsiLinearMapRef::Dense(&basis_a),
            )?
            + &xt_diag_y_dense(x_ls, &l_b, &basis_a)?
            + &xt_diag_y_dense(x_ls, &l, &basis_ab)?;
        // (w,w): second derivative of Bᵀ diag(w) B; basis-drift pieces are
        // symmetrized, the pure weight-drift term is already symmetric.
        let hww_ab = xt_diag_y_dense(&basis_ab, &rows.w, &geom.basis)?;
        let hww_ij = xt_diag_y_dense(&basis_a, &rows.w, &basis_b)?;
        let hww_iwj = xt_diag_y_dense(&basis_a, &dw_b, &geom.basis)?;
        let hww_jwi = xt_diag_y_dense(&basis_b, &dw_a, &geom.basis)?;
        let h_ww = &hww_ab
            + &hww_ab.t()
            + &hww_ij
            + hww_ij.t()
            + &hww_iwj
            + hww_iwj.t()
            + &hww_jwi
            + hww_jwi.t()
            + &xt_diag_x_dense(&geom.basis, &dw_ab)?;

        Ok(crate::custom_family::ExactNewtonJointPsiSecondOrderTerms {
            objective_psi_psi,
            score_psi_psi,
            hessian_psi_psi: gaussian_pack_wiggle_joint_symmetrichessian(
                &h_mm, &h_ml, &h_mw, &h_ll, &h_lw, &h_ww,
            ),
            hessian_psi_psi_operator: None,
        })
    }

    /// Resolves the ψ-direction for `psi_index` and delegates to
    /// `exact_newton_joint_psihessian_directional_derivative_from_parts` with the
    /// flat coefficient direction `d_beta_flat`.
    ///
    /// Returns `Ok(None)` when no ψ-direction is available for `psi_index`.
    ///
    /// # Errors
    /// Propagates any `Err(String)` from the direction lookup or from the
    /// directional-derivative assembly.
    fn exact_newton_joint_psihessian_directional_derivative_from_designs(
        &self,
        block_states: &[ParameterBlockState],
        derivative_blocks: &[Vec<crate::custom_family::CustomFamilyBlockPsiDerivative>],
        psi_index: usize,
        d_beta_flat: &Array1<f64>,
        xmu: &Array2<f64>,
        x_ls: &Array2<f64>,
    ) -> Result<Option<Array2<f64>>, String> {
        let psi_direction = match self.exact_newton_joint_psi_direction(
            block_states,
            derivative_blocks,
            psi_index,
            xmu,
            x_ls,
            &self.policy,
        )? {
            Some(direction) => direction,
            // Nothing to differentiate along when the direction is absent.
            None => return Ok(None),
        };
        self.exact_newton_joint_psihessian_directional_derivative_from_parts(
            block_states,
            &psi_direction,
            d_beta_flat,
            xmu,
            x_ls,
        )
        .map(Some)
    }

    fn exact_newton_joint_psihessian_directional_derivative_from_parts(
        &self,
        block_states: &[ParameterBlockState],
        dir_a: &LocationScaleJointPsiDirection,
        d_beta_flat: &Array1<f64>,
        xmu: &Array2<f64>,
        x_ls: &Array2<f64>,
    ) -> Result<Array2<f64>, String> {
        let pmu = xmu.ncols();
        let p_ls = x_ls.ncols();
        let xmu_map = dir_a.x_primary_psi.as_linear_map_ref();
        let x_ls_map = dir_a.x_ls_psi.as_linear_map_ref();
        let q0 = &block_states[Self::BLOCK_MU].eta;
        let eta_ls = &block_states[Self::BLOCK_LOG_SIGMA].eta;
        let etaw = &block_states[Self::BLOCK_WIGGLE].eta;
        let betaw = &block_states[Self::BLOCK_WIGGLE].beta;
        let layout = GamlssBetaLayout::withwiggle(pmu, p_ls, betaw.len());
        let (umu, u_ls, uw) = layout.split_three(
            d_beta_flat,
            "GaussianLocationScaleWiggleFamily joint psi hessian directional derivative",
        )?;
        let q = q0 + etaw;
        let geom = self.wiggle_geometry(q0.view(), betaw.view())?;
        let rows = self.get_or_compute_row_scalars(&q, eta_ls)?;

        let xi = fast_av(xmu, &umu);
        let zeta = fast_av(x_ls, &u_ls);
        let zmu_a_u = xmu_map.forward_mul(umu.view());
        let zls_a_u = x_ls_map.forward_mul(u_ls.view());
        let b1u = fast_av(&geom.basis_d1, &uw);
        let b2u = fast_av(&geom.basis_d2, &uw);
        let b3u = fast_av(&geom.basis_d3, &uw);

        let q_u = &(&geom.dq_dq0 * &xi) + &fast_av(&geom.basis, &uw);
        let s1_u = &(&geom.d2q_dq02 * &xi) + &b1u;
        let g2_u = &(&geom.d3q_dq03 * &xi) + &b2u;
        let g3_u = &(&geom.d4q_dq04 * &xi) + &b3u;

        let q_a = &geom.dq_dq0 * &dir_a.z_primary_psi;
        let s1_a = &geom.d2q_dq02 * &dir_a.z_primary_psi;
        let g2_a = &geom.d3q_dq03 * &dir_a.z_primary_psi;
        let q_a_u = &(&s1_u * &dir_a.z_primary_psi) + &(&geom.dq_dq0 * &zmu_a_u);
        let s1_a_u = &(&g2_u * &dir_a.z_primary_psi) + &(&geom.d2q_dq02 * &zmu_a_u);
        let g2_a_u = &(&g3_u * &dir_a.z_primary_psi) + &(&geom.d3q_dq03 * &zmu_a_u);

        let basis_u = scale_matrix_rows(&geom.basis_d1, &xi)?;
        let basis1_u = scale_matrix_rows(&geom.basis_d2, &xi)?;
        let basis_a = scale_matrix_rows(&geom.basis_d1, &dir_a.z_primary_psi)?;
        let basis1_a = scale_matrix_rows(&geom.basis_d2, &dir_a.z_primary_psi)?;
        let basis_a_u = scale_matrix_rows(&geom.basis_d2, &(&xi * &dir_a.z_primary_psi))?
            + &scale_matrix_rows(&geom.basis_d1, &zmu_a_u)?;
        let basis1_a_u = scale_matrix_rows(&geom.basis_d3, &(&xi * &dir_a.z_primary_psi))?
            + &scale_matrix_rows(&geom.basis_d2, &zmu_a_u)?;

        // logb κ-chain on η_ls; e_a = ψ_a's η_ls direction, ζ = β-direction.
        // η_au = zls_a_u is the second mixed derivative (β·ψ).
        let e_a = &dir_a.z_ls_psi;
        let four_k2_minus_2kpi = 4.0 * &rows.kappa * &rows.kappa - 2.0 * &rows.kappa_prime;
        let dw_u = -2.0 * &rows.w * &rows.kappa * &zeta;
        let dm_u = -(&rows.w * &q_u) - &(2.0 * &rows.m * &rows.kappa * &zeta);
        let dw_a = -2.0 * &rows.w * &rows.kappa * e_a;
        let dm_a = -(&rows.w * &q_a) - &(2.0 * &rows.m * &rows.kappa * e_a);
        let dw_a_u = &four_k2_minus_2kpi * &rows.w * &(e_a * &zeta)
            - &(2.0 * &rows.w * &rows.kappa * &zls_a_u);
        let dm_a_u = &(2.0 * &rows.w * &rows.kappa * &(&q_a * &zeta + &q_u * e_a))
            - &(&rows.w * &q_a_u)
            + &(&four_k2_minus_2kpi * &rows.m * &(e_a * &zeta))
            - &(2.0 * &rows.m * &rows.kappa * &zls_a_u);

        let coeff_mm_u = &(&dw_u * &geom.dq_dq0.mapv(|v| v * v))
            + &(2.0 * &rows.w * &geom.dq_dq0 * &s1_u)
            - &(&dm_u * &geom.d2q_dq02)
            - &(&rows.m * &g2_u);
        // coeff_ml (μ↔logσ) is mean⊥scale Fisher 0 (E[m]=0), so both its
        // β-drift derivative coeff_ml_u and the mixed coeff_ml_a_u are 0.
        let n = rows.m.len();
        let coeff_ml_u = Array1::<f64>::zeros(n);
        // Fisher (ls,ls) coeff_ll = 2κ²a (#566); ∂(2κ²a)/∂η = 4κκ'a, so the
        // β-drift derivative along ζ is 4κκ'a·ζ.
        let coeff_ll_u = 4.0 * &rows.kappa * &rows.kappa_prime * &rows.obs_weight * &zeta;
        let coeff_mm_a_u = &(&dw_a_u * &geom.dq_dq0.mapv(|v| v * v))
            + &(2.0 * &dw_a * &geom.dq_dq0 * &s1_u)
            + &(2.0 * &dw_u * &geom.dq_dq0 * &s1_a)
            + &(2.0 * &rows.w * &s1_u * &s1_a)
            + &(2.0 * &rows.w * &geom.dq_dq0 * &s1_a_u)
            - &(&dm_a_u * &geom.d2q_dq02)
            - &(&dm_a * &g2_u)
            - &(&dm_u * &g2_a)
            - &(&rows.m * &g2_a_u);
        // coeff_ml_a_u = ∂²(coeff_ml)/∂a∂u = 0 (coeff_ml ≡ Fisher 0).
        let coeff_ml_a_u = Array1::<f64>::zeros(n);
        // coeff_ll_a_u = ∂²(2κ²a)/∂a∂u for the Fisher (ls,ls) block (#566):
        // 4a(κ'²+κκ'')·e_a·ζ + 4κκ'a·η_au (the η_au=zls_a_u mixed leg), mirroring
        // the dense mixed-drift helper.
        let coeff_ll_a_u = 4.0
            * &rows.obs_weight
            * &(&rows.kappa_prime * &rows.kappa_prime + &rows.kappa * &rows.kappa_dprime)
            * &(e_a * &zeta)
            + 4.0 * &rows.kappa * &rows.kappa_prime * &rows.obs_weight * &zls_a_u;

        let a = &rows.w * &geom.dq_dq0;
        let a_u = &dw_u * &geom.dq_dq0 + &rows.w * &s1_u;
        let a_a = &dw_a * &geom.dq_dq0 + &rows.w * &s1_a;
        let a_a_u = &dw_a_u * &geom.dq_dq0 + &dw_a * &s1_u + &dw_u * &s1_a + &rows.w * &s1_a_u;
        let c = -&rows.m;
        let c_u = -&dm_u;
        let c_a = -&dm_a;
        let c_a_u = -&dm_a_u;
        // l (logσ↔wiggle) is mean⊥scale Fisher 0 (wiggle is mean-side), so its
        // β-drift (l_u), ψ (l_a), and mixed (l_a_u) derivatives all vanish.
        let l = Array1::<f64>::zeros(n);
        let l_u = Array1::<f64>::zeros(n);
        let l_a = Array1::<f64>::zeros(n);
        let l_a_u = Array1::<f64>::zeros(n);

        let hmm_a1 = weighted_crossprod_psi_maps(
            xmu_map,
            coeff_mm_u.view(),
            CustomFamilyPsiLinearMapRef::Dense(xmu),
        )?;
        let h_mm = &hmm_a1 + &hmm_a1.t() + &xt_diag_x_dense(xmu, &coeff_mm_a_u)?;
        let h_ml = weighted_crossprod_psi_maps(
            xmu_map,
            coeff_ml_u.view(),
            CustomFamilyPsiLinearMapRef::Dense(x_ls),
        )? + &weighted_crossprod_psi_maps(
            CustomFamilyPsiLinearMapRef::Dense(xmu),
            coeff_ml_u.view(),
            x_ls_map,
        )? + &xt_diag_y_dense(xmu, &coeff_ml_a_u, x_ls)?;
        let hll_a1 = weighted_crossprod_psi_maps(
            x_ls_map,
            coeff_ll_u.view(),
            CustomFamilyPsiLinearMapRef::Dense(x_ls),
        )?;
        let h_ll = &hll_a1 + &hll_a1.t() + &xt_diag_x_dense(x_ls, &coeff_ll_a_u)?;
        let h_mw = weighted_crossprod_psi_maps(
            xmu_map,
            a_u.view(),
            CustomFamilyPsiLinearMapRef::Dense(&geom.basis),
        )? + &weighted_crossprod_psi_maps(
            xmu_map,
            a.view(),
            CustomFamilyPsiLinearMapRef::Dense(&basis_u),
        )? + &xt_diag_y_dense(xmu, &a_a_u, &geom.basis)?
            + &xt_diag_y_dense(xmu, &a_a, &basis_u)?
            + &xt_diag_y_dense(xmu, &a_u, &basis_a)?
            + &xt_diag_y_dense(xmu, &a, &basis_a_u)?
            + &weighted_crossprod_psi_maps(
                xmu_map,
                c_u.view(),
                CustomFamilyPsiLinearMapRef::Dense(&geom.basis_d1),
            )?
            + &weighted_crossprod_psi_maps(
                xmu_map,
                c.view(),
                CustomFamilyPsiLinearMapRef::Dense(&basis1_u),
            )?
            + &xt_diag_y_dense(xmu, &c_a_u, &geom.basis_d1)?
            + &xt_diag_y_dense(xmu, &c_a, &basis1_u)?
            + &xt_diag_y_dense(xmu, &c_u, &basis1_a)?
            + &xt_diag_y_dense(xmu, &c, &basis1_a_u)?;
        let h_lw = weighted_crossprod_psi_maps(
            x_ls_map,
            l_u.view(),
            CustomFamilyPsiLinearMapRef::Dense(&geom.basis),
        )? + &weighted_crossprod_psi_maps(
            x_ls_map,
            l.view(),
            CustomFamilyPsiLinearMapRef::Dense(&basis_u),
        )? + &xt_diag_y_dense(x_ls, &l_a_u, &geom.basis)?
            + &xt_diag_y_dense(x_ls, &l_a, &basis_u)?
            + &xt_diag_y_dense(x_ls, &l_u, &basis_a)?
            + &xt_diag_y_dense(x_ls, &l, &basis_a_u)?;
        let hww_a_u = xt_diag_y_dense(&basis_a_u, &rows.w, &geom.basis)?;
        let hww_aw = xt_diag_y_dense(&basis_a, &dw_u, &geom.basis)?;
        let hww_au = xt_diag_y_dense(&basis_a, &rows.w, &basis_u)?;
        let h_ww = &hww_a_u
            + &hww_a_u.t()
            + &hww_aw
            + hww_aw.t()
            + &hww_au
            + hww_au.t()
            + &xt_diag_x_dense(&geom.basis, &dw_a_u)?;

        Ok(gaussian_pack_wiggle_joint_symmetrichessian(
            &h_mm, &h_ml, &h_mw, &h_ll, &h_lw, &h_ww,
        ))
    }

    /// Joint ψ terms for hyper-parameter `psi_index`, using the dense design
    /// pair resolved from `specs`.
    ///
    /// Yields `Ok(None)` when `exact_joint_dense_block_designs` has no dense
    /// designs to offer; otherwise forwards to
    /// `exact_newton_joint_psi_terms_from_designs`.
    fn exact_newton_joint_psi_terms_for_specs(
        &self,
        block_states: &[ParameterBlockState],
        specs: &[ParameterBlockSpec],
        derivative_blocks: &[Vec<crate::custom_family::CustomFamilyBlockPsiDerivative>],
        psi_index: usize,
    ) -> Result<Option<crate::custom_family::ExactNewtonJointPsiTerms>, String> {
        let designs = self.exact_joint_dense_block_designs(Some(specs))?;
        match designs {
            None => Ok(None),
            Some((mean_design, scale_design)) => self.exact_newton_joint_psi_terms_from_designs(
                block_states,
                derivative_blocks,
                psi_index,
                &mean_design,
                &scale_design,
            ),
        }
    }

    /// Second-order joint ψ terms for the pair (`psi_i`, `psi_j`), using the
    /// dense design pair resolved from `specs`.
    ///
    /// Yields `Ok(None)` when `exact_joint_dense_block_designs` has no dense
    /// designs to offer; otherwise forwards to
    /// `exact_newton_joint_psisecond_order_terms_from_designs`.
    fn exact_newton_joint_psisecond_order_terms_for_specs(
        &self,
        block_states: &[ParameterBlockState],
        specs: &[ParameterBlockSpec],
        derivative_blocks: &[Vec<crate::custom_family::CustomFamilyBlockPsiDerivative>],
        psi_i: usize,
        psi_j: usize,
    ) -> Result<Option<crate::custom_family::ExactNewtonJointPsiSecondOrderTerms>, String> {
        let designs = self.exact_joint_dense_block_designs(Some(specs))?;
        match designs {
            None => Ok(None),
            Some((mean_design, scale_design)) => self
                .exact_newton_joint_psisecond_order_terms_from_designs(
                    block_states,
                    derivative_blocks,
                    psi_i,
                    psi_j,
                    &mean_design,
                    &scale_design,
                ),
        }
    }

    /// Directional derivative (along the flattened coefficient direction
    /// `d_beta_flat`) of the ψ-Hessian for `psi_index`, using the dense
    /// design pair resolved from `specs`.
    ///
    /// Yields `Ok(None)` when `exact_joint_dense_block_designs` has no dense
    /// designs to offer; otherwise forwards to
    /// `exact_newton_joint_psihessian_directional_derivative_from_designs`.
    fn exact_newton_joint_psihessian_directional_derivative_for_specs(
        &self,
        block_states: &[ParameterBlockState],
        specs: &[ParameterBlockSpec],
        derivative_blocks: &[Vec<crate::custom_family::CustomFamilyBlockPsiDerivative>],
        psi_index: usize,
        d_beta_flat: &Array1<f64>,
    ) -> Result<Option<Array2<f64>>, String> {
        let designs = self.exact_joint_dense_block_designs(Some(specs))?;
        match designs {
            None => Ok(None),
            Some((mean_design, scale_design)) => self
                .exact_newton_joint_psihessian_directional_derivative_from_designs(
                    block_states,
                    derivative_blocks,
                    psi_index,
                    d_beta_flat,
                    &mean_design,
                    &scale_design,
                ),
        }
    }
}

impl CustomFamily for GaussianLocationScaleWiggleFamily {
    /// Always `true` for this family.
    ///
    /// NOTE(review): presumably tells the solver that the exact joint Hessian
    /// varies with the coefficients and must be re-evaluated as β moves —
    /// confirm against the `CustomFamily` trait documentation.
    fn exact_newton_joint_hessian_beta_dependent(&self) -> bool {
        true
    }

    /// Cost estimate for the coefficient Hessian, delegated to the shared
    /// location-scale engine with this family's row count (`self.y.len()`).
    fn coefficient_hessian_cost(&self, specs: &[ParameterBlockSpec]) -> u64 {
        // Operator-aware estimate (derivation lives with
        // GaussianLocationScaleFamily): once `use_joint_matrix_free_path`
        // picks the workspace operator, a joint Hv apply costs
        // O(n · (p_t + p_ℓ + p_w)) because the row-streaming RowCoeffOperator
        // never forms the dense (p_t + p_ℓ + p_w)² matrix.
        let row_count = self.y.len() as u64;
        crate::families::location_scale_engine::location_scale_coefficient_hessian_cost(
            row_count, specs,
        )
    }

    /// Linear inequality constraints for one block.
    ///
    /// Only the wiggle block (`Self::BLOCK_WIGGLE`) is constrained; it gets
    /// whatever `monotone_wiggle_nonnegative_constraints` produces for the
    /// block's column count. Every other block returns `Ok(None)`.
    fn block_linear_constraints(
        &self,
        block_states: &[ParameterBlockState],
        block_idx: usize,
        spec: &ParameterBlockSpec,
    ) -> Result<Option<LinearInequalityConstraints>, String> {
        assert!(block_states.len() <= isize::MAX as usize);
        let constraints = if block_idx == Self::BLOCK_WIGGLE {
            monotone_wiggle_nonnegative_constraints(spec.design.ncols())
        } else {
            None
        };
        Ok(constraints)
    }

    /// Post-update hook for a block's coefficients.
    ///
    /// The coefficient vector is returned unchanged. For the wiggle block it
    /// is first passed through `validate_monotone_wiggle_beta_nonnegative`,
    /// whose error (if any) is propagated.
    ///
    /// # Panics
    /// Panics if `block_spec.name` is empty.
    fn post_update_block_beta(
        &self,
        block_states: &[ParameterBlockState],
        block_idx: usize,
        block_spec: &ParameterBlockSpec,
        beta: Array1<f64>,
    ) -> Result<Array1<f64>, String> {
        assert!(block_states.len() <= isize::MAX as usize);
        assert!(!block_spec.name.is_empty());
        if block_idx == Self::BLOCK_WIGGLE {
            validate_monotone_wiggle_beta_nonnegative(
                &beta,
                "GaussianLocationScaleWiggleFamily post-update",
            )?;
        }
        Ok(beta)
    }

    /// Evaluates the log-likelihood together with one diagonal working set
    /// (working response, working weight) per block: mean, log-σ, wiggle.
    ///
    /// # Errors
    /// Returns a `DimensionMismatch` (converted to `String`) unless exactly
    /// three block states are supplied and each predictor, as well as
    /// `self.weights`, has `self.y.len()` entries. Errors from
    /// `BlockWorkingSet::diagonal_checked` are propagated.
    fn evaluate(&self, block_states: &[ParameterBlockState]) -> Result<FamilyEvaluation, String> {
        let block_count = block_states.len();
        if block_count != 3 {
            return Err(GamlssError::DimensionMismatch {
                reason: format!(
                    "GaussianLocationScaleWiggleFamily expects 3 blocks, got {}",
                    block_count
                ),
            }
            .into());
        }
        let n = self.y.len();
        let mean_eta = &block_states[Self::BLOCK_MU].eta;
        let scale_eta = &block_states[Self::BLOCK_LOG_SIGMA].eta;
        let wiggle_eta = &block_states[Self::BLOCK_WIGGLE].eta;
        let lengths_agree = mean_eta.len() == n
            && scale_eta.len() == n
            && wiggle_eta.len() == n
            && self.weights.len() == n;
        if !lengths_agree {
            return Err(GamlssError::DimensionMismatch {
                reason: "GaussianLocationScaleWiggleFamily input size mismatch".to_string(),
            }
            .into());
        }
        let ln2pi = (2.0 * std::f64::consts::PI).ln();

        // Rows are independent: each one writes six working values into the
        // pre-allocated outputs below and contributes one log-likelihood
        // term, reduced with Rayon's `sum`. The mean and wiggle blocks share
        // the same working weight and the same working shift (added to their
        // own predictor). The combined predictor `eta_mu + etaw` is formed
        // per row rather than as a separate n-vector.
        const CHUNK: usize = 1024;
        let mut mean_response = Array1::<f64>::zeros(n);
        let mut mean_weight = Array1::<f64>::zeros(n);
        let mut scale_response = Array1::<f64>::zeros(n);
        let mut scale_weight = Array1::<f64>::zeros(n);
        let mut wiggle_response = Array1::<f64>::zeros(n);
        let mut wiggle_weight = Array1::<f64>::zeros(n);

        let mean_response_s = mean_response
            .as_slice_memory_order_mut()
            .expect("zeros is contiguous");
        let mean_weight_s = mean_weight
            .as_slice_memory_order_mut()
            .expect("zeros is contiguous");
        let scale_response_s = scale_response
            .as_slice_memory_order_mut()
            .expect("zeros is contiguous");
        let scale_weight_s = scale_weight
            .as_slice_memory_order_mut()
            .expect("zeros is contiguous");
        let wiggle_response_s = wiggle_response
            .as_slice_memory_order_mut()
            .expect("zeros is contiguous");
        let wiggle_weight_s = wiggle_weight
            .as_slice_memory_order_mut()
            .expect("zeros is contiguous");

        let response = self.y.view();
        let prior_weight = self.weights.view();
        let mean_pred = mean_eta.view();
        let scale_pred = scale_eta.view();
        let wiggle_pred = wiggle_eta.view();

        let log_likelihood: f64 = mean_response_s
            .par_chunks_mut(CHUNK)
            .zip(mean_weight_s.par_chunks_mut(CHUNK))
            .zip(scale_response_s.par_chunks_mut(CHUNK))
            .zip(scale_weight_s.par_chunks_mut(CHUNK))
            .zip(wiggle_response_s.par_chunks_mut(CHUNK))
            .zip(wiggle_weight_s.par_chunks_mut(CHUNK))
            .enumerate()
            .map(
                |(chunk_no, (((((mean_z, mean_w), scale_z), scale_w), wiggle_z), wiggle_w))| {
                    let base = chunk_no * CHUNK;
                    let mut chunk_ll = 0.0;
                    for offset in 0..mean_z.len() {
                        let i = base + offset;
                        let mu_i = mean_pred[i];
                        let wig_i = wiggle_pred[i];
                        let row = gaussian_diagonal_row_kernel(
                            response[i],
                            mu_i + wig_i,
                            scale_pred[i],
                            prior_weight[i],
                            ln2pi,
                        );
                        let location_weight = row.location_working_weight;
                        let location_shift = row.location_working_shift;
                        mean_z[offset] = mu_i + location_shift;
                        mean_w[offset] = location_weight;
                        wiggle_z[offset] = wig_i + location_shift;
                        wiggle_w[offset] = location_weight;
                        scale_z[offset] = row.log_sigma_working_response;
                        scale_w[offset] = row.log_sigma_working_weight;
                        chunk_ll += row.log_likelihood;
                    }
                    chunk_ll
                },
            )
            .sum();

        let mean_set = BlockWorkingSet::diagonal_checked(mean_response, mean_weight)?;
        let scale_set = BlockWorkingSet::diagonal_checked(scale_response, scale_weight)?;
        let wiggle_set = BlockWorkingSet::diagonal_checked(wiggle_response, wiggle_weight)?;
        Ok(FamilyEvaluation {
            log_likelihood,
            blockworking_sets: vec![mean_set, scale_set, wiggle_set],
        })
    }

    /// Full-data weighted Gaussian log-likelihood at the current predictors.
    ///
    /// Per row the location is `eta_mu[i] + etaw[i]`, σ comes from
    /// `logb_sigma_from_eta_scalar(eta_ls[i])`, and the contribution is
    /// `w_i · (-½ (r² / σ² + ln 2π + 2 ln σ))` with `r = y_i − location`.
    ///
    /// Rows whose prior weight is exactly zero are skipped, matching the
    /// subsampled path in `log_likelihood_only_with_options`. Without the
    /// skip a zero-weight row with a non-finite term (e.g. a NaN response
    /// masked out by its weight) would turn the whole sum into NaN, because
    /// `0.0 * NaN` and `0.0 * ∞` are NaN. For finite rows the result is
    /// unchanged.
    ///
    /// # Errors
    /// Returns a `DimensionMismatch` (converted to `String`) unless exactly
    /// three block states are supplied and each predictor, as well as
    /// `self.weights`, has `self.y.len()` entries.
    fn log_likelihood_only(&self, block_states: &[ParameterBlockState]) -> Result<f64, String> {
        if block_states.len() != 3 {
            return Err(GamlssError::DimensionMismatch {
                reason: format!(
                    "GaussianLocationScaleWiggleFamily expects 3 blocks, got {}",
                    block_states.len()
                ),
            }
            .into());
        }
        let n = self.y.len();
        let eta_mu = &block_states[Self::BLOCK_MU].eta;
        let eta_ls = &block_states[Self::BLOCK_LOG_SIGMA].eta;
        let etaw = &block_states[Self::BLOCK_WIGGLE].eta;
        if eta_mu.len() != n || eta_ls.len() != n || etaw.len() != n || self.weights.len() != n {
            return Err(GamlssError::DimensionMismatch {
                reason: "GaussianLocationScaleWiggleFamily input size mismatch".to_string(),
            }
            .into());
        }
        let ln2pi = (2.0 * std::f64::consts::PI).ln();
        let mut ll = 0.0;
        for i in 0..n {
            let wi = self.weights[i];
            // A zero weight removes the row entirely; never multiply it into
            // a possibly non-finite term.
            if wi == 0.0 {
                continue;
            }
            let sigma_i = logb_sigma_from_eta_scalar(eta_ls[i]);
            let inv_s2 = (sigma_i * sigma_i).recip();
            // The location is formed per row (same rounding as the previous
            // `q = eta_mu + etaw` temporary) to avoid an n-vector allocation.
            let r = self.y[i] - (eta_mu[i] + etaw[i]);
            ll += wi * (-0.5 * (r * r * inv_s2 + ln2pi + 2.0 * sigma_i.ln()));
        }
        Ok(ll)
    }

    /// Log-likelihood for the outer loop, optionally restricted to a row
    /// subsample.
    ///
    /// With `options.outer_score_subsample == None` this is exactly
    /// `log_likelihood_only`. With `Some`, only the listed rows contribute,
    /// and each row's term is scaled by its `WeightedOuterRow.weight` — the
    /// Horvitz–Thompson inverse-inclusion factor 1/π_i, for uniform or
    /// stratified sampling — so the partial sum is an unbiased estimator of
    /// the full-data log-likelihood. Rows whose prior weight is exactly zero
    /// contribute `0.0`. Inner PIRLS line searches never install the
    /// subsample option, so they keep scoring the exact full-data value.
    ///
    /// # Errors
    /// On the subsampled path, returns a `DimensionMismatch` (converted to
    /// `String`) unless exactly three block states are supplied and each
    /// predictor, as well as `self.weights`, has `self.y.len()` entries.
    fn log_likelihood_only_with_options(
        &self,
        block_states: &[ParameterBlockState],
        options: &BlockwiseFitOptions,
    ) -> Result<f64, String> {
        let subsample = match options.outer_score_subsample.as_ref() {
            Some(subsample) => subsample,
            None => return self.log_likelihood_only(block_states),
        };
        let block_count = block_states.len();
        if block_count != 3 {
            return Err(GamlssError::DimensionMismatch {
                reason: format!(
                    "GaussianLocationScaleWiggleFamily expects 3 blocks, got {}",
                    block_count
                ),
            }
            .into());
        }
        let n = self.y.len();
        let mean_eta = &block_states[Self::BLOCK_MU].eta;
        let scale_eta = &block_states[Self::BLOCK_LOG_SIGMA].eta;
        let wiggle_eta = &block_states[Self::BLOCK_WIGGLE].eta;
        let lengths_agree = mean_eta.len() == n
            && scale_eta.len() == n
            && wiggle_eta.len() == n
            && self.weights.len() == n;
        if !lengths_agree {
            return Err(GamlssError::DimensionMismatch {
                reason: "GaussianLocationScaleWiggleFamily input size mismatch".to_string(),
            }
            .into());
        }
        let ln2pi = (2.0 * std::f64::consts::PI).ln();
        use rayon::iter::ParallelIterator;
        let total: f64 = subsample
            .rows
            .par_iter()
            .map(|sampled| {
                let i = sampled.index;
                let prior_weight = self.weights[i];
                if prior_weight == 0.0 {
                    0.0
                } else {
                    let sigma_i = logb_sigma_from_eta_scalar(scale_eta[i]);
                    let inv_s2 = (sigma_i * sigma_i).recip();
                    let r = self.y[i] - mean_eta[i] - wiggle_eta[i];
                    sampled.weight
                        * prior_weight
                        * (-0.5 * (r * r * inv_s2 + ln2pi + 2.0 * sigma_i.ln()))
                }
            })
            .sum();
        Ok(total)
    }

    /// Always `true` for this family.
    ///
    /// NOTE(review): presumably forces the outer hyper-parameter optimisation
    /// onto the joint (all-blocks) path — confirm against the `CustomFamily`
    /// trait documentation.
    fn requires_joint_outer_hyper_path(&self) -> bool {
        true
    }

    /// Directional derivative of one block's diagonal Hessian block along a
    /// block-local coefficient direction `d_beta`.
    ///
    /// `d_beta` is embedded into a zero-padded joint direction (blocks laid
    /// out as mean, log-σ, wiggle), the joint directional Hessian derivative
    /// is computed from the dense designs, and the square sub-block belonging
    /// to `block_idx` is returned. An unknown `block_idx` yields `Ok(None)`.
    ///
    /// # Errors
    /// Fails when the block count is not 3, when the mean or log-σ design is
    /// missing, when `d_beta` does not match the block width, or when the
    /// joint derivative is unavailable.
    fn exact_newton_hessian_directional_derivative(
        &self,
        block_states: &[ParameterBlockState],
        block_idx: usize,
        d_beta: &Array1<f64>,
    ) -> Result<Option<Array2<f64>>, String> {
        let block_count = block_states.len();
        if block_count != 3 {
            return Err(GamlssError::DimensionMismatch {
                reason: format!(
                    "GaussianLocationScaleWiggleFamily expects 3 blocks, got {}",
                    block_count
                ),
            }
            .into());
        }
        let Some(mean_design) = self.mu_design.as_ref() else {
            return Err(
                "GaussianLocationScaleWiggleFamily exact path is missing mu design".to_string(),
            );
        };
        let pmu = mean_design.ncols();
        let Some(scale_design) = self.log_sigma_design.as_ref() else {
            return Err(
                "GaussianLocationScaleWiggleFamily exact path is missing log-sigma design"
                    .to_string(),
            );
        };
        let p_ls = scale_design.ncols();
        let pw = block_states[Self::BLOCK_WIGGLE].beta.len();

        // Offsets of each block inside the flattened joint coefficient vector.
        let scale_begin = pmu;
        let wiggle_begin = pmu + p_ls;
        let total = wiggle_begin + pw;
        let (lo, hi) = match block_idx {
            Self::BLOCK_MU => (0usize, scale_begin),
            Self::BLOCK_LOG_SIGMA => (scale_begin, wiggle_begin),
            Self::BLOCK_WIGGLE => (wiggle_begin, total),
            _ => return Ok(None),
        };
        let block_width = hi - lo;
        if d_beta.len() != block_width {
            return Err(GamlssError::DimensionMismatch {
                reason: format!(
                    "GaussianLocationScaleWiggleFamily block {block_idx} d_beta length mismatch: got {}, expected {}",
                    d_beta.len(),
                    block_width
                ),
            }
            .into());
        }

        let mut joint_direction = Array1::<f64>::zeros(total);
        joint_direction.slice_mut(s![lo..hi]).assign(d_beta);
        let (xmu, x_ls) = self.dense_block_designs()?;
        let joint_derivative = self
            .exact_newton_joint_hessian_directional_derivative_from_designs(
                block_states,
                &xmu,
                &x_ls,
                &joint_direction,
            )?;
        let Some(joint_derivative) = joint_derivative else {
            return Err("missing Gaussian wiggle exact joint directional Hessian".to_string());
        };
        Ok(Some(joint_derivative.slice(s![lo..hi, lo..hi]).to_owned()))
    }

    /// Exact joint Hessian without spec overrides: forwards to
    /// `exact_newton_joint_hessian_for_specs` with `None` for the specs.
    fn exact_newton_joint_hessian(
        &self,
        block_states: &[ParameterBlockState],
    ) -> Result<Option<Array2<f64>>, String> {
        self.exact_newton_joint_hessian_for_specs(block_states, None)
    }

    /// Always `true`: this impl provides its own `exact_newton_joint_hessian`
    /// (and the `_with_specs` variant) rather than relying on a default.
    fn has_explicit_joint_hessian(&self) -> bool {
        true
    }

    /// First directional derivative of the joint Hessian along the flattened
    /// coefficient direction `d_beta_flat`, without spec overrides: forwards
    /// to `exact_newton_joint_hessian_directional_derivative_for_specs` with
    /// `None` for the specs.
    fn exact_newton_joint_hessian_directional_derivative(
        &self,
        block_states: &[ParameterBlockState],
        d_beta_flat: &Array1<f64>,
    ) -> Result<Option<Array2<f64>>, String> {
        self.exact_newton_joint_hessian_directional_derivative_for_specs(
            block_states,
            None,
            d_beta_flat,
        )
    }

    /// Second directional derivative of the joint Hessian along the two
    /// flattened coefficient directions `d_beta_u_flat` and `d_beta_v_flat`,
    /// without spec overrides: forwards to
    /// `exact_newton_joint_hessian_second_directional_derivative_for_specs`
    /// with `None` for the specs.
    fn exact_newton_joint_hessiansecond_directional_derivative(
        &self,
        block_states: &[ParameterBlockState],
        d_beta_u_flat: &Array1<f64>,
        d_beta_v_flat: &Array1<f64>,
    ) -> Result<Option<Array2<f64>>, String> {
        self.exact_newton_joint_hessian_second_directional_derivative_for_specs(
            block_states,
            None,
            d_beta_u_flat,
            d_beta_v_flat,
        )
    }

    /// Exact joint Hessian using the caller-supplied block specs: forwards to
    /// `exact_newton_joint_hessian_for_specs` with `Some(specs)`.
    fn exact_newton_joint_hessian_with_specs(
        &self,
        block_states: &[ParameterBlockState],
        specs: &[ParameterBlockSpec],
    ) -> Result<Option<Array2<f64>>, String> {
        self.exact_newton_joint_hessian_for_specs(block_states, Some(specs))
    }

    /// First directional derivative of the joint Hessian along `d_beta_flat`
    /// using the caller-supplied block specs: forwards to
    /// `exact_newton_joint_hessian_directional_derivative_for_specs` with
    /// `Some(specs)`.
    fn exact_newton_joint_hessian_directional_derivative_with_specs(
        &self,
        block_states: &[ParameterBlockState],
        specs: &[ParameterBlockSpec],
        d_beta_flat: &Array1<f64>,
    ) -> Result<Option<Array2<f64>>, String> {
        self.exact_newton_joint_hessian_directional_derivative_for_specs(
            block_states,
            Some(specs),
            d_beta_flat,
        )
    }

    /// Second directional derivative of the joint Hessian along
    /// `d_beta_u_flat` and `d_beta_v_flat` using the caller-supplied block
    /// specs: forwards to
    /// `exact_newton_joint_hessian_second_directional_derivative_for_specs`
    /// with `Some(specs)`.
    fn exact_newton_joint_hessian_second_directional_derivative_with_specs(
        &self,
        block_states: &[ParameterBlockState],
        specs: &[ParameterBlockSpec],
        d_beta_u_flat: &Array1<f64>,
        d_beta_v_flat: &Array1<f64>,
    ) -> Result<Option<Array2<f64>>, String> {
        self.exact_newton_joint_hessian_second_directional_derivative_for_specs(
            block_states,
            Some(specs),
            d_beta_u_flat,
            d_beta_v_flat,
        )
    }

    /// Trait entry point for the joint ψ terms of hyper-parameter
    /// `psi_index`; a direct pass-through to the inherent
    /// `exact_newton_joint_psi_terms_for_specs`.
    fn exact_newton_joint_psi_terms(
        &self,
        block_states: &[ParameterBlockState],
        specs: &[ParameterBlockSpec],
        derivative_blocks: &[Vec<crate::custom_family::CustomFamilyBlockPsiDerivative>],
        psi_index: usize,
    ) -> Result<Option<crate::custom_family::ExactNewtonJointPsiTerms>, String> {
        self.exact_newton_joint_psi_terms_for_specs(
            block_states,
            specs,
            derivative_blocks,
            psi_index,
        )
    }

    /// Trait entry point for the second-order joint ψ terms of the pair
    /// (`psi_i`, `psi_j`); a direct pass-through to the inherent
    /// `exact_newton_joint_psisecond_order_terms_for_specs`.
    fn exact_newton_joint_psisecond_order_terms(
        &self,
        block_states: &[ParameterBlockState],
        specs: &[ParameterBlockSpec],
        derivative_blocks: &[Vec<crate::custom_family::CustomFamilyBlockPsiDerivative>],
        psi_i: usize,
        psi_j: usize,
    ) -> Result<Option<crate::custom_family::ExactNewtonJointPsiSecondOrderTerms>, String> {
        self.exact_newton_joint_psisecond_order_terms_for_specs(
            block_states,
            specs,
            derivative_blocks,
            psi_i,
            psi_j,
        )
    }

    /// Trait entry point for the directional derivative (along
    /// `d_beta_flat`) of the ψ-Hessian for `psi_index`; a direct pass-through
    /// to the inherent
    /// `exact_newton_joint_psihessian_directional_derivative_for_specs`.
    fn exact_newton_joint_psihessian_directional_derivative(
        &self,
        block_states: &[ParameterBlockState],
        specs: &[ParameterBlockSpec],
        derivative_blocks: &[Vec<crate::custom_family::CustomFamilyBlockPsiDerivative>],
        psi_index: usize,
        d_beta_flat: &Array1<f64>,
    ) -> Result<Option<Array2<f64>>, String> {
        self.exact_newton_joint_psihessian_directional_derivative_for_specs(
            block_states,
            specs,
            derivative_blocks,
            psi_index,
            d_beta_flat,
        )
    }

    /// Builds the shared joint-ψ workspace for this family.
    ///
    /// Returns `Ok(None)` when `exact_joint_supported()` is false. Otherwise
    /// the workspace owns clones of the family, the block states and the
    /// derivative blocks; construction errors are propagated.
    fn exact_newton_joint_psi_workspace(
        &self,
        block_states: &[ParameterBlockState],
        specs: &[ParameterBlockSpec],
        derivative_blocks: &[Vec<crate::custom_family::CustomFamilyBlockPsiDerivative>],
    ) -> Result<Option<Arc<dyn ExactNewtonJointPsiWorkspace>>, String> {
        if !self.exact_joint_supported() {
            return Ok(None);
        }
        let workspace = GaussianLocationScaleWiggleExactNewtonJointPsiWorkspace::new(
            self.clone(),
            block_states.to_vec(),
            specs,
            derivative_blocks.to_vec(),
        )?;
        let shared: Arc<dyn ExactNewtonJointPsiWorkspace> = Arc::new(workspace);
        Ok(Some(shared))
    }

    /// Outer-aware joint-ψ workspace that also carries the optional row
    /// subsample from `options.outer_score_subsample`.
    ///
    /// The wiggle ψ workspace reuses the generic
    /// `LocationScaleJointPsiWorkspace` shared with the non-wiggle GLS
    /// family, and the subsample is passed through the trait. The wiggle's
    /// `ws_psi_*_from_parts` impls currently ignore that subsample and fall
    /// back to the full-data exact wiggle ψ path (see their inline rationale
    /// and the `apply_ht_mask_*` helpers used by the non-wiggle GLS family).
    /// Storing the subsample anyway keeps the workspace signature uniform
    /// across both families and leaves a hook for the follow-up that turns
    /// the wiggle inline arrays into a weights struct so HT masking can be
    /// applied in one place. Under subsampling the total outer score is still
    /// an unbiased estimator of the full-data outer score: HT-unbiased LL
    /// (`log_likelihood_only_with_options`) + HT-unbiased ρ-Hessian
    /// (`exact_newton_joint_hessian_workspace_with_options`) + exact ψ (the
    /// wiggle workspace path).
    ///
    /// NOTE(review): the doc on `outer_derivative_subsample_capable` describes
    /// the ψ path as applying per-row weight masking, which conflicts with
    /// the "drops the subsample" statement above — verify against the
    /// `ws_psi_*_from_parts` implementations and reconcile.
    ///
    /// Returns `Ok(None)` when `exact_joint_supported()` is false.
    fn exact_newton_joint_psi_workspace_with_options(
        &self,
        block_states: &[ParameterBlockState],
        specs: &[ParameterBlockSpec],
        derivative_blocks: &[Vec<crate::custom_family::CustomFamilyBlockPsiDerivative>],
        options: &BlockwiseFitOptions,
    ) -> Result<Option<Arc<dyn ExactNewtonJointPsiWorkspace>>, String> {
        if !self.exact_joint_supported() {
            return Ok(None);
        }
        let workspace =
            GaussianLocationScaleWiggleExactNewtonJointPsiWorkspace::new_with_subsample(
                self.clone(),
                block_states.to_vec(),
                specs,
                derivative_blocks.to_vec(),
                options.outer_score_subsample.clone(),
            )?;
        let shared: Arc<dyn ExactNewtonJointPsiWorkspace> = Arc::new(workspace);
        Ok(Some(shared))
    }

    /// Design matrix and offset to use for one block at the current state.
    ///
    /// Any block not named `"wiggle"` returns clones of its static design and
    /// offset. The wiggle block's design is rebuilt from the current mean
    /// predictor via `self.wiggle_design` and paired with a zero offset.
    ///
    /// # Errors
    /// For the wiggle block: fails when `block_states` is empty, when the
    /// mean predictor length differs from `self.y.len()`, or when the rebuilt
    /// design's column count differs from `spec.design.ncols()`.
    fn block_geometry(
        &self,
        block_states: &[ParameterBlockState],
        spec: &ParameterBlockSpec,
    ) -> Result<(DesignMatrix, Array1<f64>), String> {
        let is_wiggle = spec.name == "wiggle";
        if !is_wiggle {
            return Ok((spec.design.clone(), spec.offset.clone()));
        }
        if block_states.is_empty() {
            return Err(GamlssError::UnsupportedConfiguration {
                reason: "Gaussian wiggle geometry requires mean block".to_string(),
            }
            .into());
        }
        let mean_eta = &block_states[Self::BLOCK_MU].eta;
        if mean_eta.len() != self.y.len() {
            return Err(GamlssError::DimensionMismatch {
                reason: "Gaussian wiggle geometry input size mismatch".to_string(),
            }
            .into());
        }
        let wiggle_x = self.wiggle_design(mean_eta.view())?;
        let built_cols = wiggle_x.ncols();
        let expected_cols = spec.design.ncols();
        if built_cols != expected_cols {
            return Err(GamlssError::DimensionMismatch {
                reason: format!(
                    "Gaussian dynamic wiggle design col mismatch: got {}, expected {}",
                    built_cols, expected_cols
                ),
            }
            .into());
        }
        let zero_offset = Array1::<f64>::zeros(wiggle_x.nrows());
        let design = DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(wiggle_x));
        Ok((design, zero_offset))
    }

    /// Always `true`: `block_geometry` rebuilds the wiggle block's design
    /// from the current mean predictor, so it is not fixed across iterations.
    fn block_geometry_is_dynamic(&self) -> bool {
        true
    }

    /// Builds the full-data joint-Hessian workspace.
    ///
    /// Returns `Ok(None)` when `exact_joint_dense_block_designs` has no dense
    /// designs for `specs`. Otherwise the workspace owns clones of the family
    /// and block states plus owned copies of both dense designs.
    fn exact_newton_joint_hessian_workspace(
        &self,
        block_states: &[ParameterBlockState],
        specs: &[ParameterBlockSpec],
    ) -> Result<Option<Arc<dyn ExactNewtonJointHessianWorkspace>>, String> {
        let (mean_design, scale_design) = match self.exact_joint_dense_block_designs(Some(specs))? {
            Some(designs) => designs,
            None => return Ok(None),
        };
        let workspace = GaussianLocationScaleWiggleHessianWorkspace::new(
            self.clone(),
            block_states.to_vec(),
            mean_design.into_owned(),
            scale_design.into_owned(),
        )?;
        let shared: Arc<dyn ExactNewtonJointHessianWorkspace> = Arc::new(workspace);
        Ok(Some(shared))
    }

    /// Outer-aware joint-Hessian workspace with an optional row subsample.
    ///
    /// With `options.outer_score_subsample == None` the workspace is built
    /// exactly as in `exact_newton_joint_hessian_workspace`. With `Some`, the
    /// precomputed per-row coefficient arrays in `pieces` (`coeff_mm`,
    /// `coeff_ml`, `coeff_ll`, `coeff_mw_b`, `coeff_mw_d`, `coeff_lw_b`,
    /// `coeff_ww`) are Horvitz–Thompson masked by `apply_outer_subsample`:
    /// each sampled row is scaled by `WeightedOuterRow.weight` (the
    /// inverse-inclusion factor 1/π_i, uniform or stratified) and every
    /// non-sampled row is zeroed. Every downstream assembly (`hessian_dense`,
    /// `hessian_matvec`, `hessian_diagonal`) consumes those arrays
    /// row-linearly via `Xᵀ diag(W) Y`, so the masked joint Hessian is an
    /// unbiased estimator of the full-data one. The `basis`/`basis_d1`
    /// matrices do not depend on the row weights and are left untouched.
    ///
    /// The Gaussian wiggle has one cross-coefficient fewer than the binomial
    /// wiggle (no `coeff_lw_d`) because the wiggle enters the Gaussian
    /// likelihood only through `q = η_μ + η_w` (no σ-chain). Inner PIRLS
    /// never installs the option, so the inner solve keeps using the exact
    /// full-data Hessian.
    fn exact_newton_joint_hessian_workspace_with_options(
        &self,
        block_states: &[ParameterBlockState],
        specs: &[ParameterBlockSpec],
        options: &BlockwiseFitOptions,
    ) -> Result<Option<Arc<dyn ExactNewtonJointHessianWorkspace>>, String> {
        let (mean_design, scale_design) = match self.exact_joint_dense_block_designs(Some(specs))? {
            Some(designs) => designs,
            None => return Ok(None),
        };
        let mut workspace = GaussianLocationScaleWiggleHessianWorkspace::new(
            self.clone(),
            block_states.to_vec(),
            mean_design.into_owned(),
            scale_design.into_owned(),
        )?;
        if let Some(subsample) = &options.outer_score_subsample {
            workspace.apply_outer_subsample(subsample.rows.as_ref());
        }
        let shared: Arc<dyn ExactNewtonJointHessianWorkspace> = Arc::new(workspace);
        Ok(Some(shared))
    }

    /// Outer-derivative policy: declare HT-subsample capability.
    ///
    /// GaussianLocationScaleWiggleFamily overrides
    /// `log_likelihood_only_with_options` and
    /// `exact_newton_joint_hessian_workspace_with_options` to consume
    /// `options.outer_score_subsample` with per-row Horvitz–Thompson weights
    /// (each sampled row's contribution is multiplied by
    /// `WeightedOuterRow.weight = 1/π_i`; non-sampled rows are zeroed),
    /// yielding unbiased estimators of the full-data log-likelihood and
    /// joint Hessian. The ψ-workspace path is also subsample-aware via
    /// `exact_newton_joint_psi_workspace_with_options`, which threads the
    /// subsample down to per-row weight masking inside the joint-ψ second-
    /// order and directional-derivative reductions. Inner-PIRLS and final-
    /// covariance paths never install the option, so they continue to
    /// consume the exact full-data quantities.
    ///
    /// NOTE(review): the doc on `exact_newton_joint_psi_workspace_with_options`
    /// states that the wiggle's `ws_psi_*_from_parts` impls currently drop the
    /// subsample and use the full-data ψ path, which conflicts with the
    /// "per-row weight masking" sentence above — verify which is current and
    /// reconcile the two comments.
    fn outer_derivative_subsample_capable(&self) -> bool {
        true
    }

    /// Whether a matrix-free Hessian-vector product is available for the
    /// inner coefficient solve: requires `exact_joint_supported()` and a
    /// successful, non-empty result from `exact_joint_dense_block_designs`.
    fn inner_coefficient_hessian_hvp_available(&self, specs: &[ParameterBlockSpec]) -> bool {
        // Gated like the joint-Hessian workspace constructors: the
        // matrix-free path needs `exact_joint_dense_block_designs` to be
        // satisfiable, i.e. both the location and the scale block designs
        // must be present. The wiggle block is folded into the operator
        // through the per-row pieces, and reaching this family already
        // implies it exists, so the predicate equals the non-wiggle one.
        if !self.exact_joint_supported() {
            return false;
        }
        let designs = self.exact_joint_dense_block_designs(Some(specs));
        matches!(designs, Ok(Some(_)))
    }
}

/// Matrix-free joint-Hessian operator for the 3-block Gaussian
/// location-scale wiggle family. See `GaussianLocationScaleWiggleHessianRowPieces`
/// for the per-row weight structure. The matvec applies
///
///   r_μ  = D_mm u_μ + D_ml u_ls + D_mw_b (B v_w) + D_mw_d (B' v_w),
///   r_ls = D_ml u_μ + D_ll u_ls + D_lw_b (B v_w),
///   r_b  = D_mw_b u_μ + D_lw_b u_ls + D_ww (B v_w),
///   r_d  = D_mw_d u_μ,
///
/// then forms `out_w = B^T r_b + (B')^T r_d`. The ls-wiggle cross block has
/// no B' contribution because the wiggle enters the Gaussian likelihood only
/// through `q = η_μ + η_w` (no σ-chain), so the Gaussian wiggle has one
/// fewer cross-coefficient than the binomial wiggle.
///
/// NOTE(review): `u_μ` / `u_ls` are not defined in this comment — presumably
/// the row-space images `X_μ v_μ` and `X_ls v_ls` of the mean and log-σ
/// coefficient blocks; confirm against the matvec implementation.
struct GaussianLocationScaleWiggleHessianWorkspace {
    /// Owned copy of the family the workspace was built for.
    family: GaussianLocationScaleWiggleFamily,
    /// Owned snapshot of the block states the row pieces were computed at.
    block_states: Vec<ParameterBlockState>,
    /// Dense mean-block design, reference-counted.
    xmu: Arc<Array2<f64>>,
    /// Dense log-σ-block design, reference-counted.
    x_ls: Arc<Array2<f64>>,
    /// Per-row pieces, computed once in `new` via
    /// `family.wiggle_hessian_row_pieces(&block_states)`.
    pieces: GaussianLocationScaleWiggleHessianRowPieces,
}

impl GaussianLocationScaleWiggleHessianWorkspace {
    fn new(
        family: GaussianLocationScaleWiggleFamily,
        block_states: Vec<ParameterBlockState>,
        xmu: Array2<f64>,
        x_ls: Array2<f64>,
    ) -> Result<Self, String> {
        let pieces = family.wiggle_hessian_row_pieces(&block_states)?;
        Ok(Self {
            family,
            block_states,
            xmu: Arc::new(xmu),
            x_ls: Arc::new(x_ls),
            pieces,
        })
    }

    /// Apply a Horvitz–Thompson outer-row subsample mask to the precomputed
    /// per-row coefficient arrays in place.
    ///
    /// Each sampled row's `coeff_*[i]` is multiplied by its
    /// `WeightedOuterRow.weight` (the HT inverse-inclusion factor 1/π_i —
    /// uniform or stratified sampling both supported). All non-sampled rows
    /// are zeroed. Because every downstream assembly (`hessian_dense`,
    /// `hessian_matvec`, `hessian_diagonal`) is row-linear in these arrays
    /// via `Xᵀ diag(W) Y`, the resulting joint-Hessian is an unbiased
    /// estimator of the full-data joint Hessian. The `basis`/`basis_d1`
    /// matrices are independent of the per-row weights and remain unchanged.
    /// The Gaussian wiggle has 7 coefficient arrays (no `coeff_lw_d`, unlike
    /// the binomial wiggle's 8) because the wiggle enters the Gaussian
    /// likelihood only through `q = η_μ + η_w` (no σ-chain).
    fn apply_outer_subsample(
        &mut self,
        rows: &[crate::families::marginal_slope_shared::WeightedOuterRow],
    ) {
        let n = self.pieces.coeff_mm.len();
        let mut mask_mm = Array1::<f64>::zeros(n);
        let mut mask_ml = Array1::<f64>::zeros(n);
        let mut mask_ll = Array1::<f64>::zeros(n);
        let mut mask_mw_b = Array1::<f64>::zeros(n);
        let mut mask_mw_d = Array1::<f64>::zeros(n);
        let mut mask_lw_b = Array1::<f64>::zeros(n);
        let mut maskww = Array1::<f64>::zeros(n);
        for r in rows {
            let i = r.index;
            let w = r.weight;
            mask_mm[i] = self.pieces.coeff_mm[i] * w;
            mask_ml[i] = self.pieces.coeff_ml[i] * w;
            mask_ll[i] = self.pieces.coeff_ll[i] * w;
            mask_mw_b[i] = self.pieces.coeff_mw_b[i] * w;
            mask_mw_d[i] = self.pieces.coeff_mw_d[i] * w;
            mask_lw_b[i] = self.pieces.coeff_lw_b[i] * w;
            maskww[i] = self.pieces.coeff_ww[i] * w;
        }
        self.pieces.coeff_mm = mask_mm;
        self.pieces.coeff_ml = mask_ml;
        self.pieces.coeff_ll = mask_ll;
        self.pieces.coeff_mw_b = mask_mw_b;
        self.pieces.coeff_mw_d = mask_mw_d;
        self.pieces.coeff_lw_b = mask_lw_b;
        self.pieces.coeff_ww = maskww;
    }
}

impl ExactNewtonJointHessianWorkspace for GaussianLocationScaleWiggleHessianWorkspace {
    fn hessian_dense(&self) -> Result<Option<Array2<f64>>, String> {
        // Structurally the same product as `hessian_matvec`, but built
        // through the existing `assemble_dense` row-pieces helper (six GEMMs:
        // h_mm, h_ml, h_mw_b, h_mw_d, h_lw, h_ww). This sidesteps the `total`
        // canonical-basis HVPs that
        // `MatrixFreeSpdOperator::materialize_dense_operator` would issue; at
        // biobank scale (n≈320k, p_total≈82) those cost ~568s per κ-iter
        // against ~1s for the dense build.
        let x_mu: &Array2<f64> = self.xmu.as_ref();
        let x_ls: &Array2<f64> = self.x_ls.as_ref();
        let assembled = self.pieces.assemble_dense(x_mu, x_ls)?;
        Ok(Some(assembled))
    }

    /// Always `true`: this workspace's `hessian_matvec` computes the product
    /// directly from the stored designs and per-row pieces.
    fn hessian_matvec_available(&self) -> bool {
        true
    }

    fn hessian_matvec(&self, v: &Array1<f64>) -> Result<Option<Array1<f64>>, String> {
        let pmu = self.xmu.ncols();
        let p_ls = self.x_ls.ncols();
        let pw = self.pieces.basis.ncols();
        let total = pmu + p_ls + pw;
        if v.len() != total {
            return Err(GamlssError::DimensionMismatch {
                reason: format!(
                    "GaussianLocationScaleWiggle matvec dimension mismatch: got {}, expected {}",
                    v.len(),
                    total
                ),
            }
            .into());
        }
        let v_mu = v.slice(s![0..pmu]);
        let v_ls = v.slice(s![pmu..pmu + p_ls]);
        let v_w = v.slice(s![pmu + p_ls..total]);

        let u_mu = fast_av(self.xmu.as_ref(), &v_mu);
        let u_ls = fast_av(self.x_ls.as_ref(), &v_ls);
        let u_b = fast_av(&self.pieces.basis, &v_w);
        let u_d = fast_av(&self.pieces.basis_d1, &v_w);

        let r_mu = &self.pieces.coeff_mm * &u_mu
            + &self.pieces.coeff_ml * &u_ls
            + &self.pieces.coeff_mw_b * &u_b
            + &self.pieces.coeff_mw_d * &u_d;
        let r_ls = &self.pieces.coeff_ml * &u_mu
            + &self.pieces.coeff_ll * &u_ls
            + &self.pieces.coeff_lw_b * &u_b;
        let r_b = &self.pieces.coeff_mw_b * &u_mu
            + &self.pieces.coeff_lw_b * &u_ls
            + &self.pieces.coeff_ww * &u_b;
        let r_d = &self.pieces.coeff_mw_d * &u_mu;

        let out_mu = fast_atv(self.xmu.as_ref(), &r_mu);
        let out_ls = fast_atv(self.x_ls.as_ref(), &r_ls);
        let out_w = fast_atv(&self.pieces.basis, &r_b) + &fast_atv(&self.pieces.basis_d1, &r_d);

        let mut out = Array1::<f64>::zeros(total);
        out.slice_mut(s![0..pmu]).assign(&out_mu);
        out.slice_mut(s![pmu..pmu + p_ls]).assign(&out_ls);
        out.slice_mut(s![pmu + p_ls..total]).assign(&out_w);
        Ok(Some(out))
    }

    fn hessian_diagonal(&self) -> Result<Option<Array1<f64>>, String> {
        let pmu = self.xmu.ncols();
        let p_ls = self.x_ls.ncols();
        let pw = self.pieces.basis.ncols();
        let total = pmu + p_ls + pw;
        // Diagonals are independent column-wise reductions: parallelize.
        use rayon::iter::{IntoParallelIterator, ParallelIterator};
        let diag_mu: Vec<f64> = (0..pmu)
            .into_par_iter()
            .map(|j| {
                let col = self.xmu.column(j);
                col.iter()
                    .zip(self.pieces.coeff_mm.iter())
                    .map(|(&v, &c)| c * v * v)
                    .sum()
            })
            .collect();
        let diag_ls: Vec<f64> = (0..p_ls)
            .into_par_iter()
            .map(|j| {
                let col = self.x_ls.column(j);
                col.iter()
                    .zip(self.pieces.coeff_ll.iter())
                    .map(|(&v, &c)| c * v * v)
                    .sum()
            })
            .collect();
        let diag_w: Vec<f64> = (0..pw)
            .into_par_iter()
            .map(|j| {
                let col = self.pieces.basis.column(j);
                col.iter()
                    .zip(self.pieces.coeff_ww.iter())
                    .map(|(&v, &c)| c * v * v)
                    .sum()
            })
            .collect();
        let mut diag = Array1::<f64>::zeros(total);
        for (j, v) in diag_mu.into_iter().enumerate() {
            diag[j] = v;
        }
        for (j, v) in diag_ls.into_iter().enumerate() {
            diag[pmu + j] = v;
        }
        for (j, v) in diag_w.into_iter().enumerate() {
            diag[pmu + p_ls + j] = v;
        }
        Ok(Some(diag))
    }

    /// Dense first directional derivative of the joint Hessian along the
    /// flat coefficient direction `d_beta_flat`.
    ///
    /// Pure delegation: the stored block states and the two designs are
    /// forwarded to the family's
    /// `exact_newton_joint_hessian_directional_derivative_from_designs`, and
    /// its result (including any error) is returned unchanged.
    fn directional_derivative(
        &self,
        d_beta_flat: &Array1<f64>,
    ) -> Result<Option<Array2<f64>>, String> {
        self.family
            .exact_newton_joint_hessian_directional_derivative_from_designs(
                &self.block_states,
                self.xmu.as_ref(),
                self.x_ls.as_ref(),
                d_beta_flat,
            )
    }

    /// Operator form of the first directional derivative of the joint
    /// Hessian along `d_beta_flat`.
    ///
    /// Delegates to the family's `gls_wiggle_directional_operator`; the
    /// designs are shared with it by bumping the `Arc` reference counts, not
    /// by copying the matrices.
    fn directional_derivative_operator(
        &self,
        d_beta_flat: &Array1<f64>,
    ) -> Result<Option<Arc<dyn crate::solver::estimate::reml::unified::HyperOperator>>, String>
    {
        let shared_xmu = Arc::clone(&self.xmu);
        let shared_x_ls = Arc::clone(&self.x_ls);
        self.family.gls_wiggle_directional_operator(
            &self.block_states,
            shared_xmu,
            shared_x_ls,
            d_beta_flat,
        )
    }

    /// Dense second directional derivative of the joint Hessian along the
    /// pair of flat coefficient directions `d_beta_u_flat` and
    /// `d_beta_v_flat`.
    ///
    /// Pure delegation: the stored block states and the two designs are
    /// forwarded to the family's
    /// `exact_newton_joint_hessiansecond_directional_derivative_from_designs`,
    /// and its result (including any error) is returned unchanged.
    fn second_directional_derivative(
        &self,
        d_beta_u_flat: &Array1<f64>,
        d_beta_v_flat: &Array1<f64>,
    ) -> Result<Option<Array2<f64>>, String> {
        self.family
            .exact_newton_joint_hessiansecond_directional_derivative_from_designs(
                &self.block_states,
                self.xmu.as_ref(),
                self.x_ls.as_ref(),
                d_beta_u_flat,
                d_beta_v_flat,
            )
    }

    /// Operator form of the second directional derivative of the joint
    /// Hessian along the direction pair (`d_beta_u`, `d_beta_v`).
    ///
    /// Delegates to the family's `gls_wiggle_second_directional_operator`;
    /// the designs are shared with it by bumping the `Arc` reference counts,
    /// not by copying the matrices.
    fn second_directional_derivative_operator(
        &self,
        d_beta_u: &Array1<f64>,
        d_beta_v: &Array1<f64>,
    ) -> Result<Option<Arc<dyn crate::solver::estimate::reml::unified::HyperOperator>>, String>
    {
        let shared_xmu = Arc::clone(&self.xmu);
        let shared_x_ls = Arc::clone(&self.x_ls);
        self.family.gls_wiggle_second_directional_operator(
            &self.block_states,
            shared_xmu,
            shared_x_ls,
            d_beta_u,
            d_beta_v,
        )
    }
}

impl CustomFamilyGenerative for GaussianLocationScaleWiggleFamily {
    /// Describe the fitted model as a generative Gaussian.
    ///
    /// The row-wise mean is the sum of the μ and wiggle linear predictors;
    /// the row-wise σ comes from passing the log-σ predictor through
    /// `logb_sigma_from_eta_scalar`.
    ///
    /// # Errors
    /// `DimensionMismatch` unless exactly three block states are supplied.
    fn generativespec(
        &self,
        block_states: &[ParameterBlockState],
    ) -> Result<GenerativeSpec, String> {
        let n_blocks = block_states.len();
        if n_blocks != 3 {
            return Err(GamlssError::DimensionMismatch {
                reason: format!(
                    "GaussianLocationScaleWiggleFamily expects 3 blocks, got {}",
                    n_blocks
                ),
            }
            .into());
        }

        let location = &block_states[Self::BLOCK_MU].eta;
        let wiggle = &block_states[Self::BLOCK_WIGGLE].eta;
        let log_sigma = &block_states[Self::BLOCK_LOG_SIGMA].eta;
        // The μ predictor fixes the row count for both outputs.
        let n_rows = location.len();

        Ok(GenerativeSpec {
            mean: gamlss_rowwise_map(n_rows, |row| location[row] + wiggle[row]),
            noise: NoiseModel::Gaussian {
                sigma: gamlss_rowwise_map(n_rows, |row| {
                    logb_sigma_from_eta_scalar(log_sigma[row])
                }),
            },
        })
    }
}

/// Return the sole block state of a single-block family.
///
/// # Errors
/// `DimensionMismatch`, naming `family_name` and the actual count, when
/// `block_states` does not contain exactly one element.
fn expect_single_block<'a>(
    block_states: &'a [ParameterBlockState],
    family_name: &str,
) -> Result<&'a ParameterBlockState, String> {
    match block_states {
        [only] => Ok(only),
        other => Err(GamlssError::DimensionMismatch {
            reason: format!("{family_name} expects 1 block, got {}", other.len()),
        }
        .into()),
    }
}

/// Two-block binomial mean family with a link "wiggle": block 0 carries the
/// base linear predictor, block 1 the wiggle coefficients (see `BLOCK_ETA`
/// and `BLOCK_WIGGLE`).
#[derive(Clone)]
pub struct BinomialMeanWiggleFamily {
    /// Per-row response passed to the binomial negative-log-likelihood
    /// derivative dispatch; its length defines the row count `n`.
    pub y: Array1<f64>,
    /// Per-row weights passed alongside `y`; must have the same length.
    pub weights: Array1<f64>,
    /// Inverse link used to evaluate the mean and its derivatives at `q`.
    pub link_kind: InverseLink,
    /// Knot vector handed to `monotone_wiggle_basis_with_derivative_order`.
    pub wiggle_knots: Array1<f64>,
    /// Spline degree handed to `monotone_wiggle_basis_with_derivative_order`.
    pub wiggle_degree: usize,
    /// Resource policy threaded into PsiDesignMap construction during
    /// exact-Newton joint psi evaluation. Defaults to
    /// `ResourcePolicy::default_library()` when the family is built without
    /// an explicit policy.
    pub policy: crate::resource::ResourcePolicy,
}

/// Wiggle basis matrices and the derivatives of the warped predictor `q`
/// with respect to the base predictor `q0`, all evaluated at one `q0`
/// vector and one wiggle coefficient vector (see `wiggle_geometry`).
struct BinomialMeanWiggleGeometry {
    /// Wiggle basis evaluated at `q0` (derivative order 0).
    basis: Array2<f64>,
    /// First derivative of the basis with respect to `q0`.
    basis_d1: Array2<f64>,
    /// Second derivative of the basis with respect to `q0`.
    basis_d2: Array2<f64>,
    /// Third derivative of the basis with respect to `q0`.
    basis_d3: Array2<f64>,
    /// Row-wise `dq/dq0 = 1 + basis_d1 · β_w`.
    dq_dq0: Array1<f64>,
    /// Row-wise `d²q/dq0² = basis_d2 · β_w`.
    d2q_dq02: Array1<f64>,
    /// Row-wise `d³q/dq0³ = basis_d3 · β_w`.
    d3q_dq03: Array1<f64>,
    /// Row-wise `d⁴q/dq0⁴`, the fourth-derivative basis times `β_w`.
    d4q_dq04: Array1<f64>,
}

/// Result of resolving one ψ hyper-parameter direction for the eta block
/// (see `exact_newton_joint_psi_direction`).
struct BinomialMeanWiggleJointPsiDirection {
    /// Dense ψ-derivative of the eta design, when it was materialized.
    x_eta_psi: Option<Array2<f64>>,
    /// That design derivative applied to the eta coefficients
    /// (`x_eta_psi · β_η`), one entry per row.
    z_eta_psi: Array1<f64>,
}

/// Concatenate the eta-block and wiggle-block score vectors into one joint
/// score laid out as `[eta | wiggle]`.
fn binomial_pack_mean_wiggle_joint_score(
    score_eta: &Array1<f64>,
    score_w: &Array1<f64>,
) -> Array1<f64> {
    Array1::from_iter(score_eta.iter().chain(score_w.iter()).copied())
}

/// Assemble the symmetric joint Hessian for the `[eta | wiggle]` layout from
/// its three distinct blocks.
///
/// Only the upper block triangle is written (`h_eta_eta`, `h_eta_w`,
/// `h_ww`); `mirror_upper_to_lower` then fills in the rest.
fn binomial_pack_mean_wiggle_joint_symmetrichessian(
    h_eta_eta: &Array2<f64>,
    h_eta_w: &Array2<f64>,
    h_ww: &Array2<f64>,
) -> Array2<f64> {
    // `split` is where the wiggle segment begins.
    let split = h_eta_eta.nrows();
    let dim = split + h_ww.nrows();
    let mut packed = Array2::<f64>::zeros((dim, dim));
    packed.slice_mut(s![..split, ..split]).assign(h_eta_eta);
    packed.slice_mut(s![..split, split..]).assign(h_eta_w);
    packed.slice_mut(s![split.., split..]).assign(h_ww);
    mirror_upper_to_lower(&mut packed);
    packed
}

impl BinomialMeanWiggleFamily {
    /// Index of the base linear-predictor (eta) block in block/spec slices.
    pub const BLOCK_ETA: usize = 0;
    /// Index of the link-wiggle coefficient block in block/spec slices.
    pub const BLOCK_WIGGLE: usize = 1;

    /// Evaluate the monotone wiggle basis at `q0` using this family's knots
    /// and degree, at the derivative order requested by `options`.
    ///
    /// Only `options.derivative_order` is read.
    fn wiggle_basiswith_options(
        &self,
        q0: ArrayView1<'_, f64>,
        options: BasisOptions,
    ) -> Result<Array2<f64>, String> {
        let derivative_order = options.derivative_order;
        monotone_wiggle_basis_with_derivative_order(
            q0,
            &self.wiggle_knots,
            self.wiggle_degree,
            derivative_order,
        )
    }

    /// Wiggle basis at `q0` evaluated with `BasisOptions::value()`, i.e. the
    /// design matrix of the wiggle block.
    fn wiggle_design(&self, q0: ArrayView1<'_, f64>) -> Result<Array2<f64>, String> {
        self.wiggle_basiswith_options(q0, BasisOptions::value())
    }

    /// Row-wise `dq/dq0 = 1 + B'(q0) · β_w`.
    ///
    /// # Errors
    /// Propagates basis-evaluation failures and reports a
    /// `DimensionMismatch` when the derivative basis and `beta_link_wiggle`
    /// disagree on the number of coefficients.
    fn wiggle_dq_dq0(
        &self,
        q0: ArrayView1<'_, f64>,
        beta_link_wiggle: ArrayView1<'_, f64>,
    ) -> Result<Array1<f64>, String> {
        let first_deriv_basis =
            self.wiggle_basiswith_options(q0, BasisOptions::first_derivative())?;
        let (n_cols, n_coeff) = (first_deriv_basis.ncols(), beta_link_wiggle.len());
        if n_cols == n_coeff {
            // The identity part of q contributes the constant 1.
            return Ok(first_deriv_basis.dot(&beta_link_wiggle) + 1.0);
        }
        Err(GamlssError::DimensionMismatch { reason: format!(
            "wiggle derivative/beta mismatch: basis has {} columns but beta_link_wiggle has {} coefficients",
            n_cols,
            n_coeff
        ) }.into())
    }

    /// Row-wise `d²q/dq0² = B''(q0) · β_w`.
    ///
    /// # Errors
    /// Propagates basis-evaluation failures and reports a
    /// `DimensionMismatch` when the second-derivative basis and
    /// `beta_link_wiggle` disagree on the number of coefficients.
    fn wiggle_d2q_dq02(
        &self,
        q0: ArrayView1<'_, f64>,
        beta_link_wiggle: ArrayView1<'_, f64>,
    ) -> Result<Array1<f64>, String> {
        let second_deriv_basis =
            self.wiggle_basiswith_options(q0, BasisOptions::second_derivative())?;
        let (n_cols, n_coeff) = (second_deriv_basis.ncols(), beta_link_wiggle.len());
        if n_cols == n_coeff {
            return Ok(second_deriv_basis.dot(&beta_link_wiggle));
        }
        Err(GamlssError::DimensionMismatch { reason: format!(
            "wiggle second-derivative/beta mismatch: basis has {} columns but beta_link_wiggle has {} coefficients",
            n_cols,
            n_coeff
        ) }.into())
    }

    /// Third-derivative (order 3) monotone wiggle basis at `q0`, built from
    /// this family's knots and degree.
    fn wiggle_d3basis_constrained(&self, q0: ArrayView1<'_, f64>) -> Result<Array2<f64>, String> {
        monotone_wiggle_basis_with_derivative_order(q0, &self.wiggle_knots, self.wiggle_degree, 3)
    }

    /// Row-wise `d³q/dq0³ = B'''(q0) · β_w`.
    ///
    /// # Errors
    /// Propagates basis-evaluation failures and reports a
    /// `DimensionMismatch` when the third-derivative basis and
    /// `beta_link_wiggle` disagree on the number of coefficients.
    fn wiggle_d3q_dq03(
        &self,
        q0: ArrayView1<'_, f64>,
        beta_link_wiggle: ArrayView1<'_, f64>,
    ) -> Result<Array1<f64>, String> {
        let third_deriv_basis = self.wiggle_d3basis_constrained(q0)?;
        let (n_cols, n_coeff) = (third_deriv_basis.ncols(), beta_link_wiggle.len());
        if n_cols == n_coeff {
            return Ok(third_deriv_basis.dot(&beta_link_wiggle));
        }
        Err(GamlssError::DimensionMismatch { reason: format!(
            "wiggle third-derivative/beta mismatch: basis has {} columns but beta_link_wiggle has {} coefficients",
            n_cols,
            n_coeff
        ) }.into())
    }

    /// Row-wise `d⁴q/dq0⁴`: the order-4 wiggle basis at `q0` times `β_w`.
    ///
    /// # Errors
    /// Propagates basis-evaluation failures and reports a
    /// `DimensionMismatch` when the fourth-derivative basis and
    /// `beta_link_wiggle` disagree on the number of coefficients.
    fn wiggle_d4q_dq04(
        &self,
        q0: ArrayView1<'_, f64>,
        beta_link_wiggle: ArrayView1<'_, f64>,
    ) -> Result<Array1<f64>, String> {
        let fourth_deriv_basis = monotone_wiggle_basis_with_derivative_order(
            q0,
            &self.wiggle_knots,
            self.wiggle_degree,
            4,
        )?;
        let (n_cols, n_coeff) = (fourth_deriv_basis.ncols(), beta_link_wiggle.len());
        if n_cols == n_coeff {
            return Ok(fourth_deriv_basis.dot(&beta_link_wiggle));
        }
        Err(GamlssError::DimensionMismatch { reason: format!(
            "wiggle fourth-derivative/beta mismatch: basis has {} columns but beta_link_wiggle has {} coefficients",
            n_cols,
            n_coeff
        ) }.into())
    }

    fn wiggle_geometry(
        &self,
        q0: ArrayView1<'_, f64>,
        beta_link_wiggle: ArrayView1<'_, f64>,
    ) -> Result<BinomialMeanWiggleGeometry, String> {
        let basis = self.wiggle_design(q0)?;
        let basis_d1 = self.wiggle_basiswith_options(q0, BasisOptions::first_derivative())?;
        let basis_d2 = self.wiggle_basiswith_options(q0, BasisOptions::second_derivative())?;
        let basis_d3 = self.wiggle_d3basis_constrained(q0)?;
        let dq_dq0 = self.wiggle_dq_dq0(q0, beta_link_wiggle)?;
        let d2q_dq02 = self.wiggle_d2q_dq02(q0, beta_link_wiggle)?;
        let d3q_dq03 = self.wiggle_d3q_dq03(q0, beta_link_wiggle)?;
        let d4q_dq04 = self.wiggle_d4q_dq04(q0, beta_link_wiggle)?;
        Ok(BinomialMeanWiggleGeometry {
            basis,
            basis_d1,
            basis_d2,
            basis_d3,
            dq_dq0,
            d2q_dq02,
            d3q_dq03,
            d4q_dq04,
        })
    }

    /// First three `q`-derivatives of one row's weighted binomial negative
    /// log-likelihood, returned as a triple in increasing derivative order.
    ///
    /// # Errors
    /// Fails when the inverse link cannot be evaluated at `q`.
    fn neglog_q_derivatives(&self, y: f64, weight: f64, q: f64) -> Result<(f64, f64, f64), String> {
        let link = &self.link_kind;
        let jet = match inverse_link_jet_for_inverse_link(link, q) {
            Ok(jet) => jet,
            Err(e) => {
                return Err(format!(
                    "fixed-link wiggle inverse-link evaluation failed: {e}"
                ));
            }
        };
        // μ is handed over RAW: the dispatch yields the exact q-derivatives
        // of the evaluated loss for every representable μ in (0,1) and deals
        // with the saturated boundary on its own. See
        // binomial_location_scalerow (#948).
        let derivatives = binomial_neglog_q_derivatives_dispatch(
            y, weight, q, jet.mu, jet.d1, jet.d2, jet.d3, link,
        );
        Ok(derivatives)
    }

    /// Fourth `q`-derivative of one row's weighted binomial negative
    /// log-likelihood.
    ///
    /// # Errors
    /// Fails when the inverse link cannot be evaluated at `q`, or when the
    /// fourth-derivative dispatch itself reports an error.
    fn neglog_q_fourth_derivative(&self, y: f64, weight: f64, q: f64) -> Result<f64, String> {
        let link = &self.link_kind;
        let jet = match inverse_link_jet_for_inverse_link(link, q) {
            Ok(jet) => jet,
            Err(e) => {
                return Err(format!(
                    "fixed-link wiggle inverse-link evaluation failed: {e}"
                ));
            }
        };
        // μ is handed over RAW, for the same reason as in
        // `neglog_q_derivatives` (#948).
        binomial_neglog_q_fourth_derivative_dispatch(
            y, weight, q, jet.mu, jet.d1, jet.d2, jet.d3, link,
        )
    }

    /// Dense view of the eta block's design matrix.
    ///
    /// Borrows the matrix when the spec already stores it densely; otherwise
    /// materializes a dense copy under this family's material policy.
    ///
    /// # Errors
    /// `DimensionMismatch` unless exactly two specs are supplied; dense
    /// materialization failures are returned as their string form.
    fn dense_eta_design_fromspecs<'a>(
        &self,
        specs: &'a [ParameterBlockSpec],
    ) -> Result<Cow<'a, Array2<f64>>, String> {
        let n_specs = specs.len();
        if n_specs != 2 {
            return Err(GamlssError::DimensionMismatch {
                reason: format!(
                    "BinomialMeanWiggleFamily expects 2 specs, got {}",
                    n_specs
                ),
            }
            .into());
        }

        let eta_design = &specs[Self::BLOCK_ETA].design;
        // Fast path: the design is already dense, so no copy is needed.
        if let Some(dense) = eta_design.as_dense_ref() {
            return Ok(Cow::Borrowed(dense));
        }

        let material_policy = self.policy.material_policy();
        let materialized = eta_design
            .try_to_dense_with_policy(
                &material_policy,
                "BinomialMeanWiggle dense_eta_design_fromspecs eta",
            )
            .map_err(|e| e.to_string())?;
        Ok(Cow::Owned(materialized.as_ref().clone()))
    }

    /// Resolve the `psi_index`-th ψ derivative (counted across all blocks in
    /// order) into a dense eta-design derivative and its action on `β_η`.
    ///
    /// Returns `Ok(None)` when `psi_index` is past the last derivative or
    /// when the selected derivative does not belong to the eta block.
    ///
    /// # Errors
    /// `DimensionMismatch` unless there are exactly two block states and two
    /// derivative lists; failures while resolving or materializing the ψ map
    /// are propagated.
    fn exact_newton_joint_psi_direction(
        &self,
        block_states: &[ParameterBlockState],
        derivative_blocks: &[Vec<CustomFamilyBlockPsiDerivative>],
        psi_index: usize,
        x_eta: &Array2<f64>,
    ) -> Result<Option<BinomialMeanWiggleJointPsiDirection>, String> {
        if (block_states.len(), derivative_blocks.len()) != (2, 2) {
            return Err(GamlssError::DimensionMismatch { reason: format!(
                "BinomialMeanWiggleFamily joint psi direction expects 2 blocks and 2 derivative block lists, got {} and {}",
                block_states.len(),
                derivative_blocks.len()
            ) }.into());
        }

        // Walk every derivative in block order and pick the one at the
        // requested global position, remembering which block owns it.
        let located = derivative_blocks
            .iter()
            .enumerate()
            .flat_map(|(block_idx, derivs)| derivs.iter().map(move |deriv| (block_idx, deriv)))
            .nth(psi_index);
        let (block_idx, deriv) = match located {
            Some(hit) => hit,
            None => return Ok(None),
        };
        if block_idx != Self::BLOCK_ETA {
            return Ok(None);
        }

        let n = self.y.len();
        let x_eta_psi_map = resolve_custom_family_x_psi_map(
            deriv,
            n,
            x_eta.ncols(),
            0..n,
            "BinomialMeanWiggleFamily eta",
            &self.policy,
        )?;
        let x_eta_psi = x_eta_psi_map.row_chunk(0..n)?;
        let z_eta_psi = x_eta_psi.dot(&block_states[Self::BLOCK_ETA].beta);
        Ok(Some(BinomialMeanWiggleJointPsiDirection {
            x_eta_psi: Some(x_eta_psi),
            z_eta_psi,
        }))
    }

    /// Operator counterpart of `exact_newton_joint_psi_direction`: resolve
    /// the `psi_index`-th ψ derivative (counted across all blocks in order)
    /// into a design action and that action applied to `β_η`.
    ///
    /// Returns `Ok(None)` when `psi_index` is past the last derivative, when
    /// the selected derivative does not belong to the eta block, or when no
    /// design action can be built from it.
    ///
    /// # Errors
    /// `DimensionMismatch` unless there are exactly two block states and two
    /// derivative lists.
    fn exact_newton_joint_psi_action(
        &self,
        block_states: &[ParameterBlockState],
        derivative_blocks: &[Vec<CustomFamilyBlockPsiDerivative>],
        psi_index: usize,
        p_eta: usize,
    ) -> Result<Option<(CustomFamilyPsiDesignAction, Array1<f64>)>, String> {
        if (block_states.len(), derivative_blocks.len()) != (2, 2) {
            return Err(GamlssError::DimensionMismatch { reason: format!(
                "BinomialMeanWiggleFamily joint psi action expects 2 blocks and 2 derivative block lists, got {} and {}",
                block_states.len(),
                derivative_blocks.len()
            ) }.into());
        }

        // Walk every derivative in block order and pick the one at the
        // requested global position, remembering which block owns it.
        let located = derivative_blocks
            .iter()
            .enumerate()
            .flat_map(|(block_idx, derivs)| derivs.iter().map(move |deriv| (block_idx, deriv)))
            .nth(psi_index);
        let (block_idx, deriv) = match located {
            Some(hit) => hit,
            None => return Ok(None),
        };
        if block_idx != Self::BLOCK_ETA {
            return Ok(None);
        }

        let n = self.y.len();
        // A derivative with no action form is reported as "not available"
        // instead of as an error.
        let action = match CustomFamilyPsiDesignAction::from_first_derivative(
            deriv,
            n,
            p_eta,
            0..n,
            "BinomialMeanWiggleFamily eta",
        ) {
            Ok(action) => action,
            Err(_) => return Ok(None),
        };
        let z_eta_psi = action.forward_mul(block_states[Self::BLOCK_ETA].beta.view());
        Ok(Some((action, z_eta_psi)))
    }

    /// Build the row-coefficient operator for the joint `[eta | wiggle]`
    /// Hessian at the current block states.
    ///
    /// With `q = η + η_w`, `m1`/`m2` the first two `q`-derivatives of the
    /// row loss, `a = dq/dq0` and `b = d²q/dq0²`, the per-row coefficients
    /// are:
    /// - (X, X):  `hessian_coeff_fromobjective_q_terms(m1, m2, a, a, b)`
    /// - (X, B):  `m2 · a`
    /// - (X, B'): `m1`
    /// - (B, B):  `m2`
    ///
    /// # Errors
    /// `DimensionMismatch` for a wrong block count or inconsistent row
    /// counts; geometry and likelihood-derivative failures are propagated.
    fn bmw_static_hessian_operator(
        &self,
        block_states: &[ParameterBlockState],
        x_eta_arc: Arc<Array2<f64>>,
    ) -> Result<Arc<RowCoeffOperator>, String> {
        let n_blocks = block_states.len();
        if n_blocks != 2 {
            return Err(GamlssError::DimensionMismatch {
                reason: format!(
                    "BinomialMeanWiggleFamily expects 2 blocks, got {}",
                    n_blocks
                ),
            }
            .into());
        }
        let base_state = &block_states[Self::BLOCK_ETA];
        let wiggle_state = &block_states[Self::BLOCK_WIGGLE];
        let n_rows = self.y.len();
        let sizes_agree = base_state.eta.len() == n_rows
            && wiggle_state.eta.len() == n_rows
            && self.weights.len() == n_rows;
        if !sizes_agree {
            return Err(GamlssError::DimensionMismatch {
                reason: "BinomialMeanWiggleFamily input size mismatch".to_string(),
            }
            .into());
        }

        let geom = self.wiggle_geometry(base_state.eta.view(), wiggle_state.beta.view())?;
        let block_dims = vec![x_eta_arc.ncols(), geom.basis.ncols()];

        let mut w_eta_eta = Array1::<f64>::zeros(n_rows);
        let mut w_eta_basis = Array1::<f64>::zeros(n_rows);
        let mut w_eta_basis_d1 = Array1::<f64>::zeros(n_rows);
        let mut w_basis_basis = Array1::<f64>::zeros(n_rows);
        for i in 0..n_rows {
            let q = base_state.eta[i] + wiggle_state.eta[i];
            let (m1, m2, _) = self.neglog_q_derivatives(self.y[i], self.weights[i], q)?;
            let slope = geom.dq_dq0[i];
            let curvature = geom.d2q_dq02[i];
            w_eta_eta[i] = hessian_coeff_fromobjective_q_terms(m1, m2, slope, slope, curvature);
            w_eta_basis[i] = m2 * slope;
            w_eta_basis_d1[i] = m1;
            w_basis_basis[i] = m2;
        }

        // Direction slots: 0 = X_eta (block 0), 1 = B and 2 = B' (block 1).
        let directions = vec![
            (0, x_eta_arc),
            (1, Arc::new(geom.basis)),
            (1, Arc::new(geom.basis_d1)),
        ];
        // Each entry pairs two direction slots with their row coefficients.
        let pair_coefficients = vec![
            (0, 0, w_eta_eta),
            (0, 1, w_eta_basis),
            (0, 2, w_eta_basis_d1),
            (1, 1, w_basis_basis),
        ];
        Ok(Arc::new(RowCoeffOperator::from_directions(
            block_dims,
            directions,
            pair_coefficients,
            n_rows,
        )))
    }

    /// Build the row-coefficient operator for the first directional
    /// derivative of the joint `[eta | wiggle]` Hessian along `d_beta_flat`.
    ///
    /// `d_beta_flat` is split into an eta part (first `p_eta` entries) and a
    /// wiggle part (the remaining `pw` entries). The operator is expressed
    /// over four direction slots — `X_eta`, `B`, `B'`, `B''` — because
    /// differentiating through the `q0`-dependent basis raises its
    /// derivative order by one.
    ///
    /// # Errors
    /// `DimensionMismatch` for a wrong block count, inconsistent row counts
    /// or a wrong-length `d_beta_flat`; geometry and likelihood-derivative
    /// failures are propagated. On success the result is always `Some`.
    fn bmw_directional_operator(
        &self,
        block_states: &[ParameterBlockState],
        x_eta_arc: Arc<Array2<f64>>,
        d_beta_flat: &Array1<f64>,
    ) -> Result<Option<Arc<dyn crate::solver::estimate::reml::unified::HyperOperator>>, String>
    {
        if block_states.len() != 2 {
            return Err(GamlssError::DimensionMismatch {
                reason: format!(
                    "BinomialMeanWiggleFamily expects 2 blocks, got {}",
                    block_states.len()
                ),
            }
            .into());
        }
        let eta = &block_states[Self::BLOCK_ETA].eta;
        let etaw = &block_states[Self::BLOCK_WIGGLE].eta;
        let betaw = &block_states[Self::BLOCK_WIGGLE].beta;
        let n = self.y.len();
        if eta.len() != n || etaw.len() != n || self.weights.len() != n {
            return Err(GamlssError::DimensionMismatch {
                reason: "BinomialMeanWiggleFamily input size mismatch".to_string(),
            }
            .into());
        }
        let geom = self.wiggle_geometry(eta.view(), betaw.view())?;
        let p_eta = x_eta_arc.ncols();
        let pw = geom.basis.ncols();
        let total = p_eta + pw;
        if d_beta_flat.len() != total {
            return Err(GamlssError::DimensionMismatch {
                reason: format!(
                    "BinomialMeanWiggleFamily joint d_beta length mismatch: got {}, expected {}",
                    d_beta_flat.len(),
                    total
                ),
            }
            .into());
        }
        // Row-space images of the direction: xi = X_eta u_eta, phi = B u_w,
        // and the first/second derivative bases applied to u_w.
        let u_eta = d_beta_flat.slice(s![0..p_eta]).to_owned();
        let uw = d_beta_flat.slice(s![p_eta..total]).to_owned();
        let xi = fast_av(x_eta_arc.as_ref(), &u_eta);
        let phi = fast_av(&geom.basis, &uw);
        let basis1_u = fast_av(&geom.basis_d1, &uw);
        let basis2_u = fast_av(&geom.basis_d2, &uw);

        let mut coeff_eta = Array1::<f64>::zeros(n);
        let mut coeff_etaw_b = Array1::<f64>::zeros(n);
        let mut coeff_etaw_d1 = Array1::<f64>::zeros(n);
        let mut coeff_etaw_d2 = Array1::<f64>::zeros(n);
        let mut coeff_ww_bb = Array1::<f64>::zeros(n);
        let mut coeff_ww_db = Array1::<f64>::zeros(n);
        for row in 0..n {
            let q = eta[row] + etaw[row];
            let (m1, m2, m3) = self.neglog_q_derivatives(self.y[row], self.weights[row], q)?;
            // a, b, c: first, second and third derivative of q w.r.t. q0.
            let a = geom.dq_dq0[row];
            let b = geom.d2q_dq02[row];
            let c = geom.d3q_dq03[row];
            // Directional derivatives of q, a and b along the direction:
            // each has a chain-rule part through q0 (times xi) plus the
            // direct dependence on the wiggle coefficients.
            let q_u = a * xi[row] + phi[row];
            let a_u = b * xi[row] + basis1_u[row];
            let b_u = c * xi[row] + basis2_u[row];
            coeff_eta[row] = directionalhessian_coeff_fromobjective_q_terms(
                m1, m2, m3, q_u, a, a, b, a_u, a_u, b_u,
            );
            // Eta–wiggle cross block. The static coefficients are m2·a on
            // (X, B) and m1 on (X, B'); differentiating both the
            // coefficient and the basis gives:
            //   (X, B):   d(m2·a)                = m3·q_u·a + m2·a_u
            //   (X, B'):  m2·a·xi + d(m1)        = m2·(a·xi + q_u)
            //   (X, B''): m1·xi
            coeff_etaw_b[row] = m3 * q_u * a + m2 * a_u;
            coeff_etaw_d1[row] = m2 * (a * xi[row] + q_u);
            coeff_etaw_d2[row] = m1 * xi[row];
            // Wiggle–wiggle block, static coefficient m2 on (B, B):
            //   (B, B):  d(m2) = m3·q_u
            //   (B, B'): m2·xi, from differentiating the basis itself.
            coeff_ww_bb[row] = m3 * q_u;
            coeff_ww_db[row] = m2 * xi[row];
        }
        // Direction slots: 0 = X_eta (block 0); 1 = B, 2 = B', 3 = B''
        // (all block 1). Each coefficient triple names a pair of slots.
        Ok(Some(Arc::new(RowCoeffOperator::from_directions(
            vec![p_eta, pw],
            vec![
                (0, x_eta_arc),
                (1, Arc::new(geom.basis)),
                (1, Arc::new(geom.basis_d1)),
                (1, Arc::new(geom.basis_d2)),
            ],
            vec![
                (0, 0, coeff_eta),
                (0, 1, coeff_etaw_b),
                (0, 2, coeff_etaw_d1),
                (0, 3, coeff_etaw_d2),
                (1, 1, coeff_ww_bb),
                (1, 2, coeff_ww_db),
            ],
            n,
        ))))
    }

    /// Build the row-coefficient operator for the second directional
    /// derivative of the joint `[eta | wiggle]` Hessian along the direction
    /// pair (`d_beta_u_flat`, `d_beta_v_flat`).
    ///
    /// Both directions are split into an eta part (first `p_eta` entries)
    /// and a wiggle part (the remaining `pw` entries). The operator uses
    /// five direction slots — `X_eta`, `B`, `B'`, `B''`, `B'''` — since two
    /// differentiations through the `q0`-dependent basis raise its
    /// derivative order by up to two.
    ///
    /// # Errors
    /// `DimensionMismatch` for a wrong block count, inconsistent row counts
    /// or a wrong-length direction; geometry and likelihood-derivative
    /// failures are propagated. On success the result is always `Some`.
    fn bmw_second_directional_operator(
        &self,
        block_states: &[ParameterBlockState],
        x_eta_arc: Arc<Array2<f64>>,
        d_beta_u_flat: &Array1<f64>,
        d_beta_v_flat: &Array1<f64>,
    ) -> Result<Option<Arc<dyn crate::solver::estimate::reml::unified::HyperOperator>>, String>
    {
        if block_states.len() != 2 {
            return Err(GamlssError::DimensionMismatch {
                reason: format!(
                    "BinomialMeanWiggleFamily expects 2 blocks, got {}",
                    block_states.len()
                ),
            }
            .into());
        }
        let eta = &block_states[Self::BLOCK_ETA].eta;
        let etaw = &block_states[Self::BLOCK_WIGGLE].eta;
        let betaw = &block_states[Self::BLOCK_WIGGLE].beta;
        let n = self.y.len();
        if eta.len() != n || etaw.len() != n || self.weights.len() != n {
            return Err(GamlssError::DimensionMismatch {
                reason: "BinomialMeanWiggleFamily input size mismatch".to_string(),
            }
            .into());
        }
        let geom = self.wiggle_geometry(eta.view(), betaw.view())?;
        let p_eta = x_eta_arc.ncols();
        let pw = geom.basis.ncols();
        let total = p_eta + pw;
        if d_beta_u_flat.len() != total || d_beta_v_flat.len() != total {
            return Err(GamlssError::DimensionMismatch { reason: format!(
                "BinomialMeanWiggleFamily joint second d_beta length mismatch: got {} and {}, expected {}",
                d_beta_u_flat.len(),
                d_beta_v_flat.len(),
                total
            ) }.into());
        }
        let u_eta = d_beta_u_flat.slice(s![0..p_eta]).to_owned();
        let v_eta = d_beta_v_flat.slice(s![0..p_eta]).to_owned();
        let uw = d_beta_u_flat.slice(s![p_eta..total]).to_owned();
        let vw = d_beta_v_flat.slice(s![p_eta..total]).to_owned();

        // Row-space images of both directions: xi_* = X_eta (eta part),
        // phi_* = B (wiggle part), and bK* = K-th derivative basis applied
        // to the wiggle part.
        let xi_u = fast_av(x_eta_arc.as_ref(), &u_eta);
        let xi_v = fast_av(x_eta_arc.as_ref(), &v_eta);
        let phi_u = fast_av(&geom.basis, &uw);
        let phi_v = fast_av(&geom.basis, &vw);
        let b1u = fast_av(&geom.basis_d1, &uw);
        let b1v = fast_av(&geom.basis_d1, &vw);
        let b2u = fast_av(&geom.basis_d2, &uw);
        let b2v = fast_av(&geom.basis_d2, &vw);
        let b3u = fast_av(&geom.basis_d3, &uw);
        let b3v = fast_av(&geom.basis_d3, &vw);

        let mut coeff_eta = Array1::<f64>::zeros(n);
        let mut coeff_etaw_b = Array1::<f64>::zeros(n);
        let mut coeff_etaw_d1 = Array1::<f64>::zeros(n);
        let mut coeff_etaw_d2 = Array1::<f64>::zeros(n);
        let mut coeff_etaw_d3 = Array1::<f64>::zeros(n);
        let mut coeff_ww_bb = Array1::<f64>::zeros(n);
        let mut coeff_ww_db = Array1::<f64>::zeros(n);
        let mut coeff_ww_ddb = Array1::<f64>::zeros(n);
        let mut coeff_ww_dd = Array1::<f64>::zeros(n);

        for row in 0..n {
            let q = eta[row] + etaw[row];
            let (m1, m2, m3) = self.neglog_q_derivatives(self.y[row], self.weights[row], q)?;
            let m4 = self.neglog_q_fourth_derivative(self.y[row], self.weights[row], q)?;
            // a..d: first through fourth derivative of q w.r.t. q0.
            let a = geom.dq_dq0[row];
            let b = geom.d2q_dq02[row];
            let c = geom.d3q_dq03[row];
            let d = geom.d4q_dq04[row];

            // First directional derivatives of q, a, b along u and along v,
            // then the mixed second directional derivatives (*_uv).
            let q_u = a * xi_u[row] + phi_u[row];
            let a_u = b * xi_u[row] + b1u[row];
            let b_u = c * xi_u[row] + b2u[row];
            let q_v = a * xi_v[row] + phi_v[row];
            let a_v = b * xi_v[row] + b1v[row];
            let b_v = c * xi_v[row] + b2v[row];
            let q_uv = b * xi_u[row] * xi_v[row] + b1u[row] * xi_v[row] + b1v[row] * xi_u[row];
            let a_uv = c * xi_u[row] * xi_v[row] + b2u[row] * xi_v[row] + b2v[row] * xi_u[row];
            let b_uv = d * xi_u[row] * xi_v[row] + b3u[row] * xi_v[row] + b3v[row] * xi_u[row];

            coeff_eta[row] = second_directionalhessian_coeff_fromobjective_q_terms(
                m1, m2, m3, m4, q_u, q_v, q_uv, a, a, b, a_u, a_v, a_u, a_v, a_uv, a_uv, b_u, b_v,
                b_uv,
            );
            // Eta–wiggle cross block. Static coefficients are c_b = m2·a on
            // (X, B) and c_b1 = m1 on (X, B'); the names below are their
            // first (dc_*) and second (d2_*) directional derivatives.
            let d2_c_b = m4 * q_u * q_v * a + m3 * (q_uv * a + q_u * a_v + q_v * a_u) + m2 * a_uv;
            let dc_b_u = m3 * q_u * a + m2 * a_u;
            let dc_b_v = m3 * q_v * a + m2 * a_v;
            let c_b_static = m2 * a;
            let d2_c_b1 = m3 * q_u * q_v + m2 * q_uv;
            let dc_b1_u = m2 * q_u;
            let dc_b1_v = m2 * q_v;

            // Product rule over (coefficient × basis): every derivative that
            // lands on the basis raises its order and brings a factor xi.
            coeff_etaw_b[row] = d2_c_b;
            coeff_etaw_d1[row] = dc_b_u * xi_v[row] + dc_b_v * xi_u[row] + d2_c_b1;
            coeff_etaw_d2[row] =
                c_b_static * xi_u[row] * xi_v[row] + dc_b1_u * xi_v[row] + dc_b1_v * xi_u[row];
            coeff_etaw_d3[row] = m1 * xi_u[row] * xi_v[row];

            // Wiggle–wiggle block, static coefficient dw = m2 on (B, B).
            let dw = m2;
            let dw_u = m3 * q_u;
            let dw_v = m3 * q_v;
            let dw_uv = m4 * q_u * q_v + m3 * q_uv;
            let xixj = xi_u[row] * xi_v[row];
            coeff_ww_bb[row] = dw_uv;
            coeff_ww_db[row] = dw_v * xi_u[row] + dw_u * xi_v[row];
            coeff_ww_ddb[row] = dw * xixj;
            // The factor 2 collects both orders in which the two
            // derivatives can each hit one of the two basis factors.
            coeff_ww_dd[row] = 2.0 * dw * xixj;
        }

        // Direction slots: 0 = X_eta (block 0); 1 = B, 2 = B', 3 = B'',
        // 4 = B''' (all block 1). Each coefficient triple names a slot pair.
        Ok(Some(Arc::new(RowCoeffOperator::from_directions(
            vec![p_eta, pw],
            vec![
                (0, x_eta_arc),
                (1, Arc::new(geom.basis)),
                (1, Arc::new(geom.basis_d1)),
                (1, Arc::new(geom.basis_d2)),
                (1, Arc::new(geom.basis_d3)),
            ],
            vec![
                (0, 0, coeff_eta),
                (0, 1, coeff_etaw_b),
                (0, 2, coeff_etaw_d1),
                (0, 3, coeff_etaw_d2),
                (0, 4, coeff_etaw_d3),
                (1, 1, coeff_ww_bb),
                (1, 2, coeff_ww_db),
                (1, 3, coeff_ww_ddb),
                (2, 2, coeff_ww_dd),
            ],
            n,
        ))))
    }

    /// Construct the [`BlockEffectiveJacobian`] of block `block_idx`.
    ///
    /// `BinomialMeanWiggle` exposes a single location output
    /// (n_outputs = 1):
    /// - block 0 (eta):    output 0 = design rows
    /// - block 1 (wiggle): all zeros (nonlinear link modulation)
    ///
    /// The work is done by the shared `AdditiveWiggleBlockLayout` helper,
    /// configured here with this family's block indices.
    pub fn block_effective_jacobian(
        specs: &[ParameterBlockSpec],
        block_idx: usize,
    ) -> Result<Box<dyn BlockEffectiveJacobian>, String> {
        let layout = crate::util::block_jacobian::AdditiveWiggleBlockLayout {
            family: "BinomialMeanWiggleFamily",
            n_outputs: 1,
            additive_blocks: &[Self::BLOCK_ETA],
            wiggle_block: Some(Self::BLOCK_WIGGLE),
        };
        layout.block_effective_jacobian(specs, block_idx)
    }
}

impl CustomFamily for BinomialMeanWiggleFamily {
    /// Reports that the exact joint Hessian changes with the coefficients.
    ///
    /// The per-row coefficients assembled in
    /// `exact_newton_joint_hessian_with_specs` are evaluated at
    /// `q = eta + etaw` and at the wiggle geometry derived from the current
    /// wiggle coefficients, so the Hessian is not constant in beta.
    fn exact_newton_joint_hessian_beta_dependent(&self) -> bool {
        true
    }

    /// Opts this family out of the joint Jeffreys/Firth term: always returns
    /// `false`.
    ///
    /// The binomial mean link-wiggle refit must NOT carry the full-span
    /// Jeffreys/Firth augmentation, for the same structural reason
    /// `GaussianLocationScaleWiggleFamily` opts out (#684–#688) — and the
    /// binomial wiggle hits it harder. This is a *second-stage* refit: the
    /// pilot binomial mean fit has already converged through the ordinary
    /// PIRLS path (which is itself un-Firthed unless the user opts in — the
    /// standard binomial fit logs `firth=false` / `jeffreys_logdet=none`), so
    /// the wiggle refit only adds a *penalized*, *monotone-constrained*
    /// I-spline link-shape correction `q = η + B(η)·β_w` around an
    /// already-finite mode. Two failure modes follow from leaving the term on
    /// (default `true`):
    ///
    /// 1. **Phantom stationarity residual.** When `H_pen` is full-rank and
    ///    well-conditioned (the normal case — e.g. `cond ≈ 5.5e2` on the #872
    ///    pure-probit repro) the Jeffreys gate smooth-steps the curvature
    ///    `H_Φ → 0`, but the matching score `∇Φ` does not vanish in lock-step,
    ///    so it leaks a nonzero `|∇L − Sβ + ∇Φ|` into the inner joint-Newton
    ///    KKT residual. The certificate then refuses every iterate and the
    ///    outer REML rejects all seeds (exactly the #684–#688 abort signature).
    /// 2. **Saturation barrier / divergence.** `−Φ = −½log|I_J|` is folded into
    ///    the objective and `∇Φ ∝ I_J⁻¹` into the gradient. The I-spline warp
    ///    can drive the binomial linear predictor toward saturation, where the
    ///    reduced Fisher information `I_J` goes singular: `−Φ → +∞` and
    ///    `∇Φ → ∞`. The augmented objective grows a barrier that the joint
    ///    Newton diverges into — the #872 repro runs the full 1200-cycle budget
    ///    with the augmented objective pinned at ~4.6e9 and the augmented
    ///    residual at ~5.8e9 while the plain data gradient is only ~2.3e2,
    ///    aborting the documented `link(type=flexible(...)) + linkwiggle(...)`
    ///    fit.
    ///
    /// Separation robustness is not lost: the wiggle block carries both a
    /// difference penalty (λ selected by REML) and a hard non-negativity
    /// constraint, and the underlying mean is fit by the pilot; a penalized,
    /// constrained refit around a finite pilot mode does not run away to
    /// `β → ∞` the way an unpenalized MLE can. Turning the term off here makes
    /// the wiggle refit consistent with the un-Firthed pilot and removes the
    /// phantom residual that blocked convergence.
    fn joint_jeffreys_term_required(&self) -> bool {
        false
    }

    /// Cost estimate for one coefficient-Hessian application.
    ///
    /// Returns `n · max(p_total, 1)` with saturating arithmetic, where `n` is
    /// the number of observations and `p_total` is the summed column count of
    /// every block design in `specs`. The figure reflects a single
    /// Θ(n · (p_eta + p_w)) Hessian-vector product through the row-coefficient
    /// operator rather than a dense Θ(n · (p_eta + p_w)^2) assembly.
    fn coefficient_hessian_cost(&self, specs: &[ParameterBlockSpec]) -> u64 {
        let mut total_columns = 0u64;
        for spec in specs {
            total_columns = total_columns.saturating_add(spec.design.ncols() as u64);
        }
        let n_obs = self.y.len() as u64;
        // Clamp the column count to at least one so an empty spec list still
        // reports a cost proportional to n.
        n_obs.saturating_mul(total_columns.max(1))
    }

    /// Linear inequality constraints for the block at `block_idx`.
    ///
    /// Only the wiggle block is constrained: for it, the result is whatever
    /// `monotone_wiggle_nonnegative_constraints` produces for the block's
    /// column count. Every other block yields `Ok(None)`.
    fn block_linear_constraints(
        &self,
        block_states: &[ParameterBlockState],
        block_idx: usize,
        spec: &ParameterBlockSpec,
    ) -> Result<Option<LinearInequalityConstraints>, String> {
        assert!(block_states.len() <= isize::MAX as usize);
        let constraints = if block_idx == Self::BLOCK_WIGGLE {
            monotone_wiggle_nonnegative_constraints(spec.design.ncols())
        } else {
            None
        };
        Ok(constraints)
    }

    /// Post-process a freshly updated coefficient vector for one block.
    ///
    /// The coefficients are always returned unchanged. For the wiggle block
    /// they are first passed through
    /// `validate_monotone_wiggle_beta_nonnegative`, and any error it reports
    /// is propagated to the caller.
    ///
    /// # Panics
    ///
    /// Panics if `block_spec.name` is empty.
    fn post_update_block_beta(
        &self,
        block_states: &[ParameterBlockState],
        block_idx: usize,
        block_spec: &ParameterBlockSpec,
        beta: Array1<f64>,
    ) -> Result<Array1<f64>, String> {
        assert!(block_states.len() <= isize::MAX as usize);
        assert!(!block_spec.name.is_empty());
        if block_idx == Self::BLOCK_WIGGLE {
            validate_monotone_wiggle_beta_nonnegative(
                &beta,
                "BinomialMeanWiggleFamily post-update",
            )?;
        }
        Ok(beta)
    }

    /// Evaluate the log-likelihood and per-block diagonal working sets.
    ///
    /// For every row the effective predictor is `q = eta + etaw`. The
    /// log-likelihood is accumulated over all rows. For each block a working
    /// response / working weight pair is built from the derivative of the
    /// mean with respect to that block's predictor:
    /// - eta block:    `dmu/deta = mu'(q) · dq/dq0`
    /// - wiggle block: `dmu/dw   = mu'(q)`
    ///
    /// Rows with zero prior weight, a non-finite variance, or a non-finite
    /// derivative keep working weight `0` and a working response equal to the
    /// current predictor.
    ///
    /// # Errors
    ///
    /// Returns a dimension-mismatch error when the block count is not 2 or
    /// when the predictor, weight, or `dq/dq0` lengths disagree with the
    /// number of observations; errors from the inverse link, the
    /// log-likelihood kernel, and the working-set constructor are propagated.
    fn evaluate(&self, block_states: &[ParameterBlockState]) -> Result<FamilyEvaluation, String> {
        let n_blocks = block_states.len();
        if n_blocks != 2 {
            return Err(GamlssError::DimensionMismatch {
                reason: format!(
                    "BinomialMeanWiggleFamily expects 2 blocks, got {}",
                    n_blocks
                ),
            }
            .into());
        }
        let eta_state = &block_states[Self::BLOCK_ETA];
        let wiggle_state = &block_states[Self::BLOCK_WIGGLE];
        let n_obs = self.y.len();
        let sizes_agree = eta_state.eta.len() == n_obs
            && wiggle_state.eta.len() == n_obs
            && self.weights.len() == n_obs;
        if !sizes_agree {
            return Err(GamlssError::DimensionMismatch {
                reason: "BinomialMeanWiggleFamily input size mismatch".to_string(),
            }
            .into());
        }
        let slope = self.wiggle_dq_dq0(eta_state.eta.view(), wiggle_state.beta.view())?;
        if slope.len() != n_obs {
            return Err(GamlssError::DimensionMismatch {
                reason: format!(
                    "BinomialMeanWiggleFamily dq/dq0 length mismatch: got {}, expected {}",
                    slope.len(),
                    n_obs
                ),
            }
            .into());
        }

        let mut log_lik = 0.0;
        let mut response_eta = Array1::<f64>::zeros(n_obs);
        let mut weight_eta = Array1::<f64>::zeros(n_obs);
        let mut response_wiggle = Array1::<f64>::zeros(n_obs);
        let mut weight_wiggle = Array1::<f64>::zeros(n_obs);
        for row in 0..n_obs {
            let eta_row = eta_state.eta[row];
            let etaw_row = wiggle_state.eta[row];
            let q = eta_row + etaw_row;
            let (mu_q, d1_q) = inverse_link_mu_d1_for_inverse_link(&self.link_kind, q)
                .map_err(|e| format!("fixed-link wiggle inverse-link evaluation failed: {e}"))?;
            let y_row = self.y[row];
            let prior_weight = self.weights[row];
            log_lik +=
                binomial_location_scale_log_likelihood(y_row, prior_weight, q, &self.link_kind, mu_q)?;

            let mu = mu_q.clamp(1e-12, 1.0 - 1e-12);
            let var = (mu * (1.0 - mu)).max(MIN_PROB);

            // Default for every degenerate case: the working response is the
            // current predictor and the working weight stays at zero.
            response_eta[row] = eta_row;
            response_wiggle[row] = etaw_row;
            if prior_weight == 0.0 || !var.is_finite() {
                continue;
            }

            let residual = y_row - mu;

            let dmu_deta = d1_q * slope[row];
            if dmu_deta.is_finite() {
                weight_eta[row] =
                    floor_positiveweight(prior_weight * (dmu_deta * dmu_deta / var), MIN_WEIGHT);
                response_eta[row] = eta_row + residual / signedwith_floor(dmu_deta, MIN_DERIV);
            }

            let dmu_dw = d1_q;
            if dmu_dw.is_finite() {
                weight_wiggle[row] =
                    floor_positiveweight(prior_weight * (dmu_dw * dmu_dw / var), MIN_WEIGHT);
                response_wiggle[row] = etaw_row + residual / signedwith_floor(dmu_dw, MIN_DERIV);
            }
        }

        let eta_working_set = BlockWorkingSet::diagonal_checked(response_eta, weight_eta)?;
        let wiggle_working_set = BlockWorkingSet::diagonal_checked(response_wiggle, weight_wiggle)?;
        Ok(FamilyEvaluation {
            log_likelihood: log_lik,
            blockworking_sets: vec![eta_working_set, wiggle_working_set],
        })
    }

    /// Design matrix and offset to use for a block at the current state.
    ///
    /// Any block whose spec is not named `"wiggle"` gets a clone of its static
    /// design and offset. The wiggle block's design is rebuilt from the
    /// current eta predictor via `wiggle_design` and paired with an all-zero
    /// offset of matching length.
    ///
    /// # Errors
    ///
    /// Fails when no block state is available, when the eta predictor length
    /// differs from the number of observations, when `wiggle_design` fails,
    /// or when the rebuilt design has a different column count than the spec.
    fn block_geometry(
        &self,
        block_states: &[ParameterBlockState],
        spec: &ParameterBlockSpec,
    ) -> Result<(DesignMatrix, Array1<f64>), String> {
        let is_wiggle_block = spec.name == "wiggle";
        if !is_wiggle_block {
            return Ok((spec.design.clone(), spec.offset.clone()));
        }
        if block_states.is_empty() {
            return Err(GamlssError::UnsupportedConfiguration {
                reason: "wiggle geometry requires eta block".to_string(),
            }
            .into());
        }
        let eta = &block_states[Self::BLOCK_ETA].eta;
        if eta.len() != self.y.len() {
            return Err(GamlssError::DimensionMismatch {
                reason: "BinomialMeanWiggleFamily eta size mismatch".to_string(),
            }
            .into());
        }
        let rebuilt = self.wiggle_design(eta.view())?;
        let got_cols = rebuilt.ncols();
        let expected_cols = spec.design.ncols();
        if got_cols != expected_cols {
            return Err(GamlssError::DimensionMismatch {
                reason: format!(
                    "dynamic wiggle design col mismatch: got {}, expected {}",
                    got_cols, expected_cols
                ),
            }
            .into());
        }
        // Size the zero offset before the design is moved into the wrapper.
        let zero_offset = Array1::zeros(rebuilt.nrows());
        let design = DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(rebuilt));
        Ok((design, zero_offset))
    }

    /// Reports that block geometry depends on the current state.
    ///
    /// `block_geometry` rebuilds the wiggle block's design from the current
    /// eta predictor, so the design cannot be treated as fixed across
    /// iterations.
    fn block_geometry_is_dynamic(&self) -> bool {
        true
    }

    /// Build a shareable joint-Hessian workspace for the current state.
    ///
    /// The workspace owns its inputs: a clone of this family, a copy of the
    /// block states, and an owned dense eta design resolved from `specs`.
    ///
    /// # Errors
    ///
    /// Propagates failures from resolving the dense eta design and from the
    /// workspace constructor.
    fn exact_newton_joint_hessian_workspace(
        &self,
        block_states: &[ParameterBlockState],
        specs: &[ParameterBlockSpec],
    ) -> Result<Option<Arc<dyn ExactNewtonJointHessianWorkspace>>, String> {
        let dense_eta_design = self.dense_eta_design_fromspecs(specs)?.into_owned();
        let family = self.clone();
        let states = block_states.to_vec();
        let shared: Arc<dyn ExactNewtonJointHessianWorkspace> = Arc::new(
            BinomialMeanWiggleHessianWorkspace::new(family, states, dense_eta_design)?,
        );
        Ok(Some(shared))
    }

    /// Whether a coefficient-Hessian HVP can be offered for `specs`.
    ///
    /// Availability is exactly the success of resolving a dense eta design
    /// from `specs`; the resolved design itself is discarded.
    fn inner_coefficient_hessian_hvp_available(&self, specs: &[ParameterBlockSpec]) -> bool {
        matches!(self.dense_eta_design_fromspecs(specs), Ok(_))
    }

    /// Dense exact joint Hessian over the (eta, wiggle) coefficients.
    ///
    /// With `m1`, `m2` the first two derivatives returned by
    /// `neglog_q_derivatives` at `q = eta + etaw`, `a = dq/dq0` and
    /// `b = d²q/dq0²`, the three blocks are
    /// - eta-eta: `X' diag(coeff) X`, `coeff` from
    ///   `hessian_coeff_fromobjective_q_terms(m1, m2, a, a, b)`
    /// - eta-w:   `X' diag(m2·a) B + X' diag(m1) B1`
    /// - w-w:     `Bᵀ diag(m2) B`
    /// where `X` is the dense eta design, `B` the wiggle basis and `B1` its
    /// first-derivative basis. The blocks are packed into one symmetric matrix.
    ///
    /// # Errors
    ///
    /// Returns a dimension-mismatch error when the block count is not 2 or
    /// the predictor / weight lengths disagree with the number of
    /// observations; helper failures are propagated.
    ///
    /// # Panics
    ///
    /// Panics if an assembled diagonal block does not have the expected
    /// number of rows.
    fn exact_newton_joint_hessian_with_specs(
        &self,
        block_states: &[ParameterBlockState],
        specs: &[ParameterBlockSpec],
    ) -> Result<Option<Array2<f64>>, String> {
        let n_blocks = block_states.len();
        if n_blocks != 2 {
            return Err(GamlssError::DimensionMismatch {
                reason: format!(
                    "BinomialMeanWiggleFamily expects 2 blocks, got {}",
                    n_blocks
                ),
            }
            .into());
        }
        let design_eta = self.dense_eta_design_fromspecs(specs)?;
        let eta_state = &block_states[Self::BLOCK_ETA];
        let wiggle_state = &block_states[Self::BLOCK_WIGGLE];
        let n_obs = self.y.len();
        let sizes_agree = eta_state.eta.len() == n_obs
            && wiggle_state.eta.len() == n_obs
            && self.weights.len() == n_obs;
        if !sizes_agree {
            return Err(GamlssError::DimensionMismatch {
                reason: "BinomialMeanWiggleFamily input size mismatch".to_string(),
            }
            .into());
        }
        let geom = self.wiggle_geometry(eta_state.eta.view(), wiggle_state.beta.view())?;

        // One per-row weight vector for each matrix product assembled below.
        let mut weights_eta_eta = Array1::<f64>::zeros(n_obs);
        let mut weights_cross_basis = Array1::<f64>::zeros(n_obs);
        let mut weights_cross_basis_d1 = Array1::<f64>::zeros(n_obs);
        let mut weights_ww = Array1::<f64>::zeros(n_obs);
        for row in 0..n_obs {
            let q = eta_state.eta[row] + wiggle_state.eta[row];
            let (m1, m2, _) = self.neglog_q_derivatives(self.y[row], self.weights[row], q)?;
            let slope = geom.dq_dq0[row];
            let curvature = geom.d2q_dq02[row];
            weights_eta_eta[row] =
                hessian_coeff_fromobjective_q_terms(m1, m2, slope, slope, curvature);
            weights_cross_basis[row] = m2 * slope;
            weights_cross_basis_d1[row] = m1;
            weights_ww[row] = m2;
        }

        let h_eta_eta = xt_diag_x_dense(&design_eta, &weights_eta_eta)?;
        let cross_via_basis = xt_diag_y_dense(&design_eta, &weights_cross_basis, &geom.basis)?;
        let cross_via_basis_d1 =
            xt_diag_y_dense(&design_eta, &weights_cross_basis_d1, &geom.basis_d1)?;
        let h_eta_w = cross_via_basis + &cross_via_basis_d1;
        let h_ww = xt_diag_x_dense(&geom.basis, &weights_ww)?;
        assert_eq!(h_eta_eta.nrows(), design_eta.ncols());
        assert_eq!(h_ww.nrows(), geom.basis.ncols());
        Ok(Some(binomial_pack_mean_wiggle_joint_symmetrichessian(
            &h_eta_eta, &h_eta_w, &h_ww,
        )))
    }

    /// First directional derivative `DH[u]` of the joint Hessian.
    ///
    /// `d_beta_flat` is the direction `u = (u_eta, u_w)` stacked in block
    /// order. With `xi = X·u_eta`, `phi = B·u_w`, `B1·u_w`, `B2·u_w` and the
    /// geometry factors `a`, `b`, `c` (first to third derivative of `q` in
    /// `q0`), the per-row perturbations are
    /// `q_u = a·xi + phi`, `a_u = b·xi + B1·u_w`, `b_u = c·xi + B2·u_w`.
    ///
    /// Blocks of the result:
    /// - eta-eta: `X' diag(coeff) X`, `coeff` from
    ///   `directionalhessian_coeff_fromobjective_q_terms`
    /// - eta-w:   `X' diag(m3·q_u·a + m2·a_u) B + X' diag(m2·(a·xi + q_u)) B1
    ///            + X' diag(m1·xi) B2`
    /// - w-w:     `Bᵀ diag(m3·q_u) B + A + Aᵀ` with `A = B1ᵀ diag(m2·xi) B`
    ///
    /// # Errors
    ///
    /// Returns a dimension-mismatch error when the block count is not 2, the
    /// predictor / weight lengths disagree with the number of observations,
    /// or `d_beta_flat` does not have `p_eta + p_w` entries; helper failures
    /// are propagated.
    fn exact_newton_joint_hessian_directional_derivative_with_specs(
        &self,
        block_states: &[ParameterBlockState],
        specs: &[ParameterBlockSpec],
        d_beta_flat: &Array1<f64>,
    ) -> Result<Option<Array2<f64>>, String> {
        let n_blocks = block_states.len();
        if n_blocks != 2 {
            return Err(GamlssError::DimensionMismatch {
                reason: format!(
                    "BinomialMeanWiggleFamily expects 2 blocks, got {}",
                    n_blocks
                ),
            }
            .into());
        }
        let design_eta = self.dense_eta_design_fromspecs(specs)?;
        let eta_state = &block_states[Self::BLOCK_ETA];
        let wiggle_state = &block_states[Self::BLOCK_WIGGLE];
        let n_obs = self.y.len();
        let sizes_agree = eta_state.eta.len() == n_obs
            && wiggle_state.eta.len() == n_obs
            && self.weights.len() == n_obs;
        if !sizes_agree {
            return Err(GamlssError::DimensionMismatch {
                reason: "BinomialMeanWiggleFamily input size mismatch".to_string(),
            }
            .into());
        }
        let geom = self.wiggle_geometry(eta_state.eta.view(), wiggle_state.beta.view())?;
        let p_eta = design_eta.ncols();
        let joint_len = p_eta + geom.basis.ncols();
        if d_beta_flat.len() != joint_len {
            return Err(GamlssError::DimensionMismatch {
                reason: format!(
                    "BinomialMeanWiggleFamily joint d_beta length mismatch: got {}, expected {}",
                    d_beta_flat.len(),
                    joint_len
                ),
            }
            .into());
        }

        // Split the stacked direction and push each part through its design.
        let dir_eta = d_beta_flat.slice(s![0..p_eta]).to_owned();
        let dir_w = d_beta_flat.slice(s![p_eta..joint_len]).to_owned();
        let xi = design_eta.dot(&dir_eta);
        let phi = geom.basis.dot(&dir_w);
        let d1_dir = geom.basis_d1.dot(&dir_w);
        let d2_dir = geom.basis_d2.dot(&dir_w);

        let mut weights_eta_eta = Array1::<f64>::zeros(n_obs);
        let mut weights_cross_basis = Array1::<f64>::zeros(n_obs);
        let mut weights_cross_basis_d1 = Array1::<f64>::zeros(n_obs);
        let mut weights_cross_basis_d2 = Array1::<f64>::zeros(n_obs);
        let mut weights_ww_basis = Array1::<f64>::zeros(n_obs);
        let mut weights_ww_d1_basis = Array1::<f64>::zeros(n_obs);
        for row in 0..n_obs {
            let q = eta_state.eta[row] + wiggle_state.eta[row];
            let (m1, m2, m3) = self.neglog_q_derivatives(self.y[row], self.weights[row], q)?;
            let a = geom.dq_dq0[row];
            let b = geom.d2q_dq02[row];
            let c = geom.d3q_dq03[row];
            let xi_row = xi[row];
            let q_u = a * xi_row + phi[row];
            let a_u = b * xi_row + d1_dir[row];
            let b_u = c * xi_row + d2_dir[row];
            weights_eta_eta[row] = directionalhessian_coeff_fromobjective_q_terms(
                m1, m2, m3, q_u, a, a, b, a_u, a_u, b_u,
            );
            weights_cross_basis[row] = m3 * q_u * a + m2 * a_u;
            weights_cross_basis_d1[row] = m2 * (a * xi_row + q_u);
            weights_cross_basis_d2[row] = m1 * xi_row;
            weights_ww_basis[row] = m3 * q_u;
            weights_ww_d1_basis[row] = m2 * xi_row;
        }

        let d_h_eta_eta = xt_diag_x_dense(&design_eta, &weights_eta_eta)?;

        let cross_basis = xt_diag_y_dense(&design_eta, &weights_cross_basis, &geom.basis)?;
        let cross_basis_d1 =
            xt_diag_y_dense(&design_eta, &weights_cross_basis_d1, &geom.basis_d1)?;
        let cross_basis_d2 =
            xt_diag_y_dense(&design_eta, &weights_cross_basis_d2, &geom.basis_d2)?;
        let d_h_eta_w = cross_basis + &cross_basis_d1 + &cross_basis_d2;

        // The w-w block adds the basis-derivative term together with its
        // transpose so the block stays symmetric.
        let d1_basis_term = xt_diag_y_dense(&geom.basis_d1, &weights_ww_d1_basis, &geom.basis)?;
        let basis_basis_term = xt_diag_x_dense(&geom.basis, &weights_ww_basis)?;
        let d_h_ww = basis_basis_term + &d1_basis_term + d1_basis_term.t();

        Ok(Some(binomial_pack_mean_wiggle_joint_symmetrichessian(
            &d_h_eta_eta,
            &d_h_eta_w,
            &d_h_ww,
        )))
    }

    /// Exact second-order directional derivative D²H[u,v] of the joint Hessian
    /// for the BinomialMeanWiggle two-block model (eta, wiggle).
    ///
    /// `d_beta_u_flat` and `d_beta_v_flat` are the two directions, each stacked
    /// as (eta coefficients, wiggle coefficients). The result is the packed
    /// symmetric matrix built from the eta-eta, eta-w and w-w blocks below.
    ///
    /// Notation: on `X_eta` a trailing prime denotes transpose. On the wiggle
    /// basis `B`, primes denote derivatives with respect to q0 (`B'`, `B''`,
    /// `B'''`); the transpose of a basis matrix is written `Bᵀ`.
    ///
    /// # Mathematical derivation
    ///
    /// The negative log-likelihood Hessian element for indices (a, b) in the
    /// joint coefficient vector is:
    ///
    ///   H_ab = m2 * q_a * q_b + m1 * q_ab
    ///
    /// where m_k = d^k F / dq^k (k-th derivative of the negative log-likelihood
    /// w.r.t. the effective predictor q), q_a = dq/d(beta_a), and q_ab =
    /// d²q/(d(beta_a) d(beta_b)).
    ///
    /// The effective predictor is q = q0 + w(q0) where q0 = X_eta * beta_eta
    /// and w(q0) = B(q0) * beta_w is the link wiggle.  Write:
    ///   a = dq/dq0 = 1 + B'·beta_w       (geometry first derivative)
    ///   b = d²q/dq0² = B''·beta_w         (geometry second derivative)
    ///   c = d³q/dq0³ = B'''·beta_w        (geometry third derivative)
    ///   d = d⁴q/dq0⁴ = B''''·beta_w       (geometry fourth derivative)
    ///
    /// For a perturbation direction u = (u_eta, u_w), the chain-rule
    /// perturbations are:
    ///   q_u   = a·xi_u + phi_u             (first-order predictor perturbation)
    ///   a_u   = b·xi_u + basis1_u          (perturbation of geometry factor a)
    ///   b_u   = c·xi_u + basis2_u          (perturbation of geometry factor b)
    ///   c_u   = d·xi_u + basis3_u          (perturbation of geometry factor c;
    ///                                       listed for completeness, the
    ///                                       assembly below never forms it)
    ///
    /// where xi_u = X_eta·u_eta, phi_u = B·u_w, basis_k_u = B^(k)·u_w.
    ///
    /// Mixed second-order perturbations (u,v) are:
    ///   q_uv  = b·xi_u·xi_v + basis1_u·xi_v + basis1_v·xi_u
    ///   a_uv  = c·xi_u·xi_v + basis2_u·xi_v + basis2_v·xi_u
    ///   b_uv  = d·xi_u·xi_v + basis3_u·xi_v + basis3_v·xi_u
    ///
    /// ## Block decomposition
    ///
    /// **eta-eta block** (X_eta' diag(coeff) X_eta):
    ///   The Hessian element for eta indices (i,j) factors as
    ///     H(eta_i, eta_j) = [m2·a² + m1·b] · x_eta(i)·x_eta(j)
    ///   so D²H_eta_eta[u,v] = X_eta' diag(coeff_eta) X_eta
    ///   where coeff_eta uses `second_directionalhessian_coeff_fromobjective_q_terms`
    ///   with q_a=a, q_b=a, q_ab=b and their chain-rule perturbations.
    ///
    /// **eta-w block** (X_eta' diag(...) [B, B', B'', B''']):
    ///   The static Hessian is:
    ///     H(eta_i, w_j) = (m2·a)·x_eta(i)·B_j + m1·x_eta(i)·B'_j
    ///   Taking D²[u,v] requires differentiating both the scalar coefficients
    ///   (m2·a, m1) and the basis matrices (B, B' depend on q0 via the chain
    ///   rule dB_j/du = B'_j·xi_u).  The full product rule gives four basis-matrix
    ///   tiers: B, B', B'', B'''.
    ///
    /// **w-w block** (Bᵀ diag(...) B, etc.):
    ///   The static Hessian is H(w_i, w_j) = m2·B_i·B_j.
    ///   D²[u,v] expands via the product rule on m2, B_i, B_j, each of which
    ///   depends on beta through q and q0.  This gives terms involving
    ///   B·B, B'·B, B'·B', and B''·B (all symmetrised).
    ///
    /// # Errors
    ///
    /// Returns a dimension-mismatch error when the block count is not 2, when
    /// the predictor / weight lengths disagree with the number of observations,
    /// or when either direction does not have `p_eta + pw` entries. Failures
    /// from the design, geometry, derivative and matrix-product helpers are
    /// propagated.
    fn exact_newton_joint_hessian_second_directional_derivative_with_specs(
        &self,
        block_states: &[ParameterBlockState],
        specs: &[ParameterBlockSpec],
        d_beta_u_flat: &Array1<f64>,
        d_beta_v_flat: &Array1<f64>,
    ) -> Result<Option<Array2<f64>>, String> {
        if block_states.len() != 2 {
            return Err(GamlssError::DimensionMismatch {
                reason: format!(
                    "BinomialMeanWiggleFamily expects 2 blocks, got {}",
                    block_states.len()
                ),
            }
            .into());
        }
        let x_eta = self.dense_eta_design_fromspecs(specs)?;
        let eta = &block_states[Self::BLOCK_ETA].eta;
        let etaw = &block_states[Self::BLOCK_WIGGLE].eta;
        let betaw = &block_states[Self::BLOCK_WIGGLE].beta;
        let n = self.y.len();
        if eta.len() != n || etaw.len() != n || self.weights.len() != n {
            return Err(GamlssError::DimensionMismatch {
                reason: "BinomialMeanWiggleFamily input size mismatch".to_string(),
            }
            .into());
        }
        let geom = self.wiggle_geometry(eta.view(), betaw.view())?;
        let p_eta = x_eta.ncols();
        let pw = geom.basis.ncols();
        let total = p_eta + pw;
        if d_beta_u_flat.len() != total || d_beta_v_flat.len() != total {
            return Err(GamlssError::DimensionMismatch { reason: format!(
                "BinomialMeanWiggleFamily joint second d_beta length mismatch: got {} and {}, expected {}",
                d_beta_u_flat.len(),
                d_beta_v_flat.len(),
                total
            ) }.into());
        }

        // Split directions into eta and wiggle components.
        let u_eta = d_beta_u_flat.slice(s![0..p_eta]).to_owned();
        let v_eta = d_beta_v_flat.slice(s![0..p_eta]).to_owned();
        let uw = d_beta_u_flat.slice(s![p_eta..total]).to_owned();
        let vw = d_beta_v_flat.slice(s![p_eta..total]).to_owned();

        // Per-row linear-predictor perturbations from each direction.
        let xi_u = x_eta.dot(&u_eta); // eta perturbation in direction u
        let xi_v = x_eta.dot(&v_eta); // eta perturbation in direction v
        let phi_u = geom.basis.dot(&uw); // direct wiggle basis, direction u
        let phi_v = geom.basis.dot(&vw); // direct wiggle basis, direction v
        let b1u = geom.basis_d1.dot(&uw); // first-derivative basis, direction u
        let b1v = geom.basis_d1.dot(&vw); // first-derivative basis, direction v
        let b2u = geom.basis_d2.dot(&uw); // second-derivative basis, direction u
        let b2v = geom.basis_d2.dot(&vw); // second-derivative basis, direction v
        let b3u = geom.basis_d3.dot(&uw); // third-derivative basis, direction u
        let b3v = geom.basis_d3.dot(&vw); // third-derivative basis, direction v

        // Per-row chain-rule perturbations of q, a = dq/dq0, b = d²q/dq0²
        // (formed inside the row loop below):
        //   q_u = a·xi_u + phi_u
        //   a_u = b·xi_u + basis1_u
        //   b_u = c·xi_u + basis2_u
        // Mixed second-order perturbations:
        //   q_uv = b·xi_u·xi_v + basis1_u·xi_v + basis1_v·xi_u
        //   a_uv = c·xi_u·xi_v + basis2_u·xi_v + basis2_v·xi_u
        //   b_uv = d·xi_u·xi_v + basis3_u·xi_v + basis3_v·xi_u

        // Scaled basis matrices for the cross-product terms in the w-w block
        // (same pattern as GaussianLocationScaleWiggleFamily).
        let basis_u = scale_matrix_rows(&geom.basis_d1, &xi_u)?; // dB/du = B'·xi_u
        let basis_v = scale_matrix_rows(&geom.basis_d1, &xi_v)?; // dB/dv = B'·xi_v
        let basis_uv = scale_matrix_rows(&geom.basis_d2, &(&xi_u * &xi_v))?; // d²B/dudv = B''·xi_u·xi_v
        // Per-row coefficient arrays for assembling the block-matrix products.
        let mut coeff_eta = Array1::<f64>::zeros(n);

        // Coefficients for the eta-w block: X_eta' diag(c_*) M where M ∈ {B, B', B'', B'''}
        //
        // The static cross-Hessian is:
        //   H(eta_i, w_j) = (m2·a)·x_i·B_j + m1·x_i·B'_j
        // where B_j and B'_j are row evaluations of basis column j.
        //
        // Write C_B = m2·a (scalar coefficient multiplying B in the cross block)
        // and   C_B1 = m1  (scalar coefficient multiplying B' in the cross block).
        //
        // Product rule on C_B·B:
        //   d(C_B·B)/du = (dC_B/du)·B + C_B·B'·xi_u
        //   d²(C_B·B)/dudv = (d²C_B/dudv)·B + (dC_B/du)·B'·xi_v
        //                   + (dC_B/dv)·B'·xi_u + C_B·B''·xi_u·xi_v
        //
        // Product rule on C_B1·B':
        //   d²(C_B1·B')/dudv = (d²C_B1/dudv)·B' + (dC_B1/du)·B''·xi_v
        //                     + (dC_B1/dv)·B''·xi_u + C_B1·B'''·xi_u·xi_v
        //
        // Derivatives of the scalar coefficients:
        //   C_B  = m2·a
        //   dC_B/du  = m3·q_u·a + m2·a_u
        //   dC_B/dv  = m3·q_v·a + m2·a_v
        //   d²C_B/dudv = m4·q_u·q_v·a + m3·(q_uv·a + q_u·a_v + q_v·a_u) + m2·a_uv
        //
        //   C_B1 = m1
        //   dC_B1/du = m2·q_u
        //   dC_B1/dv = m2·q_v
        //   d²C_B1/dudv = m3·q_u·q_v + m2·q_uv
        //
        // Grouping by basis-matrix tier:
        //   B:   d²C_B/dudv
        //   B':  (dC_B/du)·xi_v + (dC_B/dv)·xi_u + d²C_B1/dudv
        //   B'': C_B·xi_u·xi_v + (dC_B1/du)·xi_v + (dC_B1/dv)·xi_u
        //   B''': C_B1·xi_u·xi_v
        let mut coeff_etaw_b = Array1::<f64>::zeros(n);
        let mut coeff_etaw_d1 = Array1::<f64>::zeros(n);
        let mut coeff_etaw_d2 = Array1::<f64>::zeros(n);
        let mut coeff_etaw_d3 = Array1::<f64>::zeros(n);

        // Coefficients for the w-w block.
        //
        // The static w-w Hessian is:
        //   H(w_i, w_j) = m2·B_i·B_j
        //
        // Note: there is no m1·q_ij term because d²q/(d(beta_w_i) d(beta_w_j)) = 0
        // (the basis vectors B_i enter q linearly in beta_w).
        //
        // Product rule on m2·B_i·B_j, treating each factor as depending on beta:
        //   d²(m2·B_i·B_j)/dudv
        //     = (d²m2/dudv)·B_i·B_j                        → Bᵀ diag B (already symmetric)
        //     + (dm2/du)·(B'_i·xi_v·B_j + B_i·B'_j·xi_v)  → dw_u terms
        //     + (dm2/dv)·(B'_i·xi_u·B_j + B_i·B'_j·xi_u)  → dw_v terms
        //     + m2·(B''_i·xi_u·xi_v·B_j + B'_i·xi_u·B'_j·xi_v
        //          + B'_i·xi_v·B'_j·xi_u + B_i·B''_j·xi_u·xi_v)
        //
        // where dm2/du = m3·q_u, dm2/dv = m3·q_v, d²m2/dudv = m4·q_u·q_v + m3·q_uv.
        //
        // Following the Gaussian LS wiggle pattern, we express this via:
        //   xt_diag_x_dense(B, dw_uv)                    — coeff: d²m2
        //   xt_diag_y_dense(basis_u, dw_v, B) + transpose — dB/du weighted by dm2/dv
        //   xt_diag_y_dense(basis_v, dw_u, B) + transpose — dB/dv weighted by dm2/du
        //   xt_diag_y_dense(basis_uv, dw, B) + transpose  — d²B/dudv weighted by m2
        //   xt_diag_y_dense(basis_u, dw, basis_v) + transpose — dB/du·dB/dv weighted by m2
        let mut dw = Array1::<f64>::zeros(n);
        let mut dw_u = Array1::<f64>::zeros(n);
        let mut dw_v = Array1::<f64>::zeros(n);
        let mut dw_uv = Array1::<f64>::zeros(n);

        for row in 0..n {
            let q = eta[row] + etaw[row];
            let (m1, m2, m3) = self.neglog_q_derivatives(self.y[row], self.weights[row], q)?;
            let m4 = self.neglog_q_fourth_derivative(self.y[row], self.weights[row], q)?;
            let a = geom.dq_dq0[row];
            let b = geom.d2q_dq02[row];
            let c = geom.d3q_dq03[row];
            let d = geom.d4q_dq04[row];

            // Chain-rule perturbations in direction u.
            let q_u = a * xi_u[row] + phi_u[row];
            let a_u = b * xi_u[row] + b1u[row];
            let b_u = c * xi_u[row] + b2u[row];

            // Chain-rule perturbations in direction v.
            let q_v = a * xi_v[row] + phi_v[row];
            let a_v = b * xi_v[row] + b1v[row];
            let b_v = c * xi_v[row] + b2v[row];

            // Mixed second-order perturbations.
            let q_uv = b * xi_u[row] * xi_v[row] + b1u[row] * xi_v[row] + b1v[row] * xi_u[row];
            let a_uv = c * xi_u[row] * xi_v[row] + b2u[row] * xi_v[row] + b2v[row] * xi_u[row];
            let b_uv = d * xi_u[row] * xi_v[row] + b3u[row] * xi_v[row] + b3v[row] * xi_u[row];

            // ── eta-eta block ──
            // H(eta_i, eta_j) uses q_a = a, q_b = a, q_ab = b (absorbing x_eta
            // into the matrix product).  The perturbations of these geometric
            // quantities are: dq_a/du = a_u, dq_b/du = a_u (since q_a = q_b = a),
            // dq_ab/du = b_u (since q_ab = b), and analogously for v.
            coeff_eta[row] = second_directionalhessian_coeff_fromobjective_q_terms(
                m1, m2, m3, m4, q_u, q_v, q_uv, a, a, b, // q_a, q_b, q_ab
                a_u, a_v, // dq_a_u, dq_a_v
                a_u, a_v, // dq_b_u, dq_b_v  (q_b = a so same perturbation)
                a_uv, a_uv, // d2q_a_uv, d2q_b_uv
                b_u, b_v,  // dq_ab_u, dq_ab_v  (q_ab = b)
                b_uv, // d2q_ab_uv
            );

            // ── eta-w block coefficients ──
            // See the derivation in the comment block above the coeff_etaw_*
            // declarations.  We group by which basis matrix tier
            // (B, B', B'', B''') the coefficient multiplies.

            // d²(m2·a)/dudv
            let d2_c_b = m4 * q_u * q_v * a + m3 * (q_uv * a + q_u * a_v + q_v * a_u) + m2 * a_uv;
            // d(m2·a)/du and d(m2·a)/dv
            let dc_b_u = m3 * q_u * a + m2 * a_u;
            let dc_b_v = m3 * q_v * a + m2 * a_v;
            // m2·a (static coefficient for B in the cross block)
            let c_b_static = m2 * a;
            // d²(m1)/dudv
            let d2_c_b1 = m3 * q_u * q_v + m2 * q_uv;
            // d(m1)/du and d(m1)/dv
            let dc_b1_u = m2 * q_u;
            let dc_b1_v = m2 * q_v;

            coeff_etaw_b[row] = d2_c_b;
            coeff_etaw_d1[row] = dc_b_u * xi_v[row] + dc_b_v * xi_u[row] + d2_c_b1;
            coeff_etaw_d2[row] =
                c_b_static * xi_u[row] * xi_v[row] + dc_b1_u * xi_v[row] + dc_b1_v * xi_u[row];
            coeff_etaw_d3[row] = m1 * xi_u[row] * xi_v[row];

            // ── w-w block coefficients ──
            // The w-w static Hessian coefficient is m2 (for Bᵀ diag B).
            dw[row] = m2;
            dw_u[row] = m3 * q_u;
            dw_v[row] = m3 * q_v;
            dw_uv[row] = m4 * q_u * q_v + m3 * q_uv;
        }

        // ── Assemble eta-eta block ──
        let d2_h_eta_eta = xt_diag_x_dense(&x_eta, &coeff_eta)?;

        // ── Assemble eta-w block ──
        // The second-order directional derivative of the cross block H_eta_w is:
        //   d²H_eta_w[u,v] = X_eta' diag(coeff_etaw_b)  B
        //                   + X_eta' diag(coeff_etaw_d1) B'
        //                   + X_eta' diag(coeff_etaw_d2) B''
        //                   + X_eta' diag(coeff_etaw_d3) B'''
        let d2_h_eta_w = xt_diag_y_dense(&x_eta, &coeff_etaw_b, &geom.basis)?
            + &xt_diag_y_dense(&x_eta, &coeff_etaw_d1, &geom.basis_d1)?
            + &xt_diag_y_dense(&x_eta, &coeff_etaw_d2, &geom.basis_d2)?
            + &xt_diag_y_dense(&x_eta, &coeff_etaw_d3, &geom.basis_d3)?;

        // ── Assemble w-w block ──
        // Following the Gaussian LS wiggle pattern, the w-w second directional
        // derivative is assembled from scaled basis products:
        //
        //   d²(m2·B_i·B_j)/dudv decomposition:
        //     (d²m2)     · B_i·B_j        → xt_diag_x(B, dw_uv)
        //     (dm2/du)   · dB_j/dv · B_i  → xt_diag_y(basis_v, dw_u, B) + transpose
        //     (dm2/dv)   · dB_j/du · B_i  → xt_diag_y(basis_u, dw_v, B) + transpose
        //     m2 · d²B_j/dudv · B_i       → xt_diag_y(basis_uv, dw, B) + transpose
        //     m2 · dB_i/du · dB_j/dv      → xt_diag_y(basis_u, dw, basis_v) + transpose
        let a_ab = xt_diag_y_dense(&basis_uv, &dw, &geom.basis)?;
        let a_ij = xt_diag_y_dense(&basis_u, &dw, &basis_v)?;
        let a_iwj = xt_diag_y_dense(&basis_u, &dw_v, &geom.basis)?;
        let a_jwi = xt_diag_y_dense(&basis_v, &dw_u, &geom.basis)?;
        let d2_h_ww = &a_ab
            + &a_ab.t()
            + &a_ij
            + a_ij.t()
            + &a_iwj
            + a_iwj.t()
            + &a_jwi
            + a_jwi.t()
            + &xt_diag_x_dense(&geom.basis, &dw_uv)?;

        Ok(Some(binomial_pack_mean_wiggle_joint_symmetrichessian(
            &d2_h_eta_eta,
            &d2_h_eta_w,
            &d2_h_ww,
        )))
    }

    /// Assembles the first-order ψ-derivative terms (objective, joint score,
    /// joint Hessian) of the mean-wiggle family for hyperparameter `psi_index`.
    ///
    /// The per-row predictor is `q = eta + etaw`, and the wiggle geometry
    /// supplies `dq_dq0`, `d2q_dq02`, `d3q_dq03` together with the basis and
    /// its first two derivative matrices. The ψ direction enters through the
    /// per-row vector `z_eta_psi`.
    ///
    /// # Returns
    /// * `Ok(None)` when neither the implicit ψ action nor the dense ψ
    ///   direction is available for `psi_index`.
    /// * `Ok(Some(..))` with an operator-backed Hessian (and an empty `0×0`
    ///   dense `hessian_psi`) when the implicit action exists.
    /// * `Ok(Some(..))` with a dense `hessian_psi` and no operator otherwise.
    ///
    /// # Errors
    /// Returns a `DimensionMismatch` error when the number of blocks or
    /// derivative block lists is not 2, or when `eta`, `etaw`, or the prior
    /// weights do not have one entry per observation. Errors from the helper
    /// calls are propagated unchanged.
    fn exact_newton_joint_psi_terms(
        &self,
        block_states: &[ParameterBlockState],
        specs: &[ParameterBlockSpec],
        derivative_blocks: &[Vec<CustomFamilyBlockPsiDerivative>],
        psi_index: usize,
    ) -> Result<Option<crate::custom_family::ExactNewtonJointPsiTerms>, String> {
        if block_states.len() != 2 || derivative_blocks.len() != 2 {
            return Err(GamlssError::DimensionMismatch { reason: format!(
                "BinomialMeanWiggleFamily joint psi terms expect 2 blocks and 2 derivative block lists, got {} and {}",
                block_states.len(),
                derivative_blocks.len()
            ) }.into());
        }
        let x_eta = self.dense_eta_design_fromspecs(specs)?;
        let eta = &block_states[Self::BLOCK_ETA].eta;
        let etaw = &block_states[Self::BLOCK_WIGGLE].eta;
        let betaw = &block_states[Self::BLOCK_WIGGLE].beta;
        let n = self.y.len();
        if eta.len() != n || etaw.len() != n || self.weights.len() != n {
            return Err(GamlssError::DimensionMismatch {
                reason: "BinomialMeanWiggleFamily input size mismatch".to_string(),
            }
            .into());
        }
        let geom = self.wiggle_geometry(eta.view(), betaw.view())?;
        let p_eta = x_eta.ncols();
        let pw = geom.basis.ncols();
        // Prefer the implicit (operator) ψ action; only fall back to the dense
        // ψ direction when the implicit one is unavailable.
        let implicit_dir =
            self.exact_newton_joint_psi_action(block_states, derivative_blocks, psi_index, p_eta)?;
        let dense_dir = if implicit_dir.is_none() {
            self.exact_newton_joint_psi_direction(
                block_states,
                derivative_blocks,
                psi_index,
                &x_eta,
            )?
        } else {
            None
        };
        // Per-row ψ direction of the eta predictor, borrowed from whichever
        // representation is present; with neither there is nothing to compute.
        let z_eta_psi = if let Some((_, ref z_eta_psi)) = implicit_dir {
            z_eta_psi
        } else if let Some(ref dir_a) = dense_dir {
            &dir_a.z_eta_psi
        } else {
            return Ok(None);
        };

        // Row-wise accumulators. The suffix names which design each
        // coefficient multiplies: `xa` = ψ-derivative of the eta design,
        // `x` = eta design, `b`/`d1`/`d2` = wiggle basis and its first/second
        // derivative matrices.
        let mut objective_psi = 0.0;
        let mut score_eta_xa = Array1::<f64>::zeros(n);
        let mut score_eta_x = Array1::<f64>::zeros(n);
        let mut score_w_b = Array1::<f64>::zeros(n);
        let mut score_w_d1 = Array1::<f64>::zeros(n);

        let mut coeff_eta_eta_xx = Array1::<f64>::zeros(n);
        let mut coeff_eta_eta_xa_x = Array1::<f64>::zeros(n);
        let mut coeff_eta_w_xa_b = Array1::<f64>::zeros(n);
        let mut coeff_eta_w_x_b = Array1::<f64>::zeros(n);
        let mut coeff_eta_w_x_d1 = Array1::<f64>::zeros(n);
        let mut coeff_eta_w_xa_d1 = Array1::<f64>::zeros(n);
        let mut coeff_eta_w_x_d2 = Array1::<f64>::zeros(n);
        let mut coeff_ww_bb = Array1::<f64>::zeros(n);
        let mut coeff_ww_db = Array1::<f64>::zeros(n);

        for row in 0..n {
            let q = eta[row] + etaw[row];
            // NOTE(review): m1, m2, m3 are presumably the first three
            // q-derivatives of the weighted negative log-likelihood — confirm
            // against `neglog_q_derivatives`.
            let (m1, m2, m3) = self.neglog_q_derivatives(self.y[row], self.weights[row], q)?;
            let z_a = z_eta_psi[row];
            let a = geom.dq_dq0[row];
            let b = geom.d2q_dq02[row];
            let c = geom.d3q_dq03[row];
            // Chain rule: ψ-derivative of q through the eta predictor.
            let q_a = a * z_a;

            objective_psi += m1 * q_a;

            score_eta_xa[row] = m1 * a;
            score_eta_x[row] = m2 * q_a * a + m1 * b * z_a;
            score_w_b[row] = m2 * q_a;
            score_w_d1[row] = m1 * z_a;

            // ψ-derivative of the eta-eta weight `m2·a² + m1·b`.
            coeff_eta_eta_xx[row] =
                m3 * q_a * a * a + m2 * (2.0 * a * b * z_a + q_a * b) + m1 * c * z_a;
            coeff_eta_eta_xa_x[row] = m2 * a * a + m1 * b;
            coeff_eta_w_xa_b[row] = m2 * a;
            coeff_eta_w_x_b[row] = m3 * q_a * a + m2 * b * z_a;
            coeff_eta_w_x_d1[row] = m2 * (a * z_a + q_a);
            coeff_eta_w_xa_d1[row] = m1;
            coeff_eta_w_x_d2[row] = m1 * z_a;
            coeff_ww_bb[row] = m3 * q_a;
            coeff_ww_db[row] = m2 * z_a;
        }

        // The wiggle-block score is shared by the operator and dense paths.
        let score_w = crate::faer_ndarray::fast_atv(&geom.basis, &score_w_b)
            + crate::faer_ndarray::fast_atv(&geom.basis_d1, &score_w_d1);

        if let Some((action, _)) = implicit_dir {
            // Operator path: the ψ-derivative of the eta design is applied
            // through `action` instead of a materialized matrix.
            let score_eta = action.transpose_mul(score_eta_xa.view())
                + crate::faer_ndarray::fast_atv(x_eta.as_ref(), &score_eta_x);
            let score_psi = binomial_pack_mean_wiggle_joint_score(&score_eta, &score_w);
            let x_eta_arc = shared_dense_arc(x_eta.as_ref());
            let basis_arc = Arc::new(geom.basis.clone());
            let basis_d1_arc = Arc::new(geom.basis_d1.clone());
            let basis_d2_arc = Arc::new(geom.basis_d2.clone());
            let zeros = Array1::<f64>::zeros(n);
            // Channels: 0 = eta design (carrying the ψ action), 1 = basis,
            // 2 = basis first derivative, 3 = basis second derivative.
            // NOTE(review): in each pair contribution the first coefficient
            // vector appears to weight the ψ-action term and the second the
            // plain design term, mirroring the dense assembly below — confirm
            // against `CustomFamilyJointDesignPairContribution::new`.
            let operator = CustomFamilyJointPsiOperator::new(
                p_eta + pw,
                vec![
                    CustomFamilyJointDesignChannel::new(
                        0..p_eta,
                        Arc::clone(&x_eta_arc),
                        Some(action),
                    ),
                    CustomFamilyJointDesignChannel::new(
                        p_eta..p_eta + pw,
                        Arc::clone(&basis_arc),
                        None,
                    ),
                    CustomFamilyJointDesignChannel::new(
                        p_eta..p_eta + pw,
                        Arc::clone(&basis_d1_arc),
                        None,
                    ),
                    CustomFamilyJointDesignChannel::new(
                        p_eta..p_eta + pw,
                        Arc::clone(&basis_d2_arc),
                        None,
                    ),
                ],
                vec![
                    CustomFamilyJointDesignPairContribution::new(
                        0,
                        0,
                        coeff_eta_eta_xa_x.clone(),
                        coeff_eta_eta_xx.clone(),
                    ),
                    CustomFamilyJointDesignPairContribution::new(
                        0,
                        1,
                        coeff_eta_w_xa_b.clone(),
                        coeff_eta_w_x_b.clone(),
                    ),
                    CustomFamilyJointDesignPairContribution::new(
                        1,
                        0,
                        coeff_eta_w_xa_b.clone(),
                        coeff_eta_w_x_b.clone(),
                    ),
                    CustomFamilyJointDesignPairContribution::new(
                        0,
                        2,
                        coeff_eta_w_xa_d1.clone(),
                        coeff_eta_w_x_d1.clone(),
                    ),
                    CustomFamilyJointDesignPairContribution::new(
                        2,
                        0,
                        coeff_eta_w_xa_d1.clone(),
                        coeff_eta_w_x_d1.clone(),
                    ),
                    CustomFamilyJointDesignPairContribution::new(
                        0,
                        3,
                        zeros.clone(),
                        coeff_eta_w_x_d2.clone(),
                    ),
                    CustomFamilyJointDesignPairContribution::new(
                        3,
                        0,
                        zeros.clone(),
                        coeff_eta_w_x_d2.clone(),
                    ),
                    CustomFamilyJointDesignPairContribution::new(
                        1,
                        1,
                        zeros.clone(),
                        coeff_ww_bb.clone(),
                    ),
                    CustomFamilyJointDesignPairContribution::new(
                        2,
                        1,
                        zeros.clone(),
                        coeff_ww_db.clone(),
                    ),
                    CustomFamilyJointDesignPairContribution::new(1, 2, zeros, coeff_ww_db.clone()),
                ],
            );
            // The dense Hessian is left empty (0×0); consumers use the operator.
            return Ok(Some(crate::custom_family::ExactNewtonJointPsiTerms {
                objective_psi,
                score_psi,
                hessian_psi: Array2::zeros((0, 0)),
                hessian_psi_operator: Some(std::sync::Arc::new(operator)),
            }));
        }

        // Dense path: `implicit_dir` was `None`, and the early return above
        // already handled the case where `dense_dir` is also `None`.
        let dir_a =
            dense_dir.expect("dense psi direction should exist when implicit direction is absent");
        let x_eta_psi = dir_a
            .x_eta_psi
            .as_ref()
            .expect("dense eta psi design should exist when implicit direction is absent");
        let score_psi = binomial_pack_mean_wiggle_joint_score(
            &(crate::faer_ndarray::fast_atv(x_eta_psi, &score_eta_xa)
                + crate::faer_ndarray::fast_atv(x_eta.as_ref(), &score_eta_x)),
            &score_w,
        );
        // eta-eta block: cross term plus its transpose, plus the weight derivative.
        let a_eta_eta = xt_diag_y_dense(x_eta_psi, &coeff_eta_eta_xa_x, &x_eta)?;
        let h_eta_eta = &a_eta_eta + &a_eta_eta.t() + &xt_diag_x_dense(&x_eta, &coeff_eta_eta_xx)?;
        // eta-w block: one term per (design, basis-derivative) pairing.
        let h_eta_w = xt_diag_y_dense(x_eta_psi, &coeff_eta_w_xa_b, &geom.basis)?
            + &xt_diag_y_dense(&x_eta, &coeff_eta_w_x_b, &geom.basis)?
            + &xt_diag_y_dense(&x_eta, &coeff_eta_w_x_d1, &geom.basis_d1)?
            + &xt_diag_y_dense(x_eta_psi, &coeff_eta_w_xa_d1, &geom.basis_d1)?
            + &xt_diag_y_dense(&x_eta, &coeff_eta_w_x_d2, &geom.basis_d2)?;
        // w-w block: basis/basis term plus the symmetrized derivative/basis term.
        let a_ww = xt_diag_y_dense(&geom.basis_d1, &coeff_ww_db, &geom.basis)?;
        let h_ww = xt_diag_x_dense(&geom.basis, &coeff_ww_bb)? + &a_ww + a_ww.t();

        Ok(Some(crate::custom_family::ExactNewtonJointPsiTerms {
            objective_psi,
            score_psi,
            hessian_psi: binomial_pack_mean_wiggle_joint_symmetrichessian(
                &h_eta_eta, &h_eta_w, &h_ww,
            ),
            hessian_psi_operator: None,
        }))
    }
}

/// Joint-Hessian workspace for [`BinomialMeanWiggleFamily`].
///
/// Bundles the family, a snapshot of the block states, the shared dense eta
/// design, and the Hessian operator built once at construction time.
struct BinomialMeanWiggleHessianWorkspace {
    // Family whose `bmw_*` operator builders are invoked by the workspace.
    family: BinomialMeanWiggleFamily,
    // Block states handed to every operator builder call.
    block_states: Vec<ParameterBlockState>,
    // Dense eta design, reference-counted so each operator can share it.
    x_eta: Arc<Array2<f64>>,
    // Operator produced by `bmw_static_hessian_operator` in `new`.
    hessian_operator: Arc<RowCoeffOperator>,
}

impl BinomialMeanWiggleHessianWorkspace {
    /// Builds the workspace, wrapping the dense eta design in an `Arc` and
    /// constructing the static Hessian operator up front.
    ///
    /// # Errors
    /// Propagates any error reported by `bmw_static_hessian_operator`.
    fn new(
        family: BinomialMeanWiggleFamily,
        block_states: Vec<ParameterBlockState>,
        x_eta: Array2<f64>,
    ) -> Result<Self, String> {
        let shared_design = Arc::new(x_eta);
        let hessian_operator =
            family.bmw_static_hessian_operator(&block_states, Arc::clone(&shared_design))?;
        let workspace = Self {
            family,
            block_states,
            x_eta: shared_design,
            hessian_operator,
        };
        Ok(workspace)
    }
}

impl ExactNewtonJointHessianWorkspace for BinomialMeanWiggleHessianWorkspace {
    /// Matrix-free Hessian products are always offered by this workspace.
    fn hessian_matvec_available(&self) -> bool {
        true
    }

    /// Applies the prebuilt Hessian operator to `v`.
    fn hessian_matvec(&self, v: &Array1<f64>) -> Result<Option<Array1<f64>>, String> {
        let product = crate::solver::estimate::reml::unified::HyperOperator::mul_vec(
            self.hessian_operator.as_ref(),
            v,
        );
        Ok(Some(product))
    }

    /// No diagonal is provided by this workspace.
    fn hessian_diagonal(&self) -> Result<Option<Array1<f64>>, String> {
        Ok(None)
    }

    /// Dense form of the first directional derivative: builds the operator
    /// and materializes it when one is returned.
    fn directional_derivative(
        &self,
        d_beta_flat: &Array1<f64>,
    ) -> Result<Option<Array2<f64>>, String> {
        let maybe_operator = self.directional_derivative_operator(d_beta_flat)?;
        Ok(maybe_operator.map(|op| op.to_dense()))
    }

    /// Operator form of the first directional derivative along `d_beta_flat`,
    /// delegated to the family.
    fn directional_derivative_operator(
        &self,
        d_beta_flat: &Array1<f64>,
    ) -> Result<Option<Arc<dyn crate::solver::estimate::reml::unified::HyperOperator>>, String>
    {
        let design = Arc::clone(&self.x_eta);
        self.family
            .bmw_directional_operator(&self.block_states, design, d_beta_flat)
    }

    /// Dense form of the second directional derivative: builds the operator
    /// and materializes it when one is returned.
    fn second_directional_derivative(
        &self,
        d_beta_u_flat: &Array1<f64>,
        d_beta_v_flat: &Array1<f64>,
    ) -> Result<Option<Array2<f64>>, String> {
        let maybe_operator =
            self.second_directional_derivative_operator(d_beta_u_flat, d_beta_v_flat)?;
        Ok(maybe_operator.map(|op| op.to_dense()))
    }

    /// Operator form of the second directional derivative along
    /// (`d_beta_u`, `d_beta_v`), delegated to the family.
    fn second_directional_derivative_operator(
        &self,
        d_beta_u: &Array1<f64>,
        d_beta_v: &Array1<f64>,
    ) -> Result<Option<Arc<dyn crate::solver::estimate::reml::unified::HyperOperator>>, String>
    {
        let design = Arc::clone(&self.x_eta);
        self.family
            .bmw_second_directional_operator(&self.block_states, design, d_beta_u, d_beta_v)
    }
}

impl CustomFamilyGenerative for BinomialMeanWiggleFamily {
    /// Produces the Bernoulli generative spec: the per-row mean is the
    /// inverse link evaluated at the sum of the eta and wiggle predictors.
    ///
    /// # Errors
    /// Returns a `DimensionMismatch` error when the block count is not 2 or a
    /// predictor length differs from `y`; inverse-link failures are reported
    /// with a descriptive prefix.
    fn generativespec(
        &self,
        block_states: &[ParameterBlockState],
    ) -> Result<GenerativeSpec, String> {
        let block_count = block_states.len();
        if block_count != 2 {
            let reason = format!(
                "BinomialMeanWiggleFamily expects 2 blocks, got {}",
                block_count
            );
            return Err(GamlssError::DimensionMismatch { reason }.into());
        }

        let n_obs = self.y.len();
        let eta_base = &block_states[Self::BLOCK_ETA].eta;
        let eta_wiggle = &block_states[Self::BLOCK_WIGGLE].eta;
        let lengths_agree = eta_base.len() == n_obs && eta_wiggle.len() == n_obs;
        if !lengths_agree {
            let reason = "BinomialMeanWiggleFamily generative size mismatch".to_string();
            return Err(GamlssError::DimensionMismatch { reason }.into());
        }

        let mean = gamlss_rowwise_map_result(n_obs, |row| {
            let q = eta_base[row] + eta_wiggle[row];
            let jet = inverse_link_jet_for_inverse_link(&self.link_kind, q)
                .map_err(|e| format!("fixed-link wiggle inverse-link evaluation failed: {e}"))?;
            Ok(jet.mu)
        })?;

        Ok(GenerativeSpec {
            mean,
            noise: NoiseModel::Bernoulli,
        })
    }
}

/// Built-in Poisson log-link family (single parameter block).
///
/// Carries the observed responses and per-row prior weights consumed by the
/// shared log-link diagonal IRLS driver.
#[derive(Clone)]
pub struct PoissonLogFamily {
    /// Observed responses; each entry must be finite and non-negative to pass
    /// the family's row validation.
    pub y: Array1<f64>,
    /// Per-row prior weights; must have the same length as `y`.
    pub weights: Array1<f64>,
}

impl PoissonLogFamily {
    /// Index of the family's only parameter block.
    pub const BLOCK_ETA: usize = 0;

    /// Names of the parameter blocks, in block order.
    pub fn parameternames() -> &'static [&'static str] {
        const NAMES: [&str; 1] = ["eta"];
        &NAMES
    }

    /// Link attached to each parameter block, in block order.
    pub fn parameter_links() -> &'static [ParameterLink] {
        const LINKS: [ParameterLink; 1] = [ParameterLink::Log];
        &LINKS
    }

    /// Static descriptor combining the family name with its block names and links.
    pub fn metadata() -> FamilyMetadata {
        let parameternames = Self::parameternames();
        let parameter_links = Self::parameter_links();
        FamilyMetadata {
            name: "poisson_log",
            parameternames,
            parameter_links,
        }
    }
}

/// Per-row IRLS contribution that a single-parameter log-link family must
/// produce. The shared driver `evaluate_log_link_diagonal_irls` consumes
/// these and assembles the full `FamilyEvaluation`, so the steps that
/// previously lived inside each family — size validation, per-row y
/// validation, η clamping, the saturated `exp`, the active-clamp w/z guard,
/// and the final return — exist in exactly one place.
struct DiagonalIrlsRow {
    /// Weighted contribution to ℓ at this row.
    log_lik_increment: f64,
    /// Unfloored observed Hessian weight (the driver applies `MIN_WEIGHT`).
    observed_weight: f64,
    /// Per-row Newton step on the working response: `z = e + working_step`,
    /// where `e` is the clamped linear predictor.
    /// Each family computes this with its own (score, denominator); the
    /// driver only handles the active-clamp / zero-weight guard.
    working_step: f64,
}

/// Trait implemented by single-block log-link families that share the
/// diagonal IRLS structure (Poisson, Gamma). Each impl is responsible only
/// for the family-specific math: validating `y[i]` and producing the
/// per-row triple `(ℓ_increment, observed_weight, working_step)`.
trait LogLinkDiagonalIrlsFamily {
    /// Short, human-readable name used in size-mismatch errors.
    fn family_label(&self) -> &'static str;

    /// Read access to the shared (y, prior weights) buffers.
    fn y(&self) -> &Array1<f64>;
    fn prior_weights(&self) -> &Array1<f64>;

    /// Optional pre-loop validation hook for parameters outside the
    /// (y, weights, eta) triple (e.g. Gamma shape > 0). The default
    /// implementation accepts unconditionally.
    fn validate_self(&self) -> Result<(), String> {
        Ok(())
    }

    /// Validate `y[i]` and return an error message if rejected. This is a
    /// required method with no default: every family states its own
    /// finiteness and domain constraints. `idx` is the row index, available
    /// for use in the error message.
    fn validate_yi(&self, yi: f64, idx: usize) -> Result<(), String>;

    /// Family-specific per-row math; `m = saturated_exp_eta(eta_clamped)`
    /// is computed by the driver and handed in, alongside the clamped linear
    /// predictor `e_clamped` and the row's prior weight `prior_w`.
    fn row_kernel(&self, yi: f64, e_clamped: f64, m: f64, prior_w: f64) -> DiagonalIrlsRow;
}

/// Shared IRLS driver for [`LogLinkDiagonalIrlsFamily`]. Centralises the
/// size-check, η-clamp, saturated-exp, active-clamp guard, ll accumulation,
/// and `FamilyEvaluation` assembly so all log-link families with the diagonal
/// structure (Poisson, Gamma) cannot drift apart numerically.
///
/// For every row the driver validates `y[i]`, clamps the linear predictor to
/// `[-ETA_HARD_CLAMP, ETA_HARD_CLAMP]`, evaluates the saturated exponential at
/// the *clamped* predictor, and calls the family's `row_kernel`. Rows with a
/// zero prior weight or an active clamp get weight `0` and keep the raw
/// predictor as working response; all other rows get the floored observed
/// weight and `z = e + working_step`.
///
/// # Errors
/// Returns a `DimensionMismatch` error when `eta` or the prior weights do not
/// match the length of `y`, and propagates errors from `expect_single_block`,
/// `validate_self`, `validate_yi`, and `BlockWorkingSet::diagonal_checked`.
fn evaluate_log_link_diagonal_irls<F: LogLinkDiagonalIrlsFamily + ?Sized>(
    family: &F,
    block_states: &[ParameterBlockState],
) -> Result<FamilyEvaluation, String> {
    let label = family.family_label();
    let eta = &expect_single_block(block_states, label)?.eta;
    let y = family.y();
    let prior_weights = family.prior_weights();
    let n = y.len();
    if eta.len() != n || prior_weights.len() != n {
        return Err(GamlssError::DimensionMismatch {
            reason: format!("{label} input size mismatch"),
        }
        .into());
    }
    family.validate_self()?;

    let mut ll = 0.0;
    let mut z = Array1::<f64>::zeros(n);
    let mut w = Array1::<f64>::zeros(n);

    for i in 0..n {
        let yi = y[i];
        family.validate_yi(yi, i)?;
        let e_raw = eta[i];
        let e = e_raw.clamp(-ETA_HARD_CLAMP, ETA_HARD_CLAMP);
        let active_clamp = e != e_raw;
        // The kernel contract is `m = saturated_exp_eta(e_clamped)`: the mean
        // must be evaluated at the same clamped predictor that is handed to
        // `row_kernel`, otherwise the (e, m) pair is inconsistent whenever the
        // clamp is active.
        let m = saturated_exp_eta(e);
        let prior_w = prior_weights[i];
        let row = family.row_kernel(yi, e, m, prior_w);
        ll += row.log_lik_increment;
        if prior_w == 0.0 || active_clamp {
            // Zero-weight or clamped rows carry no curvature; the working
            // response is pinned to the raw predictor.
            w[i] = 0.0;
            z[i] = e_raw;
        } else {
            w[i] = floor_positiveweight(row.observed_weight, MIN_WEIGHT);
            z[i] = e + row.working_step;
        }
    }

    Ok(FamilyEvaluation {
        log_likelihood: ll,
        blockworking_sets: vec![BlockWorkingSet::diagonal_checked(z, w)?],
    })
}

impl LogLinkDiagonalIrlsFamily for PoissonLogFamily {
    /// Label used by the shared driver in its error messages.
    fn family_label(&self) -> &'static str {
        "PoissonLogFamily"
    }

    /// Observed responses.
    fn y(&self) -> &Array1<f64> {
        &self.y
    }

    /// Per-row prior weights.
    fn prior_weights(&self) -> &Array1<f64> {
        &self.weights
    }

    /// Accepts only finite, non-negative responses.
    fn validate_yi(&self, yi: f64, idx: usize) -> Result<(), String> {
        let acceptable = yi.is_finite() && yi >= 0.0;
        if acceptable {
            return Ok(());
        }
        let reason =
            format!("PoissonLogFamily requires non-negative finite y; found y[{idx}]={yi}");
        Err(GamlssError::InvalidInput { reason }.into())
    }

    /// Poisson per-row log-likelihood increment, observed weight, and working step.
    #[inline]
    fn row_kernel(&self, yi: f64, e_clamped: f64, m: f64, prior_w: f64) -> DiagonalIrlsRow {
        // Both the mean derivative and the variance equal the mean, each
        // floored by its own constant.
        let mean_slope = m.max(MIN_DERIV);
        let variance = m.max(MIN_PROB);
        let weight_ratio = mean_slope * mean_slope / variance;
        DiagonalIrlsRow {
            // The log(y!) constant is dropped from the objective.
            log_lik_increment: prior_w * (yi * e_clamped - m),
            observed_weight: prior_w * weight_ratio,
            // Residual over the floored mean derivative.
            working_step: (yi - m) / signedwith_floor(mean_slope, MIN_DERIV),
        }
    }
}

impl CustomFamily for PoissonLogFamily {
    /// Evaluates the family by delegating to the shared log-link diagonal
    /// IRLS driver.
    fn evaluate(&self, block_states: &[ParameterBlockState]) -> Result<FamilyEvaluation, String> {
        let evaluation = evaluate_log_link_diagonal_irls(self, block_states)?;
        Ok(evaluation)
    }
}

impl CustomFamilyGenerative for PoissonLogFamily {
    /// Produces the Poisson generative spec with per-row mean
    /// `saturated_exp_eta(eta)`.
    ///
    /// # Errors
    /// Propagates the error from `expect_single_block`.
    fn generativespec(
        &self,
        block_states: &[ParameterBlockState],
    ) -> Result<GenerativeSpec, String> {
        let block = expect_single_block(block_states, "PoissonLogFamily")?;
        let eta = &block.eta;
        let n_rows = eta.len();
        let mean = gamlss_rowwise_map(n_rows, |row| saturated_exp_eta(eta[row]));
        let noise = NoiseModel::Poisson;
        Ok(GenerativeSpec { mean, noise })
    }
}

/// Built-in Gamma log-link family (single parameter block, fixed shape).
///
/// Carries the observed responses, per-row prior weights, and the fixed shape
/// consumed by the shared log-link diagonal IRLS driver.
#[derive(Clone)]
pub struct GammaLogFamily {
    /// Observed responses; each entry must be finite and strictly positive to
    /// pass the family's row validation.
    pub y: Array1<f64>,
    /// Per-row prior weights; must have the same length as `y`.
    pub weights: Array1<f64>,
    /// Fixed Gamma shape; evaluation rejects values that are not finite and > 0.
    pub shape: f64,
}

impl GammaLogFamily {
    /// Index of the family's only parameter block.
    pub const BLOCK_ETA: usize = 0;

    /// Names of the parameter blocks, in block order.
    pub fn parameternames() -> &'static [&'static str] {
        const NAMES: [&str; 1] = ["eta"];
        &NAMES
    }

    /// Link attached to each parameter block, in block order.
    pub fn parameter_links() -> &'static [ParameterLink] {
        const LINKS: [ParameterLink; 1] = [ParameterLink::Log];
        &LINKS
    }

    /// Static descriptor combining the family name with its block names and links.
    pub fn metadata() -> FamilyMetadata {
        let parameternames = Self::parameternames();
        let parameter_links = Self::parameter_links();
        FamilyMetadata {
            name: "gamma_log",
            parameternames,
            parameter_links,
        }
    }
}

impl LogLinkDiagonalIrlsFamily for GammaLogFamily {
    /// Label used by the shared driver in its error messages.
    fn family_label(&self) -> &'static str {
        "GammaLogFamily"
    }

    /// Observed responses.
    fn y(&self) -> &Array1<f64> {
        &self.y
    }

    /// Per-row prior weights.
    fn prior_weights(&self) -> &Array1<f64> {
        &self.weights
    }

    /// Rejects a shape that is not finite and strictly positive.
    fn validate_self(&self) -> Result<(), String> {
        let shape_ok = self.shape.is_finite() && self.shape > 0.0;
        if shape_ok {
            return Ok(());
        }
        let reason = "GammaLogFamily shape must be finite and > 0".to_string();
        Err(GamlssError::NonFinite { reason }.into())
    }

    /// Accepts only finite, strictly positive responses.
    fn validate_yi(&self, yi: f64, idx: usize) -> Result<(), String> {
        let acceptable = yi.is_finite() && yi > 0.0;
        if acceptable {
            return Ok(());
        }
        let reason = format!("GammaLogFamily requires positive finite y; found y[{idx}]={yi}");
        Err(GamlssError::InvalidInput { reason }.into())
    }

    /// Gamma per-row log-likelihood increment, observed weight, and working step.
    ///
    /// # Panics
    /// Panics when `e_clamped` is not finite or when `m` disagrees with
    /// `exp(e_clamped)` beyond a relative tolerance of `1e-8`.
    #[inline]
    fn row_kernel(&self, yi: f64, e_clamped: f64, m: f64, prior_w: f64) -> DiagonalIrlsRow {
        // Driver contract checks: the mean must be the exponential of the
        // clamped predictor.
        assert!(e_clamped.is_finite());
        assert!((e_clamped.exp() - m).abs() <= 1.0e-8 * m.abs().max(1.0));

        let k = self.shape;
        // Gamma(shape = k, scale = mu / k) with eta-independent constants dropped.
        let log_lik_increment = prior_w * (-k * (yi / m + m.ln()));
        // The log link is non-canonical for the Gamma mean, so the exact
        // observed curvature prior_w * k * y / mu is used rather than the
        // Fisher weight prior_w * k; diagonal REML/LAML Hessians then see the
        // true Laplace curvature instead of a PQL/Fisher surrogate.
        let observed_weight = prior_w * k * yi / m;
        let score = prior_w * k * (yi / m - 1.0);
        // The step divides by the floored weight, matching the historical
        // z = e + score / w_floored formula; the floor only engages on the
        // degenerate tail where the observed weight is at or below MIN_WEIGHT.
        let working_step = score / observed_weight.max(MIN_WEIGHT);

        DiagonalIrlsRow {
            log_lik_increment,
            observed_weight,
            working_step,
        }
    }
}

impl CustomFamily for GammaLogFamily {
    /// Evaluates the family by delegating to the shared log-link diagonal
    /// IRLS driver.
    fn evaluate(&self, block_states: &[ParameterBlockState]) -> Result<FamilyEvaluation, String> {
        let evaluation = evaluate_log_link_diagonal_irls(self, block_states)?;
        Ok(evaluation)
    }

    /// Directional derivative of the diagonal working weights along `d_eta`.
    ///
    /// Returns `Ok(None)` for any block other than `BLOCK_ETA`. Rows with a
    /// zero prior weight, an active η clamp, or an observed weight at or below
    /// `MIN_WEIGHT` contribute `0`; every other row contributes
    /// `-observed_weight * d_eta[i]`.
    ///
    /// # Errors
    /// Returns a `DimensionMismatch` error on length disagreement, `NonFinite`
    /// for an invalid shape, and `InvalidInput` for a non-positive or
    /// non-finite response.
    fn diagonalworking_weights_directional_derivative(
        &self,
        block_states: &[ParameterBlockState],
        block_idx: usize,
        d_eta: &Array1<f64>,
    ) -> Result<Option<Array1<f64>>, String> {
        if block_idx != Self::BLOCK_ETA {
            return Ok(None);
        }

        let eta = &expect_single_block(block_states, "GammaLogFamily")?.eta;
        let n = self.y.len();
        let lengths_agree = eta.len() == n && self.weights.len() == n && d_eta.len() == n;
        if !lengths_agree {
            let reason = "GammaLogFamily input size mismatch".to_string();
            return Err(GamlssError::DimensionMismatch { reason }.into());
        }

        let shape_ok = self.shape.is_finite() && self.shape > 0.0;
        if !shape_ok {
            let reason = "GammaLogFamily shape must be finite and > 0".to_string();
            return Err(GamlssError::NonFinite { reason }.into());
        }

        // Starts at zero, so every skipped row below already holds its result.
        let mut dw = Array1::<f64>::zeros(n);
        for i in 0..n {
            let yi = self.y[i];
            let y_ok = yi.is_finite() && yi > 0.0;
            if !y_ok {
                let reason =
                    format!("GammaLogFamily requires positive finite y; found y[{i}]={yi}");
                return Err(GamlssError::InvalidInput { reason }.into());
            }

            let prior_w = self.weights[i];
            let e_raw = eta[i];
            let e = e_raw.clamp(-ETA_HARD_CLAMP, ETA_HARD_CLAMP);
            // Zero-weight rows and rows sitting on the clamp are flat.
            if prior_w == 0.0 || e != e_raw {
                continue;
            }

            let m = safe_exp(e).max(MIN_WEIGHT);
            let observed_weight = prior_w * self.shape * yi / m;
            // When the positive floor is active the evaluated weight is
            // locally constant, so its derivative stays zero.
            if observed_weight <= MIN_WEIGHT {
                continue;
            }
            // d/dη [prior_weight * shape * y / exp(η)] = -W_obs.
            dw[i] = -observed_weight * d_eta[i];
        }
        Ok(Some(dw))
    }
}

impl CustomFamilyGenerative for GammaLogFamily {
    /// Produces the Gamma generative spec with per-row mean
    /// `saturated_exp_eta(eta)` and the family's fixed shape.
    ///
    /// # Errors
    /// Propagates the error from `expect_single_block`.
    fn generativespec(
        &self,
        block_states: &[ParameterBlockState],
    ) -> Result<GenerativeSpec, String> {
        let block = expect_single_block(block_states, "GammaLogFamily")?;
        let eta = &block.eta;
        let n_rows = eta.len();
        let mean = gamlss_rowwise_map(n_rows, |row| saturated_exp_eta(eta[row]));
        let noise = NoiseModel::Gamma { shape: self.shape };
        Ok(GenerativeSpec { mean, noise })
    }
}

/// Built-in binomial location-scale family with a configurable inverse link.
///
/// Parameters:
/// - Block 0: threshold/location T(covariates)
/// - Block 1: log-scale log σ(covariates)
#[derive(Clone)]
pub struct BinomialLocationScaleFamily {
    /// Observed responses.
    pub y: Array1<f64>,
    /// Per-row prior weights.
    pub weights: Array1<f64>,
    /// Inverse link configured for this family.
    pub link_kind: InverseLink,
    /// Cached design for the threshold block, when available. The exact joint
    /// path reports itself as supported only when both cached designs are
    /// present.
    pub threshold_design: Option<DesignMatrix>,
    /// Cached design for the log-sigma block, when available.
    pub log_sigma_design: Option<DesignMatrix>,
    /// Resource policy threaded into PsiDesignMap construction (and any other
    /// per-call materialization decision) made during exact-Newton joint psi
    /// derivative evaluation. Defaults to `ResourcePolicy::default_library()`
    /// when the family is built without an explicit policy.
    pub policy: crate::resource::ResourcePolicy,
}

/// Both Binomial location-scale families plug into the unified
/// [`LocationScaleJointPsiFamily`] trait with byte-identical thin delegations
/// to inherent methods, differing only in the implementing type and its
/// `LABEL` fragment; generate them from one template. The Binomial families do
/// not thread the outer-row subsample (they run the full-data exact ψ path), so
/// the trait's `subsample` argument is accepted but never forwarded: the
/// generated methods assert that it is `None` and panic if a subsample is
/// supplied.
macro_rules! impl_binomial_location_scale_joint_psi_family {
    ($family:ty, $label:literal) => {
        impl LocationScaleJointPsiFamily for $family {
            type Direction = LocationScaleJointPsiDirection;
            const LABEL: &'static str = $label;

            // Resource policy stored on the family.
            fn ws_policy(&self) -> &crate::resource::ResourcePolicy {
                &self.policy
            }

            // Dense block designs, delegated to the inherent method.
            fn ws_exact_joint_dense_block_designs<'a>(
                &'a self,
                specs: Option<&'a [ParameterBlockSpec]>,
            ) -> Result<Option<(Cow<'a, Array2<f64>>, Cow<'a, Array2<f64>>)>, String> {
                self.exact_joint_dense_block_designs(specs)
            }

            // ψ direction for `psi_index`, delegated to the inherent method.
            fn ws_psi_direction(
                &self,
                block_states: &[ParameterBlockState],
                derivative_blocks: &[Vec<crate::custom_family::CustomFamilyBlockPsiDerivative>],
                psi_index: usize,
                design_loc: &Array2<f64>,
                design_scale: &Array2<f64>,
                policy: &crate::resource::ResourcePolicy,
            ) -> Result<Option<LocationScaleJointPsiDirection>, String> {
                self.exact_newton_joint_psi_direction(
                    block_states,
                    derivative_blocks,
                    psi_index,
                    design_loc,
                    design_scale,
                    policy,
                )
            }

            // Second-order ψ terms on the full data; `subsample` must be `None`.
            fn ws_psi_second_order_terms_from_parts(
                &self,
                block_states: &[ParameterBlockState],
                derivative_blocks: &[Vec<crate::custom_family::CustomFamilyBlockPsiDerivative>],
                psi_a: &LocationScaleJointPsiDirection,
                psi_b: &LocationScaleJointPsiDirection,
                design_loc: &Array2<f64>,
                design_scale: &Array2<f64>,
                subsample: Option<&[crate::families::marginal_slope_shared::WeightedOuterRow]>,
            ) -> Result<ExactNewtonJointPsiSecondOrderTerms, String> {
                // Panics if a subsample is supplied: only the full-data path exists.
                assert!(subsample.is_none());
                self.exact_newton_joint_psisecond_order_terms_from_parts(
                    block_states,
                    derivative_blocks,
                    psi_a,
                    psi_b,
                    design_loc,
                    design_scale,
                )
            }

            // Directional derivative of the ψ Hessian on the full data;
            // `subsample` must be `None`.
            fn ws_psi_hessian_directional_from_parts(
                &self,
                block_states: &[ParameterBlockState],
                psi_dir: &LocationScaleJointPsiDirection,
                d_beta_flat: &Array1<f64>,
                design_loc: &Array2<f64>,
                design_scale: &Array2<f64>,
                subsample: Option<&[crate::families::marginal_slope_shared::WeightedOuterRow]>,
            ) -> Result<Array2<f64>, String> {
                // Panics if a subsample is supplied: only the full-data path exists.
                assert!(subsample.is_none());
                self.exact_newton_joint_psihessian_directional_derivative_from_parts(
                    block_states,
                    psi_dir,
                    d_beta_flat,
                    design_loc,
                    design_scale,
                )
            }
        }
    };
}

// Instantiate the shared `LocationScaleJointPsiFamily` impl for the plain
// binomial location-scale family ...
impl_binomial_location_scale_joint_psi_family!(
    BinomialLocationScaleFamily,
    "BinomialLocationScaleFamily"
);
// ... and for its wiggle variant; only the type and the label differ.
impl_binomial_location_scale_joint_psi_family!(
    BinomialLocationScaleWiggleFamily,
    "BinomialLocationScaleWiggleFamily"
);

/// Joint ψ workspace specialised to [`BinomialLocationScaleFamily`].
type BinomialLocationScaleExactNewtonJointPsiWorkspace =
    LocationScaleJointPsiWorkspace<BinomialLocationScaleFamily>;
/// Joint ψ workspace specialised to `BinomialLocationScaleWiggleFamily`.
type BinomialLocationScaleWiggleExactNewtonJointPsiWorkspace =
    LocationScaleJointPsiWorkspace<BinomialLocationScaleWiggleFamily>;

impl BinomialLocationScaleFamily {
    /// Index of the threshold block within `block_states` / `specs`.
    pub const BLOCK_T: usize = 0;
    /// Index of the log-sigma block within `block_states` / `specs`.
    pub const BLOCK_LOG_SIGMA: usize = 1;

    /// Names of the two linear predictors, listed in block-index order
    /// (threshold first, then log-sigma).
    pub fn parameternames() -> &'static [&'static str] {
        const NAMES: [&str; 2] = ["threshold", "log_sigma"];
        &NAMES
    }

    /// Link tag attached to each predictor, in the same order as
    /// `parameternames()`.
    pub fn parameter_links() -> &'static [ParameterLink] {
        const LINKS: [ParameterLink; 2] = [ParameterLink::InverseLink, ParameterLink::Log];
        &LINKS
    }

    /// Static description of the family: its registry name plus the
    /// predictor names and links reported by the two accessors above.
    pub fn metadata() -> FamilyMetadata {
        let parameternames = Self::parameternames();
        let parameter_links = Self::parameter_links();
        FamilyMetadata {
            name: "binomial_location_scale",
            parameternames,
            parameter_links,
        }
    }

    /// Reports whether both the threshold and the log-sigma designs are
    /// cached on the family.
    fn exact_joint_supported(&self) -> bool {
        matches!(
            (&self.threshold_design, &self.log_sigma_design),
            (Some(_), Some(_))
        )
    }

    /// Dense `(threshold, log_sigma)` designs obtained from the designs cached
    /// on the family, via `dense_locscale_block_designs_cached`.
    fn dense_block_designs(&self) -> Result<(Cow<'_, Array2<f64>>, Cow<'_, Array2<f64>>), String> {
        let cached_threshold = self.threshold_design.as_ref();
        let cached_log_sigma = self.log_sigma_design.as_ref();
        let material_policy = self.policy.material_policy();
        dense_locscale_block_designs_cached(
            cached_threshold,
            cached_log_sigma,
            "BinomialLocationScaleFamily",
            "BinomialLocationScale",
            "threshold",
            &material_policy,
        )
    }

    /// Dense `(threshold, log_sigma)` designs recovered from the realized
    /// block `specs`, via `dense_locscale_block_designs_fromspecs`. The helper
    /// is told to expect two specs, indexed by `BLOCK_T` and
    /// `BLOCK_LOG_SIGMA`.
    fn dense_block_designs_fromspecs<'a>(
        &self,
        specs: &'a [ParameterBlockSpec],
    ) -> Result<(Cow<'a, Array2<f64>>, Cow<'a, Array2<f64>>), String> {
        let expected_blocks = 2;
        let material_policy = self.policy.material_policy();
        dense_locscale_block_designs_fromspecs(
            specs,
            expected_blocks,
            "BinomialLocationScaleFamily",
            "BinomialLocationScale",
            Self::BLOCK_T,
            Self::BLOCK_LOG_SIGMA,
            "threshold",
            &material_policy,
        )
    }

    /// Resolve the dense designs for the exact joint path.
    ///
    /// Cached family designs win when both are present; otherwise the designs
    /// are recovered from `specs` when supplied. `Ok(None)` means neither
    /// source is available.
    fn exact_joint_dense_block_designs<'a>(
        &'a self,
        specs: Option<&'a [ParameterBlockSpec]>,
    ) -> Result<Option<(Cow<'a, Array2<f64>>, Cow<'a, Array2<f64>>)>, String> {
        // Exact joint outer rho-derivatives are structurally available for the
        // non-wiggle family as soon as the realized threshold and log-sigma
        // designs can be obtained from *any* source: the family cache first,
        // the realized `specs` second.
        //
        // Falling back to `specs` is a correctness requirement, not a
        // convenience. The coupled profiled derivative is defined through the
        // joint mode system
        //
        //   H u_k = -A_k beta,
        //
        // so whenever the block specs already pin down the realized joint
        // curvature, dropping to a blockwise surrogate merely because the
        // family holds no duplicate dense designs would be mathematically
        // wrong.
        if self.exact_joint_supported() {
            return self.dense_block_designs().map(Some);
        }
        specs
            .map(|realized| self.dense_block_designs_fromspecs(realized))
            .transpose()
    }

    /// Owned `(threshold, log_sigma)` operator designs for the exact joint
    /// path.
    ///
    /// Clones the cached family designs when both exist; otherwise clones the
    /// designs out of `specs` (which must hold exactly two entries). Returns
    /// `Ok(None)` when neither source is available, and an error when the
    /// selected designs do not have one row per observation in `self.y`.
    fn exact_joint_block_designs_owned(
        &self,
        specs: Option<&[ParameterBlockSpec]>,
    ) -> Result<Option<(DesignMatrix, DesignMatrix)>, String> {
        let cached = (
            self.threshold_design.as_ref(),
            self.log_sigma_design.as_ref(),
        );
        let (x_t, x_ls) = match (cached, specs) {
            ((Some(threshold), Some(log_sigma)), _) => (threshold.clone(), log_sigma.clone()),
            (_, Some(realized)) => {
                if realized.len() != 2 {
                    return Err(GamlssError::DimensionMismatch { reason: format!(
                        "BinomialLocationScaleFamily spec-aware operator path expects 2 specs, got {}",
                        realized.len()
                    ) }.into());
                }
                (
                    realized[Self::BLOCK_T].design.clone(),
                    realized[Self::BLOCK_LOG_SIGMA].design.clone(),
                )
            }
            (_, None) => return Ok(None),
        };

        // Both designs must carry one row per observation.
        let n_obs = self.y.len();
        let rows_agree = x_t.nrows() == n_obs && x_ls.nrows() == n_obs;
        if !rows_agree {
            return Err(GamlssError::DimensionMismatch { reason: format!(
                "BinomialLocationScaleFamily operator designs have row mismatch: y={}, threshold={}, log_sigma={}",
                n_obs,
                x_t.nrows(),
                x_ls.nrows()
            ) }.into());
        }
        Ok(Some((x_t, x_ls)))
    }

    /// Evaluate the log-likelihood and its exact joint gradient with respect
    /// to the flattened coefficient vector `[beta_t; beta_ls]`.
    ///
    /// For each row the predictor-space gradients are `-m1 * q_t` and
    /// `-m1 * q_ls`, where `m1` is the first value returned by
    /// `binomial_neglog_q_derivatives_dispatch` and `q_t`, `q_ls` come from
    /// `nonwiggle_q_derivs`. They are pulled back to coefficient space with
    /// `X_t^T` and `X_ls^T` and concatenated, threshold block first.
    ///
    /// # Errors
    /// Returns a dimension-mismatch error when `block_states` does not hold
    /// exactly two blocks, or when the predictors, weights, or design row
    /// counts disagree with `self.y.len()`. Errors from
    /// `binomial_location_scale_core` are propagated.
    ///
    /// # Panics
    /// Panics if any of the arrays read as slices is not contiguous.
    fn exact_newton_joint_gradient_from_designs(
        &self,
        block_states: &[ParameterBlockState],
        x_t: &DesignMatrix,
        x_ls: &DesignMatrix,
    ) -> Result<ExactNewtonJointGradientEvaluation, String> {
        if block_states.len() != 2 {
            return Err(GamlssError::DimensionMismatch {
                reason: format!(
                    "BinomialLocationScaleFamily expects 2 blocks, got {}",
                    block_states.len()
                ),
            }
            .into());
        }
        let n = self.y.len();
        let eta_t = &block_states[Self::BLOCK_T].eta;
        let eta_ls = &block_states[Self::BLOCK_LOG_SIGMA].eta;
        if eta_t.len() != n
            || eta_ls.len() != n
            || self.weights.len() != n
            || x_t.nrows() != n
            || x_ls.nrows() != n
        {
            // Use the same structured error as every other size guard in this
            // impl, and report the offending sizes so the failure is
            // diagnosable.
            return Err(GamlssError::DimensionMismatch {
                reason: format!(
                    "BinomialLocationScaleFamily joint gradient input size mismatch: \
                     y={}, eta_t={}, eta_ls={}, weights={}, threshold rows={}, log_sigma rows={}",
                    n,
                    eta_t.len(),
                    eta_ls.len(),
                    self.weights.len(),
                    x_t.nrows(),
                    x_ls.nrows()
                ),
            }
            .into());
        }

        let core = binomial_location_scale_core(
            &self.y,
            &self.weights,
            eta_t,
            eta_ls,
            None,
            &self.link_kind,
        )?;
        let mut grad_eta_t_v = vec![0.0_f64; n];
        let mut grad_eta_ls_v = vec![0.0_f64; n];
        let y_slice = self.y.as_slice().expect("y must be contiguous");
        let w_slice = self.weights.as_slice().expect("weights must be contiguous");
        let q0_slice = core.q0.as_slice().expect("q0 must be contiguous");
        let sigma_slice = core.sigma.as_slice().expect("sigma must be contiguous");
        let mu_slice = core.mu.as_slice().expect("mu must be contiguous");
        let dmu_slice = core.dmu_dq.as_slice().expect("dmu_dq must be contiguous");
        let d2mu_slice = core
            .d2mu_dq2
            .as_slice()
            .expect("d2mu_dq2 must be contiguous");
        let d3mu_slice = core
            .d3mu_dq3
            .as_slice()
            .expect("d3mu_dq3 must be contiguous");
        let eta_t_slice = eta_t.as_slice().expect("eta_t must be contiguous");
        let link_kind = &self.link_kind;
        // Rows are independent, so both predictor-space gradients are filled
        // in parallel.
        grad_eta_t_v
            .par_iter_mut()
            .zip(grad_eta_ls_v.par_iter_mut())
            .enumerate()
            .for_each(|(i, (g_t, g_ls))| {
                let (m1, _, _) = binomial_neglog_q_derivatives_dispatch(
                    y_slice[i],
                    w_slice[i],
                    q0_slice[i],
                    mu_slice[i],
                    dmu_slice[i],
                    d2mu_slice[i],
                    d3mu_slice[i],
                    link_kind,
                );
                let q0d = nonwiggle_q_derivs(eta_t_slice[i], sigma_slice[i]);
                *g_t = -m1 * q0d.q_t;
                *g_ls = -m1 * q0d.q_ls;
            });
        let grad_eta_t = Array1::from_vec(grad_eta_t_v);
        let grad_eta_ls = Array1::from_vec(grad_eta_ls_v);
        // Pull the predictor-space gradients back to coefficient space.
        let grad_t = x_t.transpose_vector_multiply(&grad_eta_t);
        let grad_ls = x_ls.transpose_vector_multiply(&grad_eta_ls);
        // Flattened layout: threshold coefficients first, then log-sigma.
        let total = grad_t.len() + grad_ls.len();
        let mut gradient = Array1::<f64>::zeros(total);
        gradient.slice_mut(s![0..grad_t.len()]).assign(&grad_t);
        gradient.slice_mut(s![grad_t.len()..total]).assign(&grad_ls);
        Ok(ExactNewtonJointGradientEvaluation {
            log_likelihood: core.log_likelihood,
            gradient,
        })
    }

    /// Joint Hessian using whichever operator designs are available (cached
    /// or taken from `specs`); `Ok(None)` when no designs can be resolved.
    fn exact_newton_joint_hessian_for_specs(
        &self,
        block_states: &[ParameterBlockState],
        specs: Option<&[ParameterBlockSpec]>,
    ) -> Result<Option<Array2<f64>>, String> {
        match self.exact_joint_block_designs_owned(specs)? {
            Some((threshold, log_sigma)) => self.exact_newton_joint_hessian_from_design_matrices(
                block_states,
                &threshold,
                &log_sigma,
            ),
            None => Ok(None),
        }
    }

    /// First directional derivative of the joint Hessian along `d_beta_flat`,
    /// evaluated on the dense designs resolved from the cache or `specs`.
    /// Yields `Ok(None)` when no designs can be resolved.
    fn exact_newton_joint_hessian_directional_derivative_for_specs(
        &self,
        block_states: &[ParameterBlockState],
        specs: Option<&[ParameterBlockSpec]>,
        d_beta_flat: &Array1<f64>,
    ) -> Result<Option<Array2<f64>>, String> {
        match self.exact_joint_dense_block_designs(specs)? {
            Some((threshold, log_sigma)) => self
                .exact_newton_joint_hessian_directional_derivative_from_designs(
                    block_states,
                    &threshold,
                    &log_sigma,
                    d_beta_flat,
                ),
            None => Ok(None),
        }
    }

    /// Mixed second directional derivative of the joint Hessian along the
    /// pair (`d_beta_u_flat`, `d_betav_flat`), evaluated on the dense designs
    /// resolved from the cache or `specs`. Yields `Ok(None)` when no designs
    /// can be resolved.
    fn exact_newton_joint_hessian_second_directional_derivative_for_specs(
        &self,
        block_states: &[ParameterBlockState],
        specs: Option<&[ParameterBlockSpec]>,
        d_beta_u_flat: &Array1<f64>,
        d_betav_flat: &Array1<f64>,
    ) -> Result<Option<Array2<f64>>, String> {
        match self.exact_joint_dense_block_designs(specs)? {
            Some((threshold, log_sigma)) => self
                .exact_newton_joint_hessiansecond_directional_derivative_from_designs(
                    block_states,
                    &threshold,
                    &log_sigma,
                    d_beta_u_flat,
                    d_betav_flat,
                ),
            None => Ok(None),
        }
    }

    /// Joint psi terms for coordinate `psi_index`, evaluated on the dense
    /// designs resolved from the cache or `specs`. Yields `Ok(None)` when no
    /// designs can be resolved.
    fn exact_newton_joint_psi_terms_for_specs(
        &self,
        block_states: &[ParameterBlockState],
        specs: &[ParameterBlockSpec],
        derivative_blocks: &[Vec<crate::custom_family::CustomFamilyBlockPsiDerivative>],
        psi_index: usize,
    ) -> Result<Option<crate::custom_family::ExactNewtonJointPsiTerms>, String> {
        match self.exact_joint_dense_block_designs(Some(specs))? {
            Some((threshold, log_sigma)) => self.exact_newton_joint_psi_terms_from_designs(
                block_states,
                specs,
                derivative_blocks,
                psi_index,
                &threshold,
                &log_sigma,
            ),
            None => Ok(None),
        }
    }

    /// Second-order joint psi terms for the coordinate pair
    /// (`psi_i`, `psi_j`), evaluated on the dense designs resolved from the
    /// cache or `specs`. Yields `Ok(None)` when no designs can be resolved.
    fn exact_newton_joint_psisecond_order_terms_for_specs(
        &self,
        block_states: &[ParameterBlockState],
        specs: &[ParameterBlockSpec],
        derivative_blocks: &[Vec<crate::custom_family::CustomFamilyBlockPsiDerivative>],
        psi_i: usize,
        psi_j: usize,
    ) -> Result<Option<crate::custom_family::ExactNewtonJointPsiSecondOrderTerms>, String> {
        match self.exact_joint_dense_block_designs(Some(specs))? {
            Some((threshold, log_sigma)) => self
                .exact_newton_joint_psisecond_order_terms_from_designs(
                    block_states,
                    derivative_blocks,
                    psi_i,
                    psi_j,
                    &threshold,
                    &log_sigma,
                ),
            None => Ok(None),
        }
    }

    /// Directional derivative (along `d_beta_flat`) of the psi-Hessian for
    /// coordinate `psi_index`, evaluated on the dense designs resolved from
    /// the cache or `specs`. Yields `Ok(None)` when no designs can be
    /// resolved.
    fn exact_newton_joint_psihessian_directional_derivative_for_specs(
        &self,
        block_states: &[ParameterBlockState],
        specs: &[ParameterBlockSpec],
        derivative_blocks: &[Vec<crate::custom_family::CustomFamilyBlockPsiDerivative>],
        psi_index: usize,
        d_beta_flat: &Array1<f64>,
    ) -> Result<Option<Array2<f64>>, String> {
        match self.exact_joint_dense_block_designs(Some(specs))? {
            Some((threshold, log_sigma)) => self
                .exact_newton_joint_psihessian_directional_derivative_from_designs(
                    block_states,
                    derivative_blocks,
                    psi_index,
                    d_beta_flat,
                    &threshold,
                    &log_sigma,
                ),
            None => Ok(None),
        }
    }

    /// Per-row curvature weights `(D_tt, D_tl, D_ll)` consumed by both the
    /// dense joint Hessian assembly and the matrix-free workspace.
    ///
    /// With `r = 1 / sigma`, `kappa = (dsigma/deta) / sigma` and `(m1, m2)`
    /// the first two values from `binomial_neglog_q_derivatives_dispatch`,
    /// each row gets
    ///
    ///   D_tt = m2 r^2,
    ///   D_tl = kappa r (m1 + q m2),
    ///   D_ll = kappa^2 q (m1 + q m2).
    ///
    /// Fails with a dimension-mismatch error when `block_states` does not hold
    /// two blocks or the predictor / weight lengths differ from `self.y`.
    /// Panics if any array read as a slice is not contiguous.
    fn exact_newton_joint_hessian_row_coefficients(
        &self,
        block_states: &[ParameterBlockState],
    ) -> Result<(Array1<f64>, Array1<f64>, Array1<f64>), String> {
        let block_count = block_states.len();
        if block_count != 2 {
            return Err(GamlssError::DimensionMismatch {
                reason: format!(
                    "BinomialLocationScaleFamily expects 2 blocks, got {}",
                    block_count
                ),
            }
            .into());
        }
        let rows = self.y.len();
        let eta_t = &block_states[Self::BLOCK_T].eta;
        let eta_ls = &block_states[Self::BLOCK_LOG_SIGMA].eta;
        let sizes_agree =
            eta_t.len() == rows && eta_ls.len() == rows && self.weights.len() == rows;
        if !sizes_agree {
            return Err(GamlssError::DimensionMismatch {
                reason: "BinomialLocationScaleFamily input size mismatch".to_string(),
            }
            .into());
        }

        let core = binomial_location_scale_core(
            &self.y,
            &self.weights,
            eta_t,
            eta_ls,
            None,
            &self.link_kind,
        )?;

        // Flat views so the parallel closure can index rows directly.
        let ys = self.y.as_slice().expect("y must be contiguous");
        let ws = self.weights.as_slice().expect("weights must be contiguous");
        let qs = core.q0.as_slice().expect("q0 must be contiguous");
        let sigmas = core.sigma.as_slice().expect("sigma must be contiguous");
        let dsigmas = core
            .dsigma_deta
            .as_slice()
            .expect("dsigma_deta must be contiguous");
        let mus = core.mu.as_slice().expect("mu must be contiguous");
        let dmus = core.dmu_dq.as_slice().expect("dmu_dq must be contiguous");
        let d2mus = core
            .d2mu_dq2
            .as_slice()
            .expect("d2mu_dq2 must be contiguous");
        let d3mus = core
            .d3mu_dq3
            .as_slice()
            .expect("d3mu_dq3 must be contiguous");
        let link_kind = &self.link_kind;

        let mut d_tt = vec![0.0_f64; rows];
        let mut d_tl = vec![0.0_f64; rows];
        let mut d_ll = vec![0.0_f64; rows];
        d_tt.par_iter_mut()
            .zip(d_tl.par_iter_mut())
            .zip(d_ll.par_iter_mut())
            .enumerate()
            .for_each(|(row, ((out_tt, out_tl), out_ll))| {
                let q = qs[row];
                let sigma = sigmas[row];
                let r = 1.0 / sigma;
                let kappa = dsigmas[row] / sigma;
                let (m1, m2, _) = binomial_neglog_q_derivatives_dispatch(
                    ys[row],
                    ws[row],
                    q,
                    mus[row],
                    dmus[row],
                    d2mus[row],
                    d3mus[row],
                    link_kind,
                );
                // Factor shared by the cross and log-sigma coefficients.
                let coupled = m1 + q * m2;
                *out_tt = m2 * r * r;
                *out_tl = kappa * r * coupled;
                *out_ll = kappa * kappa * q * coupled;
            });
        Ok((
            Array1::from_vec(d_tt),
            Array1::from_vec(d_tl),
            Array1::from_vec(d_ll),
        ))
    }

    /// Diagonal Hessian blocks `(h_tt, h_ll)` only, for `evaluate()` to fill
    /// per-block working sets without building the dense p×p joint matrix.
    /// The cross-block row coefficients are computed but deliberately unused.
    fn exact_newton_block_diagonal_hessians_from_design_matrices(
        &self,
        block_states: &[ParameterBlockState],
        x_t: &DesignMatrix,
        x_ls: &DesignMatrix,
    ) -> Result<(Array2<f64>, Array2<f64>), String> {
        let (d_tt, _, d_ll) = self.exact_newton_joint_hessian_row_coefficients(block_states)?;
        Ok((
            xt_diag_x_design(x_t, &d_tt)?,
            xt_diag_x_design(x_ls, &d_ll)?,
        ))
    }

    /// Dense joint coefficient-space Hessian assembled from dense designs.
    ///
    /// Always returns `Some` on success; the matrix is symmetric with the
    /// threshold block first and the log-sigma block second.
    fn exact_newton_joint_hessian_from_designs(
        &self,
        block_states: &[ParameterBlockState],
        x_t: &Array2<f64>,
        x_ls: &Array2<f64>,
    ) -> Result<Option<Array2<f64>>, String> {
        // Why the *joint* matrix is required
        // ----------------------------------
        // At the fitted mode the joint outer smoothing sensitivity solves
        //
        //   H u_k = -g_k,   g_k = A_k beta,
        //
        // with `H` the full joint working-curvature matrix. The likelihood of
        // this family couples the two predictors through
        //
        //   q = -eta_t * exp(-eta_ls),
        //
        // so the threshold and log-sigma blocks are not independent even when
        // the penalties are block-diagonal.
        //
        // Derivation (written for the probit / exp-sigma case)
        // -----------------------------------------------------
        // For row i let
        //
        //   t_i = x_i^T beta_t,      s_i = z_i^T beta_ls,
        //   r_i = exp(-s_i),         q_i = -t_i r_i,
        //   F_i(q) = -w_i [ y_i log Phi(q) + (1-y_i) log(1-Phi(q)) ],
        //   m1_i = F_i'(q_i),        m2_i = F_i''(q_i).
        //
        // The q-derivatives with respect to the two predictors are
        //
        //   q_t = -r,  q_ls = -q,  q_tt = 0,  q_t,ls = r,  q_ls,ls = q,
        //
        // and any scalar composition G(t,s) = F(q(t,s)) has Hessian
        // coefficients
        //
        //   G_ab = m2 q_a q_b + m1 q_ab.
        //
        // Hence the rowwise curvature in (eta_t, eta_ls) is
        //
        //   coeff_tt    = m2 r^2,
        //   coeff_t,ls  = r (m1 + q m2),
        //   coeff_ls,ls = q (m1 + q m2).
        //
        // The coefficients actually used below come from
        // `exact_newton_joint_hessian_row_coefficients`, which additionally
        // carries the factor kappa = (dsigma/deta)/sigma on the log-sigma
        // direction; kappa equals 1 whenever dsigma/deta = sigma, which
        // recovers the formulas above.
        //
        // Assembly
        // --------
        //   H_tt    = X_t^T  diag(coeff_tt)    X_t,
        //   H_t,ls  = X_t^T  diag(coeff_t,ls)  X_ls,
        //   H_ls,ls = X_ls^T diag(coeff_ls,ls) X_ls.
        //
        // The off-diagonal block is generally nonzero; it is precisely the
        // coupling that the broken blockwise outer-gradient path dropped.
        let (d_tt, d_tl, d_ll) = self.exact_newton_joint_hessian_row_coefficients(block_states)?;

        let block_tt = xt_diag_x_dense(x_t, &d_tt)?;
        let block_tl = xt_diag_y_dense(x_t, &d_tl, x_ls)?;
        let block_ll = xt_diag_x_dense(x_ls, &d_ll)?;

        // Fill the upper triangle blockwise, then reflect it.
        let p_t = x_t.ncols();
        let dim = p_t + x_ls.ncols();
        let mut joint = Array2::<f64>::zeros((dim, dim));
        joint.slice_mut(s![..p_t, ..p_t]).assign(&block_tt);
        joint.slice_mut(s![..p_t, p_t..]).assign(&block_tl);
        joint.slice_mut(s![p_t.., p_t..]).assign(&block_ll);
        mirror_upper_to_lower(&mut joint);
        Ok(Some(joint))
    }

    /// Same joint Hessian assembly as the dense-array variant, but taking
    /// `DesignMatrix` operators and the `*_design` product helpers.
    ///
    /// Always returns `Some` on success.
    fn exact_newton_joint_hessian_from_design_matrices(
        &self,
        block_states: &[ParameterBlockState],
        x_t: &DesignMatrix,
        x_ls: &DesignMatrix,
    ) -> Result<Option<Array2<f64>>, String> {
        let (d_tt, d_tl, d_ll) = self.exact_newton_joint_hessian_row_coefficients(block_states)?;

        let block_tt = xt_diag_x_design(x_t, &d_tt)?;
        let block_tl = xt_diag_y_design(x_t, &d_tl, x_ls)?;
        let block_ll = xt_diag_x_design(x_ls, &d_ll)?;

        // Fill the upper triangle blockwise, then reflect it.
        let p_t = x_t.ncols();
        let dim = p_t + x_ls.ncols();
        let mut joint = Array2::<f64>::zeros((dim, dim));
        joint.slice_mut(s![..p_t, ..p_t]).assign(&block_tt);
        joint.slice_mut(s![..p_t, p_t..]).assign(&block_tl);
        joint.slice_mut(s![p_t.., p_t..]).assign(&block_ll);
        mirror_upper_to_lower(&mut joint);
        Ok(Some(joint))
    }

    /// Exact first directional derivative `D_beta H_L[u]` of the joint
    /// likelihood curvature along the flattened direction `d_beta_flat`.
    ///
    /// `d_beta_flat` is read as `[u_t; u_ls]`, with `u_t` of length
    /// `x_t.ncols()` and `u_ls` of length `x_ls.ncols()`. On success the
    /// result is always `Some`, holding a symmetric square matrix of order
    /// `x_t.ncols() + x_ls.ncols()`.
    ///
    /// # Errors
    /// Returns a dimension-mismatch error when `block_states` does not hold
    /// two blocks, when predictor / weight lengths or design row counts
    /// differ from `self.y.len()`, or when `d_beta_flat` has the wrong
    /// length. Errors from the core evaluation and the weighted
    /// cross-product helpers are propagated.
    fn exact_newton_joint_hessian_directional_derivative_from_designs(
        &self,
        block_states: &[ParameterBlockState],
        x_t: &Array2<f64>,
        x_ls: &Array2<f64>,
        d_beta_flat: &Array1<f64>,
    ) -> Result<Option<Array2<f64>>, String> {
        // Exact first directional derivative D_beta H_L[u] of the joint
        // likelihood curvature.
        //
        // Write
        //
        //   t  = X_t beta_t,
        //   ls = X_ls beta_ls,
        //   s  = exp(-ls),
        //   q  = -t .* s.
        //
        // For a full coefficient-space direction
        //
        //   u = (u_t, u_ls),
        //   xi_t  = X_t u_t,
        //   xi_ls = X_ls u_ls,
        //
        // the induced q-direction is
        //
        //   alpha = D q[u] = -s .* xi_t - q .* xi_ls.
        //
        // The joint diagonal-working-curvature likelihood matrix is
        //
        //   H_L = J^T W J,
        //   J_t  = -diag(s) X_t,
        //   J_ls = -diag(q) X_ls.
        //
        // Differentiating once gives
        //
        //   D_beta H_L[u]
        //   = K[u]^T W J
        //     + J^T W K[u]
        //     + J^T diag(nu .* alpha) J,
        //
        // where
        //
        //   K_t[u]  = diag(s .* xi_ls) X_t,
        //   K_ls[u] = diag(s .* xi_t + q .* xi_ls) X_ls,
        //
        // and `nu = d'''(q)` is the third derivative of the scalar row loss.
        // This is exactly the joint curvature drift that enters the profiled
        // derivative through
        //
        //   dot H_k = A_k + D_beta H_L[u_k],
        //   dJ/drho_k
        //   = 0.5 beta^T A_k beta
        //     + 0.5 tr(H^{-1} dot H_k)
        //     - 0.5 tr(S^+ A_k).
        if block_states.len() != 2 {
            return Err(GamlssError::DimensionMismatch {
                reason: format!(
                    "BinomialLocationScaleFamily expects 2 blocks, got {}",
                    block_states.len()
                ),
            }
            .into());
        }
        let n = self.y.len();
        let eta_t = &block_states[Self::BLOCK_T].eta;
        let eta_ls = &block_states[Self::BLOCK_LOG_SIGMA].eta;
        if eta_t.len() != n || eta_ls.len() != n || self.weights.len() != n {
            return Err(GamlssError::DimensionMismatch {
                reason: "BinomialLocationScaleFamily input size mismatch".to_string(),
            }
            .into());
        }
        // The predictor directions X u are combined rowwise with the core
        // quantities, so both designs must have exactly one row per
        // observation. Reject a mismatch here instead of letting it reach the
        // coefficient helper, which has no way to report an error.
        if x_t.nrows() != n || x_ls.nrows() != n {
            return Err(GamlssError::DimensionMismatch {
                reason: format!(
                    "BinomialLocationScaleFamily joint Hessian directional derivative design row mismatch: y={}, threshold={}, log_sigma={}",
                    n,
                    x_t.nrows(),
                    x_ls.nrows()
                ),
            }
            .into());
        }

        let pt = x_t.ncols();
        let pls = x_ls.ncols();
        if d_beta_flat.len() != pt + pls {
            return Err(GamlssError::DimensionMismatch {
                reason: format!(
                    "BinomialLocationScaleFamily joint d_beta length mismatch: got {}, expected {}",
                    d_beta_flat.len(),
                    pt + pls
                ),
            }
            .into());
        }
        // Predictor-space directions xi_t = X_t u_t and xi_ls = X_ls u_ls.
        let d_eta_t = fast_av(x_t, &d_beta_flat.slice(s![0..pt]));
        let d_eta_ls = fast_av(x_ls, &d_beta_flat.slice(s![pt..pt + pls]));
        let core = binomial_location_scale_core(
            &self.y,
            &self.weights,
            eta_t,
            eta_ls,
            None,
            &self.link_kind,
        )?;
        let (coeff_tt, coeff_tl, coeff_ll) = binomial_location_scale_first_directional_coefficients(
            &self.y,
            &self.weights,
            &core,
            &d_eta_t,
            &d_eta_ls,
            &self.link_kind,
        );

        let d_h_tt = xt_diag_x_dense(x_t, &coeff_tt)?;
        let d_h_tl = xt_diag_y_dense(x_t, &coeff_tl, x_ls)?;
        let d_h_ll = xt_diag_x_dense(x_ls, &coeff_ll)?;
        // Fill the upper triangle blockwise, then reflect it.
        let total = pt + pls;
        let mut d_h = Array2::<f64>::zeros((total, total));
        d_h.slice_mut(s![0..pt, 0..pt]).assign(&d_h_tt);
        d_h.slice_mut(s![0..pt, pt..total]).assign(&d_h_tl);
        d_h.slice_mut(s![pt..total, pt..total]).assign(&d_h_ll);
        mirror_upper_to_lower(&mut d_h);
        Ok(Some(d_h))
    }

    /// Exact mixed second directional derivative `D_beta^2 H_L[u, v]` of the
    /// joint likelihood curvature along the flattened directions
    /// `d_beta_u_flat` and `d_betav_flat`.
    ///
    /// Each direction is read as `[threshold part; log-sigma part]`, with the
    /// threshold part of length `x_t.ncols()`. On success the result is
    /// always `Some`, holding a symmetric square matrix of order
    /// `x_t.ncols() + x_ls.ncols()`.
    ///
    /// # Errors
    /// Returns a dimension-mismatch error when `block_states` does not hold
    /// two blocks, when predictor / weight lengths or design row counts
    /// differ from `self.y.len()`, or when either direction has the wrong
    /// length. Errors from the core evaluation, the coefficient helper and
    /// the weighted cross-product helpers are propagated.
    fn exact_newton_joint_hessiansecond_directional_derivative_from_designs(
        &self,
        block_states: &[ParameterBlockState],
        x_t: &Array2<f64>,
        x_ls: &Array2<f64>,
        d_beta_u_flat: &Array1<f64>,
        d_betav_flat: &Array1<f64>,
    ) -> Result<Option<Array2<f64>>, String> {
        // Exact mixed second directional derivative D_beta^2 H_L[u, v].
        //
        // This is the family-specific part of the total second curvature drift
        //
        //   ddot H_{k,l}
        //   = B_{k,l}
        //     + D_beta H_L[u_{k,l}]
        //     + D_beta^2 H_L[u_l, u_k],
        //
        // used in the profiled outer Hessian
        //
        //   d^2J/(drho_k drho_l)
        //   = u_l^T A_k beta
        //     + 0.5 beta^T B_{k,l} beta
        //     + 0.5 tr(H^{-1} ddot H_{k,l})
        //     - 0.5 tr(H^{-1} dot H_l H^{-1} dot H_k)
        //     - 0.5 d^2/drho_k drho_l log|S|_+.
        //
        // For directions
        //
        //   u = (u_t, u_ls),  v = (v_t, v_ls),
        //
        // define the rowwise predictor perturbations
        //
        //   xi_t^(u)  = X_t u_t,    xi_ls^(u)  = X_ls u_ls,
        //   xi_t^(v)  = X_t v_t,    xi_ls^(v)  = X_ls v_ls.
        //
        // With the exact exp sigma link,
        //
        //   s = exp(-eta_ls),
        //   q = -eta_t .* s,
        //
        // the first and second q-drifts are
        //
        //   alpha(u)   = D q[u]   = -s .* xi_t^(u) - q .* xi_ls^(u),
        //   alpha(v)   = D q[v]   = -s .* xi_t^(v) - q .* xi_ls^(v),
        //   alpha(u,v) = D^2 q[u,v]
        //              = s .* (xi_t^(u) .* xi_ls^(v) + xi_t^(v) .* xi_ls^(u))
        //                + q .* xi_ls^(u) .* xi_ls^(v).
        //
        // Differentiating the scalar-composition Hessian coefficients twice
        // yields the rowwise formulas below. Those formulas are exactly the
        // fourth-order beta-curvature contraction needed to make the joint
        // rho-Hessian path consistent with the first-order joint solve.
        if block_states.len() != 2 {
            return Err(GamlssError::DimensionMismatch {
                reason: format!(
                    "BinomialLocationScaleFamily expects 2 blocks, got {}",
                    block_states.len()
                ),
            }
            .into());
        }
        let n = self.y.len();
        let eta_t = &block_states[Self::BLOCK_T].eta;
        let eta_ls = &block_states[Self::BLOCK_LOG_SIGMA].eta;
        if eta_t.len() != n || eta_ls.len() != n || self.weights.len() != n {
            return Err(GamlssError::DimensionMismatch {
                reason: "BinomialLocationScaleFamily input size mismatch".to_string(),
            }
            .into());
        }
        // The predictor perturbations X u and X v are combined rowwise with
        // the core quantities, so both designs must have exactly one row per
        // observation. Reject a mismatch before any product is formed.
        if x_t.nrows() != n || x_ls.nrows() != n {
            return Err(GamlssError::DimensionMismatch {
                reason: format!(
                    "BinomialLocationScaleFamily joint Hessian second directional derivative design row mismatch: y={}, threshold={}, log_sigma={}",
                    n,
                    x_t.nrows(),
                    x_ls.nrows()
                ),
            }
            .into());
        }

        let pt = x_t.ncols();
        let pls = x_ls.ncols();
        let total = pt + pls;
        if d_beta_u_flat.len() != total {
            return Err(GamlssError::DimensionMismatch { reason: format!(
                "BinomialLocationScaleFamily joint d_beta_u length mismatch: got {}, expected {}",
                d_beta_u_flat.len(),
                total
            ) }.into());
        }
        if d_betav_flat.len() != total {
            return Err(GamlssError::DimensionMismatch { reason: format!(
                "BinomialLocationScaleFamily joint d_betav length mismatch: got {}, expected {}",
                d_betav_flat.len(),
                total
            ) }.into());
        }
        // Predictor-space perturbations for both directions.
        let d_eta_t_u = fast_av(x_t, &d_beta_u_flat.slice(s![0..pt]));
        let d_eta_ls_u = fast_av(x_ls, &d_beta_u_flat.slice(s![pt..total]));
        let d_eta_tv = fast_av(x_t, &d_betav_flat.slice(s![0..pt]));
        let d_eta_lsv = fast_av(x_ls, &d_betav_flat.slice(s![pt..total]));
        let core = binomial_location_scale_core(
            &self.y,
            &self.weights,
            eta_t,
            eta_ls,
            None,
            &self.link_kind,
        )?;
        let (coeff_tt, coeff_tl, coeff_ll) =
            binomial_location_scalesecond_directional_coefficients(
                &self.y,
                &self.weights,
                &core,
                &d_eta_t_u,
                &d_eta_ls_u,
                &d_eta_tv,
                &d_eta_lsv,
                &self.link_kind,
            )?;

        let d2_h_tt = xt_diag_x_dense(x_t, &coeff_tt)?;
        let d2_h_tl = xt_diag_y_dense(x_t, &coeff_tl, x_ls)?;
        let d2_h_ll = xt_diag_x_dense(x_ls, &coeff_ll)?;
        // Fill the upper triangle blockwise, then reflect it.
        let mut d2_h = Array2::<f64>::zeros((total, total));
        d2_h.slice_mut(s![0..pt, 0..pt]).assign(&d2_h_tt);
        d2_h.slice_mut(s![0..pt, pt..total]).assign(&d2_h_tl);
        d2_h.slice_mut(s![pt..total, pt..total]).assign(&d2_h_ll);
        mirror_upper_to_lower(&mut d2_h);
        Ok(Some(d2_h))
    }

    /// Build the joint psi direction for coordinate `psi_index` by repackaging
    /// the parts returned by `locscale_joint_psi_direction_parts`.
    ///
    /// Returns `Ok(None)` when the helper yields no parts; helper errors are
    /// propagated unchanged.
    fn exact_newton_joint_psi_direction(
        &self,
        block_states: &[ParameterBlockState],
        derivative_blocks: &[Vec<crate::custom_family::CustomFamilyBlockPsiDerivative>],
        psi_index: usize,
        x_t: &Array2<f64>,
        x_ls: &Array2<f64>,
        policy: &crate::resource::ResourcePolicy,
    ) -> Result<Option<LocationScaleJointPsiDirection>, String> {
        let resolved = locscale_joint_psi_direction_parts(
            block_states,
            derivative_blocks,
            psi_index,
            self.y.len(),
            x_t.ncols(),
            x_ls.ncols(),
            Self::BLOCK_T,
            Self::BLOCK_LOG_SIGMA,
            2,
            "BinomialLocationScaleFamily",
            "threshold",
            policy,
        )?;
        // The "primary" block of the generic helper is the threshold block.
        Ok(resolved.map(|parts| LocationScaleJointPsiDirection {
            block_idx: parts.block_idx,
            local_idx: parts.local_idx,
            x_primary_psi: parts.primary_psi,
            x_ls_psi: parts.log_sigma_psi,
            z_primary_psi: parts.primary_z,
            z_ls_psi: parts.log_sigma_z,
        }))
    }

    /// Second-order design drifts for the psi pair (`psi_a`, `psi_b`),
    /// delegated to `locscale_joint_psisecond_design_drifts` with this
    /// family's dimensions, block indices, labels and resource policy.
    fn exact_newton_joint_psisecond_design_drifts(
        &self,
        block_states: &[ParameterBlockState],
        derivative_blocks: &[Vec<crate::custom_family::CustomFamilyBlockPsiDerivative>],
        psi_a: &LocationScaleJointPsiDirection,
        psi_b: &LocationScaleJointPsiDirection,
        x_t: &Array2<f64>,
        x_ls: &Array2<f64>,
    ) -> Result<LocationScaleJointPsiSecondDrifts, String> {
        // Family-specific configuration consumed by the shared helper.
        let config = LocScalePsiDriftConfig {
            n: self.y.len(),
            p_primary: x_t.ncols(),
            p_log_sigma: x_ls.ncols(),
            primary_block_idx: Self::BLOCK_T,
            log_sigma_block_idx: Self::BLOCK_LOG_SIGMA,
            family_name: "BinomialLocationScaleFamily",
            primary_label: "threshold",
            policy: &self.policy,
        };
        locscale_joint_psisecond_design_drifts(
            block_states,
            derivative_blocks,
            psi_a,
            psi_b,
            config,
        )
    }

    /// Builds the likelihood-only first-order psi terms (objective, score and
    /// Hessian drift) of the two-block family for the single psi coordinate
    /// `psi_index`, in the flattened coefficient space `[beta_t; beta_ls]`.
    ///
    /// Returns `Ok(None)` when `exact_newton_joint_psi_direction` yields no
    /// direction for `psi_index`. When
    /// `build_two_block_custom_family_joint_psi_operator_from_actions` returns
    /// an operator, `hessian_psi` is a 0x0 placeholder and the operator is
    /// returned instead; otherwise the dense `(pt + pls) x (pt + pls)` matrix
    /// is assembled here.
    ///
    /// # Errors
    /// Returns a `DimensionMismatch` error when `block_states`, `specs` or
    /// `derivative_blocks` do not have exactly two entries, or when the
    /// predictor or weight lengths differ from `self.y.len()`. Errors from the
    /// core evaluation, the direction lookup and the cross-product helpers are
    /// propagated.
    ///
    /// # Panics
    /// Panics if any per-row array cannot be viewed as a contiguous slice
    /// (the `as_slice().expect(..)` calls below).
    fn exact_newton_joint_psi_terms_from_designs(
        &self,
        block_states: &[ParameterBlockState],
        specs: &[ParameterBlockSpec],
        derivative_blocks: &[Vec<crate::custom_family::CustomFamilyBlockPsiDerivative>],
        psi_index: usize,
        x_t: &Array2<f64>,
        x_ls: &Array2<f64>,
    ) -> Result<Option<crate::custom_family::ExactNewtonJointPsiTerms>, String> {
        // `specs` is only used for this shape check; the rest of the body never
        // reads it.
        if block_states.len() != 2 {
            return Err(GamlssError::DimensionMismatch {
                reason: format!(
                    "BinomialLocationScaleFamily expects 2 blocks, got {}",
                    block_states.len()
                ),
            }
            .into());
        }
        if specs.len() != 2 || derivative_blocks.len() != 2 {
            return Err(GamlssError::DimensionMismatch { reason: format!(
                "BinomialLocationScaleFamily joint psi terms expect 2 specs and 2 derivative blocks, got {} and {}",
                specs.len(),
                derivative_blocks.len()
            ) }.into());
        }
        let n = self.y.len();
        let eta_t = &block_states[Self::BLOCK_T].eta;
        let eta_ls = &block_states[Self::BLOCK_LOG_SIGMA].eta;
        if eta_t.len() != n || eta_ls.len() != n || self.weights.len() != n {
            return Err(GamlssError::DimensionMismatch {
                reason: "BinomialLocationScaleFamily input size mismatch".to_string(),
            }
            .into());
        }

        // Joint fixed-beta psi terms for the coupled 2-block probit model.
        //
        // We work over the flattened coefficient vector beta = [beta_t; beta_ls]
        // and one realized spatial coordinate psi_a. The exact profiled/Laplace
        // outer calculus needs the family-side explicit objects
        //
        //   V_psi^explicit,  g_psi^explicit,  H_psi^explicit,
        //
        // all in this flattened coefficient space. These are likelihood-only
        // objects:
        //
        //   D_psi, D_{beta psi}, D_{beta beta psi}
        //
        // Generic exact-joint code adds the realized penalty motion
        //
        //   0.5 beta^T S_psi beta,  S_psi beta,  S_psi
        //
        // when forming V_i, g_i, H_i. Keeping the family hook likelihood-only
        // is what makes the unified S(theta) outer calculus correct for both
        // psi-moving designs and psi-moving penalties.
        //
        // Model:
        //   eta_t  = X_t beta_t,
        //   eta_ls = X_ls beta_ls,
        //   r      = exp(-eta_ls),
        //   q      = -eta_t .* r.
        //
        // A single realized psi_a may move either block design, so define the
        // fixed-beta predictor drifts
        //
        //   z_t  = X_{t,psi}  beta_t   (zero if psi_a is not a threshold psi)
        //   z_ls = X_{ls,psi} beta_ls  (zero if psi_a is not a log-sigma psi).
        //
        // Then the explicit q-drift is
        //
        //   q_psi = -r .* z_t - q .* z_ls.
        //
        // Rowwise scalar derivatives of the negative Bernoulli-probit loss are
        //
        //   a = dF/dq,
        //   b = d²F/dq²,
        //   c = d³F/dq³.
        //
        // Predictor-space score pieces:
        //
        //   r_t  = dF/deta_t  = -a r,
        //   r_ls = dF/deta_ls = -a q.
        //
        // Their explicit psi derivatives at fixed beta are
        //
        //   d_psi r_t  = -b q_psi r + a r z_ls,
        //   d_psi r_ls = -(a + q b) q_psi.
        //
        // Hence the exact joint score derivative is
        //
        //   g_psi
        //   = [ X_{t,psi}^T r_t  + X_t^T d_psi r_t,
        //       X_{ls,psi}^T r_ls + X_ls^T d_psi r_ls ].
        //
        // The exact envelope term is
        //
        //   V_psi^explicit = r_t^T z_t + r_ls^T z_ls.
        //
        // For the Laplace trace we also need the explicit Hessian drift. The
        // joint exact Hessian has block coefficients
        //
        //   h_tt = b r²,
        //   h_tl = r (a + q b),
        //   h_ll = q (a + q b),
        //
        // so differentiating those coefficients at fixed beta gives
        //
        //   d_psi h_tt = r² (c q_psi - 2 b z_ls),
        //   d_psi h_tl = r [ (2 b + c q) q_psi - (a + q b) z_ls ],
        //   d_psi h_ll = (a + 3 q b + q² c) q_psi.
        //
        // The full joint explicit Hessian drift is then
        //
        //   H_tt,psi
        //   = X_{t,psi}^T diag(h_tt) X_t
        //     + X_t^T diag(h_tt) X_{t,psi}
        //     + X_t^T diag(d_psi h_tt) X_t,
        //
        //   H_tl,psi
        //   = X_{t,psi}^T diag(h_tl) X_ls
        //     + X_t^T diag(h_tl) X_{ls,psi}
        //     + X_t^T diag(d_psi h_tl) X_ls,
        //
        //   H_ll,psi
        //   = X_{ls,psi}^T diag(h_ll) X_ls
        //     + X_ls^T diag(h_ll) X_{ls,psi}
        //     + X_ls^T diag(d_psi h_ll) X_ls.
        //
        // Even when only one block moves explicitly, the resulting score and
        // Hessian objects are joint because q couples eta_t and eta_ls.
        let core = binomial_location_scale_core(
            &self.y,
            &self.weights,
            eta_t,
            eta_ls,
            None,
            &self.link_kind,
        )?;
        let pt = x_t.ncols();
        let pls = x_ls.ncols();
        let total = pt + pls;
        // No direction for this psi coordinate means there is nothing to
        // contribute; the core evaluation above has already run by this point.
        let Some(dir_a) = self.exact_newton_joint_psi_direction(
            block_states,
            derivative_blocks,
            psi_index,
            x_t,
            x_ls,
            &self.policy,
        )?
        else {
            return Ok(None);
        };
        let (z_t, z_ls) = (&dir_a.z_primary_psi, &dir_a.z_ls_psi);

        // Per-row scalars assembled in parallel. The probit/inverse-link
        // derivatives are O(n) at biobank scale and are called O(K) times per
        // outer REML gradient (K = number of psi coords), so a parallel pass is
        // worthwhile here.
        struct PsiTermsRow {
            r_t: f64,
            r_ls: f64,
            dr_t: f64,
            dr_ls: f64,
            h_tt: f64,
            h_tl: f64,
            h_ll: f64,
            dh_tt: f64,
            dh_tl: f64,
            dh_ll: f64,
            obj: f64,
        }
        // Plain slices so the parallel closure indexes raw `f64` buffers.
        let y_p = self.y.as_slice().expect("y must be contiguous");
        let w_p = self.weights.as_slice().expect("weights must be contiguous");
        let q0_p = core.q0.as_slice().expect("q0 must be contiguous");
        let sigma_p = core.sigma.as_slice().expect("sigma must be contiguous");
        let dsigma_p = core
            .dsigma_deta
            .as_slice()
            .expect("dsigma_deta must be contiguous");
        let mu_p = core.mu.as_slice().expect("mu must be contiguous");
        let dmu_p = core.dmu_dq.as_slice().expect("dmu_dq must be contiguous");
        let d2mu_p = core
            .d2mu_dq2
            .as_slice()
            .expect("d2mu_dq2 must be contiguous");
        let d3mu_p = core
            .d3mu_dq3
            .as_slice()
            .expect("d3mu_dq3 must be contiguous");
        let z_t_p = z_t.as_slice().expect("z_t must be contiguous");
        let z_ls_p = z_ls.as_slice().expect("z_ls must be contiguous");
        let link_kind_p = &self.link_kind;
        let rows: Vec<PsiTermsRow> = (0..n)
            .into_par_iter()
            .map(|i| {
                let q = q0_p[i];
                // The header's `r`, taken here as 1 / sigma from the core.
                let r = 1.0 / sigma_p[i];
                // Chain factor s = (dsigma/deta_ls) / sigma. `sz` replaces the
                // bare z_ls of the header formulas; s == 1 reproduces them
                // verbatim.
                let s = dsigma_p[i] / sigma_p[i];
                let sz = s * z_ls_p[i];
                let q_psi = -r * z_t_p[i] - q * sz;
                let (a, b, c) = binomial_neglog_q_derivatives_dispatch(
                    y_p[i],
                    w_p[i],
                    q,
                    mu_p[i],
                    dmu_p[i],
                    d2mu_p[i],
                    d3mu_p[i],
                    link_kind_p,
                );
                let r_t = -a * r;
                // NOTE(review): r_ls carries the factor `s`, but dr_ls, h_tl,
                // h_ll and dh_ll below are written without it (and without any
                // ds/deta term). These agree only when s == 1 — confirm that
                // holds for every sigma link the core can produce.
                let r_ls = -a * q * s;
                PsiTermsRow {
                    r_t,
                    r_ls,
                    dr_t: -b * q_psi * r + a * r * sz,
                    dr_ls: -(a + q * b) * q_psi,
                    h_tt: b * r * r,
                    h_tl: r * (a + q * b),
                    h_ll: q * (a + q * b),
                    dh_tt: r * r * (c * q_psi - 2.0 * b * sz),
                    dh_tl: r * ((2.0 * b + c * q) * q_psi - (a + q * b) * sz),
                    dh_ll: (a + 3.0 * q * b + q * q * c) * q_psi,
                    // Row contribution to V_psi^explicit; uses the raw z_ls,
                    // with the chain factor entering only through r_ls.
                    obj: r_t * z_t_p[i] + r_ls * z_ls_p[i],
                }
            })
            .collect();
        // Scatter the per-row results into column arrays. The objective is
        // accumulated sequentially, in row order.
        let mut r_t = Array1::<f64>::zeros(n);
        let mut r_ls = Array1::<f64>::zeros(n);
        let mut dr_t = Array1::<f64>::zeros(n);
        let mut dr_ls = Array1::<f64>::zeros(n);
        let mut h_tt = Array1::<f64>::zeros(n);
        let mut h_tl = Array1::<f64>::zeros(n);
        let mut h_ll = Array1::<f64>::zeros(n);
        let mut dh_tt = Array1::<f64>::zeros(n);
        let mut dh_tl = Array1::<f64>::zeros(n);
        let mut dh_ll = Array1::<f64>::zeros(n);
        let mut objective_psi = 0.0_f64;
        for (i, row) in rows.into_iter().enumerate() {
            r_t[i] = row.r_t;
            r_ls[i] = row.r_ls;
            dr_t[i] = row.dr_t;
            dr_ls[i] = row.dr_ls;
            h_tt[i] = row.h_tt;
            h_tl[i] = row.h_tl;
            h_ll[i] = row.h_ll;
            dh_tt[i] = row.dh_tt;
            dh_tl[i] = row.dh_tl;
            dh_ll[i] = row.dh_ll;
            objective_psi += row.obj;
        }

        // The operator form is tried first; it is handed the same coefficient
        // arrays that the dense fallback below consumes.
        let hessian_psi_operator = build_two_block_custom_family_joint_psi_operator_from_actions(
            dir_a.x_primary_psi.cloned_first_action(),
            dir_a.x_ls_psi.cloned_first_action(),
            0..pt,
            pt..pt + pls,
            x_t,
            x_ls,
            &h_tt,
            &h_tl,
            &h_ll,
            &dh_tt,
            &dh_tl,
            &dh_ll,
        )?;
        let x_t_map = dir_a.x_primary_psi.as_linear_map_ref();
        let x_ls_map = dir_a.x_ls_psi.as_linear_map_ref();
        // Score blocks: X_psi^T r + X^T d_psi r for each predictor.
        let score_t = x_t_map.transpose_mul(r_t.view()) + fast_atv(x_t, &dr_t);
        let score_ls = x_ls_map.transpose_mul(r_ls.view()) + fast_atv(x_ls, &dr_ls);
        let mut score_psi = Array1::<f64>::zeros(total);
        score_psi.slice_mut(s![0..pt]).assign(&score_t);
        score_psi.slice_mut(s![pt..pt + pls]).assign(&score_ls);
        let hessian_psi = if hessian_psi_operator.is_some() {
            // An operator is available, so the dense matrix is left as an
            // empty 0x0 placeholder.
            Array2::zeros((0, 0))
        } else {
            // Dense fallback: three product-rule terms per block.
            let h_tt_block = weighted_crossprod_psi_maps(
                x_t_map,
                h_tt.view(),
                CustomFamilyPsiLinearMapRef::Dense(x_t),
            )? + &weighted_crossprod_psi_maps(
                CustomFamilyPsiLinearMapRef::Dense(x_t),
                h_tt.view(),
                x_t_map,
            )? + &xt_diag_x_dense(x_t, &dh_tt)?;
            let h_tl_block = weighted_crossprod_psi_maps(
                x_t_map,
                h_tl.view(),
                CustomFamilyPsiLinearMapRef::Dense(x_ls),
            )? + &weighted_crossprod_psi_maps(
                CustomFamilyPsiLinearMapRef::Dense(x_t),
                h_tl.view(),
                x_ls_map,
            )? + &xt_diag_y_dense(x_t, &dh_tl, x_ls)?;
            let h_ll_block = weighted_crossprod_psi_maps(
                x_ls_map,
                h_ll.view(),
                CustomFamilyPsiLinearMapRef::Dense(x_ls),
            )? + &weighted_crossprod_psi_maps(
                CustomFamilyPsiLinearMapRef::Dense(x_ls),
                h_ll.view(),
                x_ls_map,
            )? + &xt_diag_x_dense(x_ls, &dh_ll)?;

            // Only the tt, tl and ll (upper) blocks are written here; the
            // lower-left block is presumably filled by `mirror_upper_to_lower`.
            let mut hessian_psi = Array2::<f64>::zeros((total, total));
            hessian_psi.slice_mut(s![0..pt, 0..pt]).assign(&h_tt_block);
            hessian_psi
                .slice_mut(s![0..pt, pt..pt + pls])
                .assign(&h_tl_block);
            hessian_psi
                .slice_mut(s![pt..pt + pls, pt..pt + pls])
                .assign(&h_ll_block);
            mirror_upper_to_lower(&mut hessian_psi);
            hessian_psi
        };

        Ok(Some(crate::custom_family::ExactNewtonJointPsiTerms {
            objective_psi,
            score_psi,
            hessian_psi,
            hessian_psi_operator,
        }))
    }

    /// Resolves the psi directions for `psi_i` and `psi_j` and forwards them to
    /// `exact_newton_joint_psisecond_order_terms_from_parts`.
    ///
    /// Yields `Ok(None)` as soon as either coordinate has no direction (the
    /// `psi_j` lookup is skipped when `psi_i` already has none). Errors from the
    /// direction lookup or from the assembly step are propagated unchanged.
    fn exact_newton_joint_psisecond_order_terms_from_designs(
        &self,
        block_states: &[ParameterBlockState],
        derivative_blocks: &[Vec<crate::custom_family::CustomFamilyBlockPsiDerivative>],
        psi_i: usize,
        psi_j: usize,
        x_t: &Array2<f64>,
        x_ls: &Array2<f64>,
    ) -> Result<Option<crate::custom_family::ExactNewtonJointPsiSecondOrderTerms>, String> {
        // Both coordinates use the same lookup; only the psi index differs.
        let resolve_direction = |psi_index: usize| {
            self.exact_newton_joint_psi_direction(
                block_states,
                derivative_blocks,
                psi_index,
                x_t,
                x_ls,
                &self.policy,
            )
        };

        let first_direction = match resolve_direction(psi_i)? {
            Some(direction) => direction,
            None => return Ok(None),
        };
        let second_direction = match resolve_direction(psi_j)? {
            Some(direction) => direction,
            None => return Ok(None),
        };

        let second_order_terms = self.exact_newton_joint_psisecond_order_terms_from_parts(
            block_states,
            derivative_blocks,
            &first_direction,
            &second_direction,
            x_t,
            x_ls,
        )?;
        Ok(Some(second_order_terms))
    }

    /// Assembles the likelihood-only second-order psi/psi terms (objective,
    /// score and dense Hessian drift) for a pair of already-resolved psi
    /// directions `dir_i` and `dir_j`.
    ///
    /// The first-order design drifts come from the two directions; the
    /// second-order drifts are obtained from
    /// `exact_newton_joint_psisecond_design_drifts`. The result always carries
    /// a dense `hessian_psi_psi` of size `(pt + pls) x (pt + pls)`, with
    /// `hessian_psi_psi_operator` set to `None`.
    ///
    /// # Errors
    /// Propagates errors from the second-order drift helper, from
    /// `binomial_location_scale_core`, from the fourth-derivative dispatch and
    /// from the cross-product helpers.
    ///
    /// # Panics
    /// Indexes `block_states` without a length check and panics if any per-row
    /// array cannot be viewed as a contiguous slice (the
    /// `as_slice().expect(..)` calls below).
    fn exact_newton_joint_psisecond_order_terms_from_parts(
        &self,
        block_states: &[ParameterBlockState],
        derivative_blocks: &[Vec<crate::custom_family::CustomFamilyBlockPsiDerivative>],
        dir_i: &LocationScaleJointPsiDirection,
        dir_j: &LocationScaleJointPsiDirection,
        x_t: &Array2<f64>,
        x_ls: &Array2<f64>,
    ) -> Result<crate::custom_family::ExactNewtonJointPsiSecondOrderTerms, String> {
        let second_drifts = self.exact_newton_joint_psisecond_design_drifts(
            block_states,
            derivative_blocks,
            dir_i,
            dir_j,
            x_t,
            x_ls,
        )?;
        let n = self.y.len();
        let eta_t = &block_states[Self::BLOCK_T].eta;
        let eta_ls = &block_states[Self::BLOCK_LOG_SIGMA].eta;
        let core = binomial_location_scale_core(
            &self.y,
            &self.weights,
            eta_t,
            eta_ls,
            None,
            &self.link_kind,
        )?;
        let pt = x_t.ncols();
        let pls = x_ls.ncols();
        let total = pt + pls;
        // Linear-map views of the first-order (i, j) and second-order (ab)
        // design drifts for each block.
        let x_t_i_map = dir_i.x_primary_psi.as_linear_map_ref();
        let x_t_j_map = dir_j.x_primary_psi.as_linear_map_ref();
        let x_ls_i_map = dir_i.x_ls_psi.as_linear_map_ref();
        let x_ls_j_map = dir_j.x_ls_psi.as_linear_map_ref();
        let x_t_ab_map = second_psi_linear_map(
            second_drifts.x_primary_ab_action.as_ref(),
            second_drifts.x_primary_ab.as_ref(),
            n,
            pt,
        );
        let x_ls_ab_map = second_psi_linear_map(
            second_drifts.x_ls_ab_action.as_ref(),
            second_drifts.x_ls_ab.as_ref(),
            n,
            pls,
        );

        // Exact fixed-beta psi/psi terms for the coupled non-wiggle probit
        // family.
        //
        // For two realized spatial coordinates psi_a, psi_b define
        //
        //   z_t,a  = X_{t,a} beta_t,    z_ls,a  = X_{ls,a} beta_ls,
        //   z_t,b  = X_{t,b} beta_t,    z_ls,b  = X_{ls,b} beta_ls,
        //   z_t,ab = X_{t,ab} beta_t,   z_ls,ab = X_{ls,ab} beta_ls.
        //
        // On the smooth interior branch, with r = exp(-eta_ls) and q = -eta_t r,
        //
        //   q_a  = -r z_t,a - q z_ls,a,
        //   q_b  = -r z_t,b - q z_ls,b,
        //   q_ab = -r z_t,ab
        //          + r(z_t,a z_ls,b + z_t,b z_ls,a)
        //          + q(z_ls,a z_ls,b - z_ls,ab).
        //
        // For scalar row loss derivatives
        //
        //   a = dF/dq,  b = d²F/dq²,  c = d³F/dq³,  d = d⁴F/dq⁴,
        //
        // the exact fixed-beta psi/psi objects are
        //
        //   V_ab = sum [ a q_ab + b q_a q_b ],
        //
        //   g_ab = [ X_{t,ab}^T r_t + X_{t,a}^T d_b r_t + X_{t,b}^T d_a r_t + X_t^T d_ab r_t,
        //            X_{ls,ab}^T r_ls + X_{ls,a}^T d_b r_ls + X_{ls,b}^T d_a r_ls + X_ls^T d_ab r_ls ],
        //
        // where
        //
        //   r_t  = -a r,
        //   r_ls = -a q,
        //
        //   d_a r_t  = -b q_a r + a r z_ls,a,
        //   d_a r_ls = -(a + q b) q_a,
        //
        //   d_ab r_t
        //   = r[
        //       -c q_a q_b - b q_ab
        //       + b(q_a z_ls,b + q_b z_ls,a)
        //       - a z_ls,a z_ls,b
        //       + a z_ls,ab
        //     ],
        //
        //   d_ab r_ls
        //   = -[(2b + q c) q_a q_b + (a + q b) q_ab].
        //
        // The exact Hessian psi/psi drift comes from the second derivatives of
        // the joint Hessian coefficients. In the notation of the unified outer
        // calculus, these rowwise coefficient drifts are precisely the
        // likelihood-side pieces of
        //
        //   D_{beta beta psi_a psi_b},
        //
        // before the generic assembler adds any realized-penalty contribution
        //
        //   S_ab = partial_{psi_a psi_b} S(theta).
        //
        // So this helper returns likelihood-only
        //
        //   D_ab, D_{beta ab}, D_{beta beta ab},
        //
        // and the unified exact assembler in custom_family.rs forms
        //
        //   V_ab = D_ab + 0.5 beta^T S_ab beta,
        //   g_ab = D_{beta ab} + S_ab beta,
        //   H_ab = D_{beta beta ab} + S_ab.
        //
        // Once H_ab is known, the outer assembler combines it with the joint
        // mode responses beta_a, beta_b, beta_ab and the contractions
        //
        //   T_a[beta_b], T_b[beta_a], D_beta H[beta_ab], D_beta^2 H[beta_a, beta_b]
        //
        // to form
        //
        //   ddot H_ab
        //   = H_ab + T_a[beta_b] + T_b[beta_a]
        //     + D_beta H[beta_ab] + D_beta^2 H[beta_a, beta_b].
        //
        // That is why this helper computes only the fixed-beta psi/psi object:
        // the total profiled/Laplace Hessian drift is assembled generically in
        // custom_family.rs after the joint solves.
        //
        // Concretely, the rowwise coefficient identities below are
        //
        //   h_tt = b r²,
        //   h_tl = r(a + q b),
        //   h_ll = q(a + q b),
        //
        // namely
        //
        //   d_ab h_tt
        //   = r²[
        //       d q_a q_b + c q_ab
        //       - 2c(q_b z_ls,a + q_a z_ls,b)
        //       + 4b z_ls,a z_ls,b
        //       - 2b z_ls,ab
        //     ],
        //
        //   d_ab h_tl
        //   = r[
        //       ((3c + q d) q_b) q_a
        //       + (2b + q c) q_ab
        //       - (2b + q c)(q_b z_ls,a + q_a z_ls,b)
        //       + (a + q b)(z_ls,a z_ls,b - z_ls,ab)
        //     ],
        //
        //   d_ab h_ll
        //   = (4b + 5q c + q² d) q_a q_b
        //     + (a + 3q b + q² c) q_ab.
        //
        // Differentiating X^T diag(h) X twice then gives the explicit joint
        // psi/psi Hessian blocks.
        let mut r_t = Array1::<f64>::zeros(n);
        let mut r_ls = Array1::<f64>::zeros(n);
        let mut dr_t_i = Array1::<f64>::zeros(n);
        let mut dr_t_j = Array1::<f64>::zeros(n);
        let mut dr_ls_i = Array1::<f64>::zeros(n);
        let mut dr_ls_j = Array1::<f64>::zeros(n);
        let mut d2r_t = Array1::<f64>::zeros(n);
        let mut d2r_ls = Array1::<f64>::zeros(n);
        let mut h_tt = Array1::<f64>::zeros(n);
        let mut h_tl = Array1::<f64>::zeros(n);
        let mut h_ll = Array1::<f64>::zeros(n);
        let mut dh_tt_i = Array1::<f64>::zeros(n);
        let mut dh_tt_j = Array1::<f64>::zeros(n);
        let mut dh_tl_i = Array1::<f64>::zeros(n);
        let mut dh_tl_j = Array1::<f64>::zeros(n);
        let mut dh_ll_i = Array1::<f64>::zeros(n);
        let mut dh_ll_j = Array1::<f64>::zeros(n);
        let mut d2h_tt = Array1::<f64>::zeros(n);
        let mut d2h_tl = Array1::<f64>::zeros(n);
        let mut d2h_ll = Array1::<f64>::zeros(n);
        let mut objective_psi_psi = 0.0;
        // One record per observation; produced in parallel and scattered into
        // the arrays above afterwards.
        struct PsiSecondRow {
            r_t: f64,
            r_ls: f64,
            dr_t_i: f64,
            dr_t_j: f64,
            dr_ls_i: f64,
            dr_ls_j: f64,
            d2r_t: f64,
            d2r_ls: f64,
            h_tt: f64,
            h_tl: f64,
            h_ll: f64,
            dh_tt_i: f64,
            dh_tt_j: f64,
            dh_tl_i: f64,
            dh_tl_j: f64,
            dh_ll_i: f64,
            dh_ll_j: f64,
            d2h_tt: f64,
            d2h_tl: f64,
            d2h_ll: f64,
            objective: f64,
        }
        // Plain slices so the parallel closure indexes raw `f64` buffers.
        let y_p = self.y.as_slice().expect("y must be contiguous");
        let w_p = self.weights.as_slice().expect("weights must be contiguous");
        let q_p = core.q0.as_slice().expect("q0 must be contiguous");
        let sigma_p = core.sigma.as_slice().expect("sigma must be contiguous");
        let mu_p = core.mu.as_slice().expect("mu must be contiguous");
        let dmu_p = core.dmu_dq.as_slice().expect("dmu_dq must be contiguous");
        let d2mu_p = core
            .d2mu_dq2
            .as_slice()
            .expect("d2mu_dq2 must be contiguous");
        let d3mu_p = core
            .d3mu_dq3
            .as_slice()
            .expect("d3mu_dq3 must be contiguous");
        let z_t_i = dir_i
            .z_primary_psi
            .as_slice()
            .expect("z_t_psi_i must be contiguous");
        let z_t_j = dir_j
            .z_primary_psi
            .as_slice()
            .expect("z_t_psi_j must be contiguous");
        let z_ls_i = dir_i
            .z_ls_psi
            .as_slice()
            .expect("z_ls_psi_i must be contiguous");
        let z_ls_j = dir_j
            .z_ls_psi
            .as_slice()
            .expect("z_ls_psi_j must be contiguous");
        let z_t_ab = second_drifts
            .z_primary_ab
            .as_slice()
            .expect("z_t_ab must be contiguous");
        let z_ls_ab = second_drifts
            .z_ls_ab
            .as_slice()
            .expect("z_ls_ab must be contiguous");
        let link_kind_p = &self.link_kind;
        // The fourth-derivative dispatch is fallible, so rows are collected
        // into a `Result` and unwrapped with `?` before the scatter loop.
        let rows: Result<Vec<PsiSecondRow>, String> = (0..n)
            .into_par_iter()
            .map(|row| {
                let q = q_p[row];
                // The header's `r`, taken here as 1 / sigma from the core.
                // NOTE(review): no dsigma_deta / sigma chain factor is applied
                // to the z_ls drifts or to r_ls here, whereas
                // `exact_newton_joint_psi_terms_from_designs` multiplies by
                // that factor — confirm the two agree for the sigma link in use.
                let r = 1.0 / sigma_p[row];
                let q_i = -r * z_t_i[row] - q * z_ls_i[row];
                let q_j = -r * z_t_j[row] - q * z_ls_j[row];
                let q_ij = -r * z_t_ab[row]
                    + r * (z_t_i[row] * z_ls_j[row] + z_t_j[row] * z_ls_i[row])
                    + q * (z_ls_i[row] * z_ls_j[row] - z_ls_ab[row]);
                let (a, b, c) = binomial_neglog_q_derivatives_dispatch(
                    y_p[row],
                    w_p[row],
                    q,
                    mu_p[row],
                    dmu_p[row],
                    d2mu_p[row],
                    d3mu_p[row],
                    link_kind_p,
                );
                let d = binomial_neglog_q_fourth_derivative_dispatch(
                    y_p[row],
                    w_p[row],
                    q,
                    mu_p[row],
                    dmu_p[row],
                    d2mu_p[row],
                    d3mu_p[row],
                    link_kind_p,
                )?;
                // u = a + q b is the shared factor of h_tl and h_ll; u_i and
                // u_j are its first-order drifts along psi_i and psi_j.
                let u = a + q * b;
                let u_i = (2.0 * b + q * c) * q_i;
                let u_j = (2.0 * b + q * c) * q_j;
                Ok(PsiSecondRow {
                    r_t: -a * r,
                    r_ls: -a * q,
                    dr_t_i: -b * q_i * r + a * r * z_ls_i[row],
                    dr_t_j: -b * q_j * r + a * r * z_ls_j[row],
                    dr_ls_i: -u * q_i,
                    dr_ls_j: -u * q_j,
                    d2r_t: r
                        * (-c * q_i * q_j - b * q_ij + b * (q_i * z_ls_j[row] + q_j * z_ls_i[row])
                            - a * z_ls_i[row] * z_ls_j[row]
                            + a * z_ls_ab[row]),
                    d2r_ls: -((2.0 * b + q * c) * q_i * q_j + u * q_ij),
                    h_tt: b * r * r,
                    h_tl: r * u,
                    h_ll: q * u,
                    dh_tt_i: r * r * (c * q_i - 2.0 * b * z_ls_i[row]),
                    dh_tt_j: r * r * (c * q_j - 2.0 * b * z_ls_j[row]),
                    dh_tl_i: r * (u_i - u * z_ls_i[row]),
                    dh_tl_j: r * (u_j - u * z_ls_j[row]),
                    dh_ll_i: (a + 3.0 * q * b + q * q * c) * q_i,
                    dh_ll_j: (a + 3.0 * q * b + q * q * c) * q_j,
                    d2h_tt: r
                        * r
                        * (d * q_i * q_j + c * q_ij
                            - 2.0 * c * (q_j * z_ls_i[row] + q_i * z_ls_j[row])
                            + 4.0 * b * z_ls_i[row] * z_ls_j[row]
                            - 2.0 * b * z_ls_ab[row]),
                    d2h_tl: r
                        * (((3.0 * c + q * d) * q_j) * q_i + (2.0 * b + q * c) * q_ij
                            - (2.0 * b + q * c) * (q_j * z_ls_i[row] + q_i * z_ls_j[row])
                            + u * (z_ls_i[row] * z_ls_j[row] - z_ls_ab[row])),
                    d2h_ll: (4.0 * b + 5.0 * q * c + q * q * d) * q_i * q_j
                        + (a + 3.0 * q * b + q * q * c) * q_ij,
                    objective: a * q_ij + b * q_i * q_j,
                })
            })
            .collect();
        // Scatter into the column arrays. The objective is accumulated
        // sequentially, in row order.
        for (row, vals) in rows?.into_iter().enumerate() {
            r_t[row] = vals.r_t;
            r_ls[row] = vals.r_ls;
            dr_t_i[row] = vals.dr_t_i;
            dr_t_j[row] = vals.dr_t_j;
            dr_ls_i[row] = vals.dr_ls_i;
            dr_ls_j[row] = vals.dr_ls_j;
            d2r_t[row] = vals.d2r_t;
            d2r_ls[row] = vals.d2r_ls;
            h_tt[row] = vals.h_tt;
            h_tl[row] = vals.h_tl;
            h_ll[row] = vals.h_ll;
            dh_tt_i[row] = vals.dh_tt_i;
            dh_tt_j[row] = vals.dh_tt_j;
            dh_tl_i[row] = vals.dh_tl_i;
            dh_tl_j[row] = vals.dh_tl_j;
            dh_ll_i[row] = vals.dh_ll_i;
            dh_ll_j[row] = vals.dh_ll_j;
            d2h_tt[row] = vals.d2h_tt;
            d2h_tl[row] = vals.d2h_tl;
            d2h_ll[row] = vals.d2h_ll;
            objective_psi_psi += vals.objective;
        }
        // Score blocks: the four product-rule terms of g_ab per predictor.
        let mut score_psi_psi = Array1::<f64>::zeros(total);
        score_psi_psi.slice_mut(s![0..pt]).assign(
            &(x_t_ab_map.transpose_mul(r_t.view())
                + x_t_i_map.transpose_mul(dr_t_j.view())
                + x_t_j_map.transpose_mul(dr_t_i.view())
                + fast_atv(x_t, &d2r_t)),
        );
        score_psi_psi.slice_mut(s![pt..pt + pls]).assign(
            &(x_ls_ab_map.transpose_mul(r_ls.view())
                + x_ls_i_map.transpose_mul(dr_ls_j.view())
                + x_ls_j_map.transpose_mul(dr_ls_i.view())
                + fast_atv(x_ls, &d2r_ls)),
        );

        // Each Hessian block below sums the nine product-rule terms of
        // d_ab (X^T diag(h) Y): two with a second-order design drift, two with
        // both first-order drifts, four pairing one first-order drift with a
        // first-order coefficient drift, and one with the second-order
        // coefficient drift.
        let h_tt_block = weighted_crossprod_psi_maps(
            x_t_ab_map,
            h_tt.view(),
            CustomFamilyPsiLinearMapRef::Dense(x_t),
        )? + &weighted_crossprod_psi_maps(x_t_i_map, h_tt.view(), x_t_j_map)?
            + &weighted_crossprod_psi_maps(x_t_j_map, h_tt.view(), x_t_i_map)?
            + &weighted_crossprod_psi_maps(
                x_t_i_map,
                dh_tt_j.view(),
                CustomFamilyPsiLinearMapRef::Dense(x_t),
            )?
            + &weighted_crossprod_psi_maps(
                x_t_j_map,
                dh_tt_i.view(),
                CustomFamilyPsiLinearMapRef::Dense(x_t),
            )?
            + &weighted_crossprod_psi_maps(
                CustomFamilyPsiLinearMapRef::Dense(x_t),
                dh_tt_i.view(),
                x_t_j_map,
            )?
            + &weighted_crossprod_psi_maps(
                CustomFamilyPsiLinearMapRef::Dense(x_t),
                dh_tt_j.view(),
                x_t_i_map,
            )?
            + &xt_diag_x_dense(x_t, &d2h_tt)?
            + &weighted_crossprod_psi_maps(
                CustomFamilyPsiLinearMapRef::Dense(x_t),
                h_tt.view(),
                x_t_ab_map,
            )?;
        let h_tl_block = weighted_crossprod_psi_maps(
            x_t_ab_map,
            h_tl.view(),
            CustomFamilyPsiLinearMapRef::Dense(x_ls),
        )? + &weighted_crossprod_psi_maps(x_t_i_map, h_tl.view(), x_ls_j_map)?
            + &weighted_crossprod_psi_maps(x_t_j_map, h_tl.view(), x_ls_i_map)?
            + &weighted_crossprod_psi_maps(
                x_t_i_map,
                dh_tl_j.view(),
                CustomFamilyPsiLinearMapRef::Dense(x_ls),
            )?
            + &weighted_crossprod_psi_maps(
                x_t_j_map,
                dh_tl_i.view(),
                CustomFamilyPsiLinearMapRef::Dense(x_ls),
            )?
            + &weighted_crossprod_psi_maps(
                CustomFamilyPsiLinearMapRef::Dense(x_t),
                dh_tl_i.view(),
                x_ls_j_map,
            )?
            + &weighted_crossprod_psi_maps(
                CustomFamilyPsiLinearMapRef::Dense(x_t),
                dh_tl_j.view(),
                x_ls_i_map,
            )?
            + &xt_diag_y_dense(x_t, &d2h_tl, x_ls)?
            + &weighted_crossprod_psi_maps(
                CustomFamilyPsiLinearMapRef::Dense(x_t),
                h_tl.view(),
                x_ls_ab_map,
            )?;
        let h_ll_block = weighted_crossprod_psi_maps(
            x_ls_ab_map,
            h_ll.view(),
            CustomFamilyPsiLinearMapRef::Dense(x_ls),
        )? + &weighted_crossprod_psi_maps(x_ls_i_map, h_ll.view(), x_ls_j_map)?
            + &weighted_crossprod_psi_maps(x_ls_j_map, h_ll.view(), x_ls_i_map)?
            + &weighted_crossprod_psi_maps(
                x_ls_i_map,
                dh_ll_j.view(),
                CustomFamilyPsiLinearMapRef::Dense(x_ls),
            )?
            + &weighted_crossprod_psi_maps(
                x_ls_j_map,
                dh_ll_i.view(),
                CustomFamilyPsiLinearMapRef::Dense(x_ls),
            )?
            + &weighted_crossprod_psi_maps(
                CustomFamilyPsiLinearMapRef::Dense(x_ls),
                dh_ll_i.view(),
                x_ls_j_map,
            )?
            + &weighted_crossprod_psi_maps(
                CustomFamilyPsiLinearMapRef::Dense(x_ls),
                dh_ll_j.view(),
                x_ls_i_map,
            )?
            + &xt_diag_x_dense(x_ls, &d2h_ll)?
            + &weighted_crossprod_psi_maps(
                CustomFamilyPsiLinearMapRef::Dense(x_ls),
                h_ll.view(),
                x_ls_ab_map,
            )?;

        // Only the tt, tl and ll (upper) blocks are written here; the
        // lower-left block is presumably filled by `mirror_upper_to_lower`.
        let mut hessian_psi_psi = Array2::<f64>::zeros((total, total));
        hessian_psi_psi
            .slice_mut(s![0..pt, 0..pt])
            .assign(&h_tt_block);
        hessian_psi_psi
            .slice_mut(s![0..pt, pt..pt + pls])
            .assign(&h_tl_block);
        hessian_psi_psi
            .slice_mut(s![pt..pt + pls, pt..pt + pls])
            .assign(&h_ll_block);
        mirror_upper_to_lower(&mut hessian_psi_psi);

        Ok(crate::custom_family::ExactNewtonJointPsiSecondOrderTerms {
            objective_psi_psi,
            score_psi_psi,
            hessian_psi_psi,
            hessian_psi_psi_operator: None,
        })
    }

    /// Resolves the psi direction for `psi_index` and, when one exists, returns
    /// the directional derivative computed by
    /// `exact_newton_joint_psihessian_directional_derivative_from_parts` for
    /// `d_beta_flat`.
    ///
    /// Yields `Ok(None)` when the direction lookup finds nothing; errors from
    /// the lookup or from the assembly step are propagated unchanged.
    fn exact_newton_joint_psihessian_directional_derivative_from_designs(
        &self,
        block_states: &[ParameterBlockState],
        derivative_blocks: &[Vec<crate::custom_family::CustomFamilyBlockPsiDerivative>],
        psi_index: usize,
        d_beta_flat: &Array1<f64>,
        x_t: &Array2<f64>,
        x_ls: &Array2<f64>,
    ) -> Result<Option<Array2<f64>>, String> {
        let maybe_direction = self.exact_newton_joint_psi_direction(
            block_states,
            derivative_blocks,
            psi_index,
            x_t,
            x_ls,
            &self.policy,
        )?;
        // `Option<Result<_>>` -> `Result<Option<_>>`: a missing direction stays
        // `None`, and an assembly failure surfaces as the function's error.
        maybe_direction
            .map(|direction| {
                self.exact_newton_joint_psihessian_directional_derivative_from_parts(
                    block_states,
                    &direction,
                    d_beta_flat,
                    x_t,
                    x_ls,
                )
            })
            .transpose()
    }

    /// Assembles the mixed drift `T_a[u] = D_beta H_{psi_a}[u]` of the joint
    /// likelihood Hessian for an already-resolved ψ direction `dir_a` and a
    /// flattened coefficient direction `d_beta_flat`.
    ///
    /// # Arguments
    /// * `block_states` - current block states; only the `eta` of the
    ///   threshold and log-sigma blocks is read here.
    /// * `dir_a` - ψ direction supplying the per-row arrays `z_primary_psi`,
    ///   `z_ls_psi` and the design-derivative maps `x_primary_psi`, `x_ls_psi`.
    /// * `d_beta_flat` - coefficient direction laid out as
    ///   `[threshold (pt entries), log-sigma (pls entries)]`.
    /// * `x_t`, `x_ls` - dense threshold and log-sigma designs.
    ///
    /// # Returns
    /// A `(pt + pls) × (pt + pls)` matrix. The tt, tl and ll blocks are
    /// written into the upper triangle and then copied to the lower one by
    /// `mirror_upper_to_lower`.
    ///
    /// # Errors
    /// Returns a `GamlssError::DimensionMismatch` message when
    /// `d_beta_flat.len() != pt + pls`, and propagates any error from the
    /// core evaluation, the fourth-derivative dispatch, or the cross-product
    /// helpers.
    fn exact_newton_joint_psihessian_directional_derivative_from_parts(
        &self,
        block_states: &[ParameterBlockState],
        dir_a: &LocationScaleJointPsiDirection,
        d_beta_flat: &Array1<f64>,
        x_t: &Array2<f64>,
        x_ls: &Array2<f64>,
    ) -> Result<Array2<f64>, String> {
        let n = self.y.len();
        let eta_t = &block_states[Self::BLOCK_T].eta;
        let eta_ls = &block_states[Self::BLOCK_LOG_SIGMA].eta;
        let core = binomial_location_scale_core(
            &self.y,
            &self.weights,
            eta_t,
            eta_ls,
            None,
            &self.link_kind,
        )?;
        let pt = x_t.ncols();
        let pls = x_ls.ncols();
        let total = pt + pls;
        if d_beta_flat.len() != total {
            return Err(GamlssError::DimensionMismatch { reason: format!(
                "BinomialLocationScaleFamily joint psi hessian directional derivative length mismatch: got {}, expected {}",
                d_beta_flat.len(),
                total
            ) }.into());
        }
        // Row-space images of the coefficient direction: xi_t = X_t u_t and
        // xi_ls = X_ls u_ls, with u split at column `pt`.
        let xi_t = fast_av(x_t, &d_beta_flat.slice(s![0..pt]));
        let xi_ls = fast_av(x_ls, &d_beta_flat.slice(s![pt..pt + pls]));
        let x_t_map = dir_a.x_primary_psi.as_linear_map_ref();
        let x_ls_map = dir_a.x_ls_psi.as_linear_map_ref();

        // Mixed contraction T_a[u] = D_beta H_{psi_a}[u].
        //
        // In the non-wiggle family the realized design derivatives X_{psi_a}
        // are fixed with respect to beta, so differentiating the explicit
        // Hessian drift H_{psi_a} only moves the rowwise coefficient arrays.
        // This helper therefore returns exactly the likelihood-side mixed drift
        // required by the unified outer Hessian formula
        //
        //   ddot H_{ij}
        //   = H_{ij}
        //     + T_i[beta_j]
        //     + T_j[beta_i]
        //     + D_beta H[beta_ij]
        //     + D_beta^2 H[beta_i, beta_j].
        //
        // For i = psi_a, the generic assembler supplies beta_j and any
        // realized-penalty piece S_{psi_a} itself; this family hook contributes
        // only the exact likelihood-side T_a[beta_j].
        //
        // With
        //   du   = D_beta q[u]   = -r xi_t - q xi_ls,
        //   q_a  = q_{psi_a}     = -r z_t,a - q z_ls,a,
        //   q_au = D_beta q_a[u] = r z_t,a xi_ls - du z_ls,a,
        //
        // the directional derivatives of the first-order Hessian-drift
        // coefficients are the mixed specializations of the exact psi/psi
        // formulas with z_ls,ab = 0 and q_ab = q_au:
        //
        //   D_u(d_a h_tt)
        //   = r²[
        //       d du q_a + c q_au
        //       - 2c(q_a xi_ls + du z_ls,a)
        //       + 4b xi_ls z_ls,a
        //     ],
        //
        //   D_u(d_a h_tl)
        //   = r[
        //       ((3c + q d) q_a) du
        //       + (2b + q c) q_au
        //       - (2b + q c)(q_a xi_ls + du z_ls,a)
        //       + (a + q b) xi_ls z_ls,a
        //     ],
        //
        //   D_u(d_a h_ll)
        //   = (4b + 5q c + q² d) du q_a
        //     + (a + 3q b + q² c) q_au.
        //
        // Since X_t, X_ls, X_{t,psi_a}, X_{ls,psi_a} are all beta-independent
        // here, the full matrix contraction is obtained by replacing the row
        // coefficient arrays in H_{psi_a} by their directional derivatives.
        let mut dh_tt_u = Array1::<f64>::zeros(n);
        let mut dh_tl_u = Array1::<f64>::zeros(n);
        let mut dh_ll_u = Array1::<f64>::zeros(n);
        let mut h_tt_u = Array1::<f64>::zeros(n);
        let mut h_tl_u = Array1::<f64>::zeros(n);
        let mut h_ll_u = Array1::<f64>::zeros(n);
        for row in 0..n {
            let q = core.q0[row];
            // r = 1/σ and s = (dσ/dη)/σ; the log-sigma-side row quantities
            // below are pre-scaled by s before entering the formulas above.
            let r = 1.0 / core.sigma[row];
            let s = core.dsigma_deta[row] / core.sigma[row];
            let xi_ls_s = s * xi_ls[row];
            let z_ls_psi_s = s * dir_a.z_ls_psi[row];
            let du = -r * xi_t[row] - q * xi_ls_s;
            let q_a = -r * dir_a.z_primary_psi[row] - q * z_ls_psi_s;
            let q_au = r * dir_a.z_primary_psi[row] * xi_ls_s - du * z_ls_psi_s;
            // (a, b, c) and d are the scalars named a, b, c, d in the
            // formulas above, as returned by the two dispatch helpers.
            let (a, b, c) = binomial_neglog_q_derivatives_dispatch(
                self.y[row],
                self.weights[row],
                q,
                core.mu[row],
                core.dmu_dq[row],
                core.d2mu_dq2[row],
                core.d3mu_dq3[row],
                &self.link_kind,
            );
            let d = binomial_neglog_q_fourth_derivative_dispatch(
                self.y[row],
                self.weights[row],
                q,
                core.mu[row],
                core.dmu_dq[row],
                core.d2mu_dq2[row],
                core.d3mu_dq3[row],
                &self.link_kind,
            )?;
            let u = a + q * b;
            // h_*_u: row coefficients that weight the ψ-design cross terms.
            h_tt_u[row] = r * r * (c * du - 2.0 * b * xi_ls_s);
            h_tl_u[row] = r * ((2.0 * b + q * c) * du - u * xi_ls_s);
            h_ll_u[row] = (a + 3.0 * q * b + q * q * c) * du;
            // dh_*_u: the D_u(d_a h_*) expressions from the comment above.
            dh_tt_u[row] = r
                * r
                * (d * du * q_a + c * q_au - 2.0 * c * (q_a * xi_ls_s + du * z_ls_psi_s)
                    + 4.0 * b * xi_ls_s * z_ls_psi_s);
            dh_tl_u[row] = r
                * (((3.0 * c + q * d) * q_a) * du + (2.0 * b + q * c) * q_au
                    - (2.0 * b + q * c) * (q_a * xi_ls_s + du * z_ls_psi_s)
                    + u * xi_ls_s * z_ls_psi_s);
            dh_ll_u[row] = (4.0 * b + 5.0 * q * c + q * q * d) * du * q_a
                + (a + 3.0 * q * b + q * q * c) * q_au;
        }

        // Each block sums three helper products: the ψ-map against the dense
        // design and the dense design against the ψ-map (both weighted by
        // h_*_u), plus the dense designs weighted by dh_*_u.
        let tt_block = weighted_crossprod_psi_maps(
            x_t_map,
            h_tt_u.view(),
            CustomFamilyPsiLinearMapRef::Dense(x_t),
        )? + &weighted_crossprod_psi_maps(
            CustomFamilyPsiLinearMapRef::Dense(x_t),
            h_tt_u.view(),
            x_t_map,
        )? + &xt_diag_x_dense(x_t, &dh_tt_u)?;
        let tl_block = weighted_crossprod_psi_maps(
            x_t_map,
            h_tl_u.view(),
            CustomFamilyPsiLinearMapRef::Dense(x_ls),
        )? + &weighted_crossprod_psi_maps(
            CustomFamilyPsiLinearMapRef::Dense(x_t),
            h_tl_u.view(),
            x_ls_map,
        )? + &xt_diag_y_dense(x_t, &dh_tl_u, x_ls)?;
        let ll_block = weighted_crossprod_psi_maps(
            x_ls_map,
            h_ll_u.view(),
            CustomFamilyPsiLinearMapRef::Dense(x_ls),
        )? + &weighted_crossprod_psi_maps(
            CustomFamilyPsiLinearMapRef::Dense(x_ls),
            h_ll_u.view(),
            x_ls_map,
        )? + &xt_diag_x_dense(x_ls, &dh_ll_u)?;
        // Only the upper triangle is filled explicitly; the lower-left block
        // comes from the mirroring step.
        let mut out = Array2::<f64>::zeros((total, total));
        out.slice_mut(s![0..pt, 0..pt]).assign(&tt_block);
        out.slice_mut(s![0..pt, pt..pt + pls]).assign(&tl_block);
        out.slice_mut(s![pt..pt + pls, pt..pt + pls])
            .assign(&ll_block);
        mirror_upper_to_lower(&mut out);
        Ok(out)
    }

    /// Construct the [`BlockEffectiveJacobian`] of block `block_idx`.
    ///
    /// The family has two outputs, (η_threshold, η_log_sigma), and each
    /// additive block feeds exactly one of them:
    /// - block 0 (threshold):  output 0 = design rows, output 1 = zeros
    /// - block 1 (log_sigma):  output 0 = zeros, output 1 = design rows
    ///
    /// There is no wiggle block in this layout.
    pub fn block_effective_jacobian(
        specs: &[ParameterBlockSpec],
        block_idx: usize,
    ) -> Result<Box<dyn BlockEffectiveJacobian>, String> {
        let layout = crate::util::block_jacobian::AdditiveWiggleBlockLayout {
            family: "BinomialLocationScaleFamily",
            n_outputs: 2,
            additive_blocks: &[Self::BLOCK_T, Self::BLOCK_LOG_SIGMA],
            wiggle_block: None,
        };
        layout.block_effective_jacobian(specs, block_idx)
    }
}

impl CustomFamily for BinomialLocationScaleFamily {
    /// Reports that the exact-Newton joint Hessian varies with β; always
    /// returns `true`.
    ///
    /// The Binomial location-scale joint Hessian depends on β because the
    /// Hessian blocks are functions of q = -t/σ and the link derivatives,
    /// all of which change when β_t or β_{log σ} move.
    fn exact_newton_joint_hessian_beta_dependent(&self) -> bool {
        true
    }

    /// Cost estimate for one coefficient-Hessian evaluation, obtained from the
    /// shared location-scale engine using the observation count and `specs`.
    fn coefficient_hessian_cost(&self, specs: &[ParameterBlockSpec]) -> u64 {
        // Operator-aware: the matrix-free workspace applies the joint Hv at
        // O(n · (p_t + p_ℓ)); the dense build cost is only used when
        // `use_joint_matrix_free_path` declines the operator path.
        let n_rows = self.y.len() as u64;
        crate::families::location_scale_engine::location_scale_coefficient_hessian_cost(
            n_rows, specs,
        )
    }

    /// Evaluates the log-likelihood together with one exact-Newton working
    /// set (gradient and dense Hessian) per parameter block.
    ///
    /// # Errors
    /// Fails when `block_states` does not hold exactly two blocks, when the
    /// linear predictors or weights disagree in length with `y`, when the
    /// exact-curvature designs are unavailable, or when any helper fails.
    ///
    /// # Panics
    /// Panics if any of the per-row arrays is not contiguous in memory.
    fn evaluate(&self, block_states: &[ParameterBlockState]) -> Result<FamilyEvaluation, String> {
        let n_blocks = block_states.len();
        if n_blocks != 2 {
            return Err(GamlssError::DimensionMismatch {
                reason: format!(
                    "BinomialLocationScaleFamily expects 2 blocks, got {}",
                    n_blocks
                ),
            }
            .into());
        }
        let n_obs = self.y.len();
        let eta_t = &block_states[Self::BLOCK_T].eta;
        let eta_ls = &block_states[Self::BLOCK_LOG_SIGMA].eta;
        let lengths_agree =
            eta_t.len() == n_obs && eta_ls.len() == n_obs && self.weights.len() == n_obs;
        if !lengths_agree {
            return Err(GamlssError::DimensionMismatch {
                reason: "BinomialLocationScaleFamily input size mismatch".to_string(),
            }
            .into());
        }

        let core = binomial_location_scale_core(
            &self.y,
            &self.weights,
            eta_t,
            eta_ls,
            None,
            &self.link_kind,
        )?;
        if !self.exact_joint_supported() {
            return Err(
                "BinomialLocationScaleFamily requires exact curvature designs; diagonal fallback has been removed"
                    .to_string(),
            );
        }
        let threshold_design = match self.threshold_design.as_ref() {
            Some(design) => design,
            None => {
                return Err(
                    "BinomialLocationScaleFamily exact path is missing threshold design"
                        .to_string(),
                );
            }
        };
        let log_sigma_design = match self.log_sigma_design.as_ref() {
            Some(design) => design,
            None => {
                return Err(
                    "BinomialLocationScaleFamily exact path is missing log-sigma design"
                        .to_string(),
                );
            }
        };

        // Eta-space score, one entry per row and per block:
        //
        //   score_q        = -m1   (m1 = dF/dq, F = -ℓ)
        //   score_t[i]     = score_q * q_t
        //   score_ls[i]    = score_q * q_ls
        let mut score_t = vec![0.0_f64; n_obs];
        let mut score_ls = vec![0.0_f64; n_obs];
        let y_rows = self.y.as_slice().expect("y must be contiguous");
        let weight_rows = self.weights.as_slice().expect("weights must be contiguous");
        let q0_rows = core.q0.as_slice().expect("q0 must be contiguous");
        let sigma_rows = core.sigma.as_slice().expect("sigma must be contiguous");
        let mu_rows = core.mu.as_slice().expect("mu must be contiguous");
        let dmu_rows = core.dmu_dq.as_slice().expect("dmu_dq must be contiguous");
        let d2mu_rows = core
            .d2mu_dq2
            .as_slice()
            .expect("d2mu_dq2 must be contiguous");
        let d3mu_rows = core
            .d3mu_dq3
            .as_slice()
            .expect("d3mu_dq3 must be contiguous");
        let eta_t_rows = eta_t.as_slice().expect("eta_t must be contiguous");
        let link = &self.link_kind;
        score_t
            .par_iter_mut()
            .zip(score_ls.par_iter_mut())
            .enumerate()
            .for_each(|(row, (out_t, out_ls))| {
                let (m1, _, _) = binomial_neglog_q_derivatives_dispatch(
                    y_rows[row],
                    weight_rows[row],
                    q0_rows[row],
                    mu_rows[row],
                    dmu_rows[row],
                    d2mu_rows[row],
                    d3mu_rows[row],
                    link,
                );
                let q_partials = nonwiggle_q_derivs(eta_t_rows[row], sigma_rows[row]);
                let score_q = -m1;
                *out_t = score_q * q_partials.q_t;
                *out_ls = score_q * q_partials.q_ls;
            });
        let score_t = Array1::from_vec(score_t);
        let score_ls = Array1::from_vec(score_ls);
        let grad_t = threshold_design.transpose_vector_multiply(&score_t);
        let grad_ls = log_sigma_design.transpose_vector_multiply(&score_ls);

        // Only the two diagonal Hessian blocks are built. The off-diagonal
        // cross block is unused for IRLS-style block working sets and would
        // cost O(p_t * p_ls * n) to form; the diagonal blocks use the same row
        // coefficients as the joint matrix.
        let (h_tt, h_ll) = self.exact_newton_block_diagonal_hessians_from_design_matrices(
            block_states,
            threshold_design,
            log_sigma_design,
        )?;
        let threshold_set = BlockWorkingSet::ExactNewton {
            gradient: grad_t,
            hessian: SymmetricMatrix::Dense(h_tt),
        };
        let log_sigma_set = BlockWorkingSet::ExactNewton {
            gradient: grad_ls,
            hessian: SymmetricMatrix::Dense(h_ll),
        };
        Ok(FamilyEvaluation {
            log_likelihood: core.log_likelihood,
            blockworking_sets: vec![threshold_set, log_sigma_set],
        })
    }

    /// Full-data log-likelihood at the given block states, without building
    /// any working sets.
    ///
    /// # Errors
    /// Fails when `block_states` does not hold exactly two blocks, when the
    /// predictor or weight lengths disagree with `y`, or when the scalar
    /// log-likelihood helper fails.
    fn log_likelihood_only(&self, block_states: &[ParameterBlockState]) -> Result<f64, String> {
        let n_blocks = block_states.len();
        if n_blocks != 2 {
            return Err(GamlssError::DimensionMismatch {
                reason: format!(
                    "BinomialLocationScaleFamily expects 2 blocks, got {}",
                    n_blocks
                ),
            }
            .into());
        }
        let n_obs = self.y.len();
        let (eta_t, eta_ls) = (
            &block_states[Self::BLOCK_T].eta,
            &block_states[Self::BLOCK_LOG_SIGMA].eta,
        );
        let lengths_agree =
            eta_t.len() == n_obs && eta_ls.len() == n_obs && self.weights.len() == n_obs;
        if !lengths_agree {
            return Err(GamlssError::DimensionMismatch {
                reason: "BinomialLocationScaleFamily input size mismatch".to_string(),
            }
            .into());
        }
        // Zero-allocation O(n) scalar loop — no working sets, no n-vector intermediates.
        binomial_location_scale_ll_only(
            &self.y,
            &self.weights,
            eta_t,
            eta_ls,
            None,
            &self.link_kind,
        )
    }

    /// Log-likelihood for the outer optimizer, optionally estimated from a
    /// row subsample.
    ///
    /// With `options.outer_score_subsample` unset this is exactly
    /// `log_likelihood_only`. With a subsample installed, only the sampled
    /// rows are visited and each row's log-likelihood term is scaled by
    /// `WeightedOuterRow.weight`, the Horvitz–Thompson inverse-inclusion
    /// factor 1/π_i (uniform or stratified sampling both supported), so the
    /// partial sum is an unbiased estimator of the full-data log-likelihood.
    /// Rows whose observation weight is exactly zero contribute nothing.
    /// Inner PIRLS line searches never install the subsample option, so they
    /// continue to score the exact full-data log-likelihood.
    ///
    /// # Errors
    /// Fails on a block-count or length mismatch, or when the inverse-link or
    /// per-row log-likelihood evaluation fails for a sampled row.
    fn log_likelihood_only_with_options(
        &self,
        block_states: &[ParameterBlockState],
        options: &BlockwiseFitOptions,
    ) -> Result<f64, String> {
        use rayon::iter::ParallelIterator;

        let subsample = match options.outer_score_subsample.as_ref() {
            Some(sample) => sample,
            None => return self.log_likelihood_only(block_states),
        };
        let n_blocks = block_states.len();
        if n_blocks != 2 {
            return Err(GamlssError::DimensionMismatch {
                reason: format!(
                    "BinomialLocationScaleFamily expects 2 blocks, got {}",
                    n_blocks
                ),
            }
            .into());
        }
        let n_obs = self.y.len();
        let eta_t = &block_states[Self::BLOCK_T].eta;
        let eta_ls = &block_states[Self::BLOCK_LOG_SIGMA].eta;
        let lengths_agree =
            eta_t.len() == n_obs && eta_ls.len() == n_obs && self.weights.len() == n_obs;
        if !lengths_agree {
            return Err(GamlssError::DimensionMismatch {
                reason: "BinomialLocationScaleFamily input size mismatch".to_string(),
            }
            .into());
        }
        let link_kind = &self.link_kind;
        subsample
            .rows
            .par_iter()
            .try_fold(
                || 0.0_f64,
                |partial, sampled| -> Result<f64, String> {
                    let idx = sampled.index;
                    let obs_weight = self.weights[idx];
                    if obs_weight == 0.0 {
                        return Ok(partial);
                    }
                    let SigmaJet1 { sigma, .. } = exp_sigma_jet1_scalar(eta_ls[idx]);
                    let q = binomial_location_scale_q0(eta_t[idx], sigma);
                    let mu = match link_kind {
                        InverseLink::Standard(StandardLink::Probit) => 0.5,
                        _ => {
                            inverse_link_jet_for_inverse_link(link_kind, q)
                                .map_err(|e| {
                                    format!("location-scale inverse-link evaluation failed: {e}")
                                })?
                                .mu
                        }
                    };
                    let contribution = binomial_location_scale_log_likelihood(
                        self.y[idx],
                        obs_weight,
                        q,
                        link_kind,
                        mu,
                    )?;
                    Ok(partial + sampled.weight * contribution)
                },
            )
            .try_reduce(|| 0.0_f64, |lhs, rhs| Ok(lhs + rhs))
    }

    /// Capability flag for the joint outer hyperparameter path; this family
    /// always returns `true`.
    fn requires_joint_outer_hyper_path(&self) -> bool {
        true
    }

    /// Diagonal working-weight derivatives are not available for this family.
    ///
    /// The arguments are accepted only to satisfy the trait signature and are
    /// never inspected, so the call cannot panic on their contents (for
    /// example NaN entries in `arr`).
    ///
    /// # Errors
    /// Always returns an error stating that exact curvature is required.
    fn diagonalworking_weights_directional_derivative(
        &self,
        block_states: &[ParameterBlockState],
        idx: usize,
        arr: &Array1<f64>,
    ) -> Result<Option<Array1<f64>>, String> {
        // Intentionally unused: the unsupported-path error does not depend on
        // the inputs, so nothing is validated or scanned here.
        let _ = (block_states, idx, arr);
        Err(
            "BinomialLocationScaleFamily no longer supports diagonal working weights; exact curvature is required"
                .to_string(),
        )
    }

    /// Trait hook for the joint ψ terms at hyperparameter coordinate
    /// `psi_index`.
    ///
    /// Forwards every argument unchanged to
    /// `exact_newton_joint_psi_terms_for_specs` and returns its result.
    fn exact_newton_joint_psi_terms(
        &self,
        block_states: &[ParameterBlockState],
        specs: &[ParameterBlockSpec],
        derivative_blocks: &[Vec<crate::custom_family::CustomFamilyBlockPsiDerivative>],
        psi_index: usize,
    ) -> Result<Option<crate::custom_family::ExactNewtonJointPsiTerms>, String> {
        self.exact_newton_joint_psi_terms_for_specs(
            block_states,
            specs,
            derivative_blocks,
            psi_index,
        )
    }

    /// Trait hook for the joint second-order ψ terms of the coordinate pair
    /// (`psi_i`, `psi_j`).
    ///
    /// Forwards every argument unchanged to
    /// `exact_newton_joint_psisecond_order_terms_for_specs` and returns its
    /// result.
    fn exact_newton_joint_psisecond_order_terms(
        &self,
        block_states: &[ParameterBlockState],
        specs: &[ParameterBlockSpec],
        derivative_blocks: &[Vec<crate::custom_family::CustomFamilyBlockPsiDerivative>],
        psi_i: usize,
        psi_j: usize,
    ) -> Result<Option<crate::custom_family::ExactNewtonJointPsiSecondOrderTerms>, String> {
        self.exact_newton_joint_psisecond_order_terms_for_specs(
            block_states,
            specs,
            derivative_blocks,
            psi_i,
            psi_j,
        )
    }

    /// Trait hook for the directional derivative of the ψ-Hessian at
    /// coordinate `psi_index` along the flattened coefficient direction
    /// `d_beta_flat`.
    ///
    /// Forwards every argument unchanged to
    /// `exact_newton_joint_psihessian_directional_derivative_for_specs` and
    /// returns its result.
    fn exact_newton_joint_psihessian_directional_derivative(
        &self,
        block_states: &[ParameterBlockState],
        specs: &[ParameterBlockSpec],
        derivative_blocks: &[Vec<crate::custom_family::CustomFamilyBlockPsiDerivative>],
        psi_index: usize,
        d_beta_flat: &Array1<f64>,
    ) -> Result<Option<Array2<f64>>, String> {
        self.exact_newton_joint_psihessian_directional_derivative_for_specs(
            block_states,
            specs,
            derivative_blocks,
            psi_index,
            d_beta_flat,
        )
    }

    /// Builds the shared joint-ψ workspace for this family.
    ///
    /// Returns `Ok(None)` when the exact joint path is not supported.
    /// Otherwise the workspace is constructed from clones of the family, the
    /// block states and the derivative blocks, and wrapped in an `Arc`.
    ///
    /// # Errors
    /// Propagates any error from the workspace constructor.
    fn exact_newton_joint_psi_workspace(
        &self,
        block_states: &[ParameterBlockState],
        specs: &[ParameterBlockSpec],
        derivative_blocks: &[Vec<crate::custom_family::CustomFamilyBlockPsiDerivative>],
    ) -> Result<Option<Arc<dyn ExactNewtonJointPsiWorkspace>>, String> {
        if self.exact_joint_supported() {
            let workspace: Arc<dyn ExactNewtonJointPsiWorkspace> =
                Arc::new(BinomialLocationScaleExactNewtonJointPsiWorkspace::new(
                    self.clone(),
                    block_states.to_vec(),
                    specs,
                    derivative_blocks.to_vec(),
                )?);
            Ok(Some(workspace))
        } else {
            Ok(None)
        }
    }

    /// Directional derivative of one block's diagonal Hessian along a
    /// block-local coefficient direction `d_beta`.
    ///
    /// The block direction is embedded into a zero-padded joint direction, the
    /// joint directional derivative is evaluated, and the diagonal sub-block
    /// belonging to `block_idx` is returned. Yields `Ok(None)` when the exact
    /// joint path is unsupported or `block_idx` names neither block.
    ///
    /// # Errors
    /// Fails when a design is missing, when `d_beta` has the wrong length for
    /// the chosen block, or when the joint derivative is unavailable.
    fn exact_newton_hessian_directional_derivative(
        &self,
        block_states: &[ParameterBlockState],
        block_idx: usize,
        d_beta: &Array1<f64>,
    ) -> Result<Option<Array2<f64>>, String> {
        if !self.exact_joint_supported() {
            return Ok(None);
        }
        let Some(threshold_design) = self.threshold_design.as_ref() else {
            return Err(
                "BinomialLocationScaleFamily exact path is missing threshold design".to_string(),
            );
        };
        let pt = threshold_design.ncols();
        let Some(log_sigma_design) = self.log_sigma_design.as_ref() else {
            return Err(
                "BinomialLocationScaleFamily exact path is missing log-sigma design".to_string(),
            );
        };
        let pls = log_sigma_design.ncols();
        let total = pt + pls;

        // Locate the block inside the flattened coefficient vector and
        // validate the direction length before allocating anything.
        let (start, end) = if block_idx == Self::BLOCK_T {
            if d_beta.len() != pt {
                return Err(GamlssError::DimensionMismatch { reason: format!(
                    "BinomialLocationScaleFamily threshold d_beta length mismatch: got {}, expected {}",
                    d_beta.len(),
                    pt
                ) }.into());
            }
            (0usize, pt)
        } else if block_idx == Self::BLOCK_LOG_SIGMA {
            if d_beta.len() != pls {
                return Err(GamlssError::DimensionMismatch { reason: format!(
                    "BinomialLocationScaleFamily log-sigma d_beta length mismatch: got {}, expected {}",
                    d_beta.len(),
                    pls
                ) }.into());
            }
            (pt, total)
        } else {
            return Ok(None);
        };

        let mut joint_direction = Array1::<f64>::zeros(total);
        joint_direction.slice_mut(s![start..end]).assign(d_beta);

        let Some(joint) = self
            .exact_newton_joint_hessian_directional_derivative(block_states, &joint_direction)?
        else {
            return Err(format!(
                "missing joint exact-newton directional Hessian for block {block_idx}"
            ));
        };
        let block = joint.slice(s![start..end, start..end]).to_owned();
        Ok(Some(block))
    }

    /// Trait hook for the joint exact-Newton Hessian when no block specs are
    /// supplied.
    ///
    /// Calls `exact_newton_joint_hessian_for_specs` with `None` as the specs
    /// argument and returns its result.
    fn exact_newton_joint_hessian(
        &self,
        block_states: &[ParameterBlockState],
    ) -> Result<Option<Array2<f64>>, String> {
        self.exact_newton_joint_hessian_for_specs(block_states, None)
    }

    /// Capability flag for an explicit joint Hessian; this family always
    /// returns `true`.
    fn has_explicit_joint_hessian(&self) -> bool {
        true
    }

    /// Trait hook for the first directional derivative of the joint Hessian
    /// along `d_beta_flat` when no block specs are supplied.
    ///
    /// Calls `exact_newton_joint_hessian_directional_derivative_for_specs`
    /// with `None` as the specs argument and returns its result.
    fn exact_newton_joint_hessian_directional_derivative(
        &self,
        block_states: &[ParameterBlockState],
        d_beta_flat: &Array1<f64>,
    ) -> Result<Option<Array2<f64>>, String> {
        self.exact_newton_joint_hessian_directional_derivative_for_specs(
            block_states,
            None,
            d_beta_flat,
        )
    }

    /// Trait hook for the second directional derivative of the joint Hessian
    /// along the pair (`d_beta_u_flat`, `d_betav_flat`) when no block specs
    /// are supplied.
    ///
    /// Calls
    /// `exact_newton_joint_hessian_second_directional_derivative_for_specs`
    /// with `None` as the specs argument and returns its result.
    fn exact_newton_joint_hessiansecond_directional_derivative(
        &self,
        block_states: &[ParameterBlockState],
        d_beta_u_flat: &Array1<f64>,
        d_betav_flat: &Array1<f64>,
    ) -> Result<Option<Array2<f64>>, String> {
        self.exact_newton_joint_hessian_second_directional_derivative_for_specs(
            block_states,
            None,
            d_beta_u_flat,
            d_betav_flat,
        )
    }

    /// Spec-aware variant of the joint exact-Newton Hessian hook.
    ///
    /// Calls `exact_newton_joint_hessian_for_specs` with `Some(specs)` and
    /// returns its result.
    fn exact_newton_joint_hessian_with_specs(
        &self,
        block_states: &[ParameterBlockState],
        specs: &[ParameterBlockSpec],
    ) -> Result<Option<Array2<f64>>, String> {
        self.exact_newton_joint_hessian_for_specs(block_states, Some(specs))
    }

    /// Spec-aware variant of the first directional derivative of the joint
    /// Hessian along `d_beta_flat`.
    ///
    /// Calls `exact_newton_joint_hessian_directional_derivative_for_specs`
    /// with `Some(specs)` and returns its result.
    fn exact_newton_joint_hessian_directional_derivative_with_specs(
        &self,
        block_states: &[ParameterBlockState],
        specs: &[ParameterBlockSpec],
        d_beta_flat: &Array1<f64>,
    ) -> Result<Option<Array2<f64>>, String> {
        self.exact_newton_joint_hessian_directional_derivative_for_specs(
            block_states,
            Some(specs),
            d_beta_flat,
        )
    }

    /// Spec-aware variant of the second directional derivative of the joint
    /// Hessian along the pair (`d_beta_u_flat`, `d_betav_flat`).
    ///
    /// Calls
    /// `exact_newton_joint_hessian_second_directional_derivative_for_specs`
    /// with `Some(specs)` and returns its result.
    fn exact_newton_joint_hessian_second_directional_derivative_with_specs(
        &self,
        block_states: &[ParameterBlockState],
        specs: &[ParameterBlockSpec],
        d_beta_u_flat: &Array1<f64>,
        d_betav_flat: &Array1<f64>,
    ) -> Result<Option<Array2<f64>>, String> {
        self.exact_newton_joint_hessian_second_directional_derivative_for_specs(
            block_states,
            Some(specs),
            d_beta_u_flat,
            d_betav_flat,
        )
    }

    /// Joint gradient evaluation on the realized block designs.
    ///
    /// Returns `Ok(None)` when the owned joint block designs cannot be
    /// provided for `specs`; otherwise evaluates the joint gradient from
    /// those designs.
    fn exact_newton_joint_gradient_evaluation(
        &self,
        block_states: &[ParameterBlockState],
        specs: &[ParameterBlockSpec],
    ) -> Result<Option<ExactNewtonJointGradientEvaluation>, String> {
        match self.exact_joint_block_designs_owned(Some(specs))? {
            None => Ok(None),
            Some((x_t, x_ls)) => {
                let evaluation =
                    self.exact_newton_joint_gradient_from_designs(block_states, &x_t, &x_ls)?;
                Ok(Some(evaluation))
            }
        }
    }

    /// Builds the full-data joint-Hessian workspace.
    ///
    /// Returns `Ok(None)` when the owned joint block designs cannot be
    /// provided for `specs`. Otherwise the workspace takes ownership of the
    /// designs plus clones of the family and the block states.
    ///
    /// # Errors
    /// Propagates errors from the design lookup or the workspace constructor.
    fn exact_newton_joint_hessian_workspace(
        &self,
        block_states: &[ParameterBlockState],
        specs: &[ParameterBlockSpec],
    ) -> Result<Option<Arc<dyn ExactNewtonJointHessianWorkspace>>, String> {
        match self.exact_joint_block_designs_owned(Some(specs))? {
            None => Ok(None),
            Some((x_t, x_ls)) => {
                let shared: Arc<dyn ExactNewtonJointHessianWorkspace> =
                    Arc::new(BinomialLocationScaleHessianWorkspace::new(
                        self.clone(),
                        block_states.to_vec(),
                        x_t,
                        x_ls,
                    )?);
                Ok(Some(shared))
            }
        }
    }

    /// Joint-Hessian workspace for the outer optimizer, with optional row
    /// subsampling.
    ///
    /// With `options.outer_score_subsample` set to `None`, the result matches
    /// `exact_newton_joint_hessian_workspace`. With `Some`, the precomputed
    /// per-row coefficient arrays (`coeff_tt`, `coeff_tl`, `coeff_ll`) — which
    /// every downstream assembly (`hessian_dense`, `hessian_matvec`,
    /// `hessian_diagonal`) consumes row-linearly via `Xᵀ diag(W) X` — are
    /// replaced by a Horvitz–Thompson mask: each sampled row's coefficient is
    /// multiplied by `WeightedOuterRow.weight` (the inverse-inclusion factor
    /// 1/π_i; uniform or stratified sampling both supported), and non-sampled
    /// rows are zeroed. The resulting joint Hessian is an unbiased estimator
    /// of the full-data joint Hessian. Inner PIRLS never installs the option,
    /// so the inner solve continues to consume the exact full-data Hessian.
    fn exact_newton_joint_hessian_workspace_with_options(
        &self,
        block_states: &[ParameterBlockState],
        specs: &[ParameterBlockSpec],
        options: &BlockwiseFitOptions,
    ) -> Result<Option<Arc<dyn ExactNewtonJointHessianWorkspace>>, String> {
        let (x_t, x_ls) = match self.exact_joint_block_designs_owned(Some(specs))? {
            Some(designs) => designs,
            None => return Ok(None),
        };
        let mut workspace = BinomialLocationScaleHessianWorkspace::new(
            self.clone(),
            block_states.to_vec(),
            x_t,
            x_ls,
        )?;
        // The mask is applied before the workspace is shared behind the Arc.
        match options.outer_score_subsample.as_ref() {
            Some(subsample) => workspace.apply_outer_subsample(subsample.rows.as_ref()),
            None => {}
        }
        let shared: Arc<dyn ExactNewtonJointHessianWorkspace> = Arc::new(workspace);
        Ok(Some(shared))
    }

    /// Outer-derivative policy: declare HT-subsample capability. Always
    /// returns `true`.
    ///
    /// BinomialLocationScaleFamily overrides
    /// `log_likelihood_only_with_options` and
    /// `exact_newton_joint_hessian_workspace_with_options` to consume
    /// `options.outer_score_subsample` with per-row Horvitz–Thompson weights
    /// (each sampled row's contribution is multiplied by
    /// `WeightedOuterRow.weight = 1/π_i`; non-sampled rows are zeroed),
    /// yielding unbiased estimators of the full-data log-likelihood and
    /// joint Hessian. The ψ-workspace path is not yet subsample-aware: it
    /// builds the exact full-data ψ Hessian blocks, which are trivially
    /// unbiased. The outer-score components are therefore a sum of
    /// HT-unbiased and exact-unbiased pieces, and the total remains an
    /// unbiased estimator of the full-data outer score. Inner-PIRLS and
    /// final-covariance paths never install the option, so they continue to
    /// consume the exact full-data quantities.
    fn outer_derivative_subsample_capable(&self) -> bool {
        true
    }

    /// Whether the inner coefficient-Hessian Hessian-vector product can be
    /// applied for `specs`: exactly two blocks whose designs each have one
    /// row per observation.
    fn inner_coefficient_hessian_hvp_available(&self, specs: &[ParameterBlockSpec]) -> bool {
        // Representation support means the realized two-block designs can be
        // applied as β-space operators. It does not imply that exact
        // second-order outer θ work is cheap.
        let n_obs = self.y.len();
        // The length test short-circuits, so the block indices are only used
        // once both entries are known to exist.
        specs.len() == 2
            && [Self::BLOCK_T, Self::BLOCK_LOG_SIGMA]
                .iter()
                .all(|&block| specs[block].design.nrows() == n_obs)
    }

    /// Batched analytic-gradient hook (Fix #8).
    ///
    /// Falls through to `None` (generic per-θ_j path) whenever any θ_j is a
    /// ψ coordinate; the design-drift composition for ψ is handled by the
    /// existing unified evaluator. ρ-only is the common warm-start regime
    /// and the dominant biobank-scale cost.
    fn batched_outer_gradient_terms(
        &self,
        block_states: &[ParameterBlockState],
        specs: &[ParameterBlockSpec],
        derivative_blocks: &[Vec<CustomFamilyBlockPsiDerivative>],
        rho: &ndarray::Array1<f64>,
        options: &BlockwiseFitOptions,
        workspace: Option<Arc<dyn ExactNewtonJointHessianWorkspace>>,
    ) -> Result<Option<BatchedOuterGradientTerms>, String> {
        use crate::faer_ndarray::FaerCholesky;
        use faer::Side;

        if options.outer_score_subsample.is_some() {
            return Ok(None);
        }

        // ψ-coords fall back to the generic path; the leverage form here is
        // ρ-only (penalty hyperparameters).
        let psi_dim: usize = derivative_blocks.iter().map(Vec::len).sum();
        if psi_dim != 0 {
            return Ok(None);
        }

        if !self.exact_joint_supported() {
            return Ok(None);
        }
        if block_states.len() != 2 || specs.len() != 2 {
            return Ok(None);
        }

        // Designs and dimensions.
        let Some((x_t_cow, x_ls_cow)) = self.exact_joint_dense_block_designs(Some(specs))? else {
            return Ok(None);
        };
        let x_t = x_t_cow.into_owned();
        let x_ls = x_ls_cow.into_owned();
        let pt = x_t.ncols();
        let pls = x_ls.ncols();
        let total = pt + pls;
        let n = self.y.len();

        // Operator-aware downgrade: in the matrix-free regime the unified
        // per-θ_j path uses the family's `ExactNewtonJointHessianWorkspace`
        // (matvec + dH/d²H operators) and never materializes the dense
        // total×total joint Hessian, the dense Cholesky factor, or the
        // total×n leverage panels (`Q_t`, `Q_l`) that this batched fast-path
        // builds below. At biobank scale (e.g. n≈4·10⁵, total≈120) those
        // dense allocations and the n·total² leverage solve dominate
        // wall-clock time and inflate resident memory by ~6 GiB. Decline
        // the batched path when the joint dimensions cross the same gate
        // used for matrix-free outer routing — the unified evaluator will
        // produce identical gradient values via the operator workspace.
        if crate::custom_family::use_joint_matrix_free_path(total, n) {
            return Ok(None);
        }

        // ── Step 1: build dense joint Hessian H_L (unpenalized).
        let h_l = if let Some(workspace) = workspace.as_ref() {
            if let Some(hessian) = workspace.hessian_dense()? {
                hessian
            } else {
                self.exact_newton_joint_hessian_from_designs(block_states, &x_t, &x_ls)?
                    .ok_or_else(|| {
                        "BinomialLocationScaleFamily: unable to assemble joint Hessian for batched gradient"
                            .to_string()
                    })?
            }
        } else {
            self.exact_newton_joint_hessian_from_designs(block_states, &x_t, &x_ls)?
                .ok_or_else(|| {
                "BinomialLocationScaleFamily: unable to assemble joint Hessian for batched gradient"
                    .to_string()
            })?
        };

        // ── Step 2: assemble penalty `S_λ` and add to H.
        // Match the unified evaluator's per-block convention.
        let mut h = h_l.clone();
        let total_pen: usize = specs.iter().map(|s| s.penalties.len()).sum();
        if rho.len() != total_pen {
            return Ok(None);
        }
        // Per-block per-penalty lambdas.
        let mut per_block_rho: Vec<Vec<f64>> = Vec::with_capacity(specs.len());
        let mut cursor = 0;
        for spec in specs {
            let cnt = spec.penalties.len();
            let mut row = Vec::with_capacity(cnt);
            for k in 0..cnt {
                row.push(rho[cursor + k]);
            }
            per_block_rho.push(row);
            cursor += cnt;
        }
        // Ranges in flattened β.
        let ranges: Vec<(usize, usize)> = {
            let mut out = Vec::with_capacity(specs.len());
            let mut s_pos = 0usize;
            for spec in specs {
                let p = spec.design.ncols();
                out.push((s_pos, s_pos + p));
                s_pos += p;
            }
            out
        };
        // Add S_λ block-wise.
        for (b, spec) in specs.iter().enumerate() {
            let (start, end) = ranges[b];
            let p = end - start;
            let mut s_b = ndarray::Array2::<f64>::zeros((p, p));
            for (k, pen) in spec.penalties.iter().enumerate() {
                let lambda = per_block_rho[b][k].exp();
                pen.add_scaled_to(lambda, &mut s_b);
            }
            // Add to H.
            let mut h_block = h.slice_mut(s![start..end, start..end]);
            h_block += &s_b;
        }

        // ── Step 3: Cholesky-factor H.
        let factor = h
            .cholesky(Side::Lower)
            .map_err(|e| format!("BinomialLocationScale batched gradient: Cholesky failed: {e}"))?;

        // β flattened.
        let beta_flat = {
            let mut out = ndarray::Array1::<f64>::zeros(total);
            for b in 0..specs.len() {
                let (start, end) = ranges[b];
                out.slice_mut(s![start..end]).assign(&block_states[b].beta);
            }
            out
        };

        // ── Step 4: leverage blocks L_i = Z_i H⁻¹ Z_iᵀ (2×2 per row).
        // Solve H · M = Zᵀ where Z stacks both block designs into n × total
        // logical rows, but each row is (x_t_i, 0) for the threshold direction
        // and (0, x_ls_i) for the log-σ direction. We materialize Q_t and Q_l
        // as two (total × n) panels, one per channel.
        const LEVERAGE_CHUNK_ROWS: usize = 1024;
        const MIN_PARALLEL_LEVERAGE_ROWS: usize = 2 * LEVERAGE_CHUNK_ROWS;
        let leverage_chunk_rows = if n >= MIN_PARALLEL_LEVERAGE_ROWS {
            LEVERAGE_CHUNK_ROWS
        } else {
            n.max(1)
        };
        let leverage_chunks = n.div_ceil(leverage_chunk_rows);

        struct LeverageScratch {
            rhs_t: ndarray::Array2<f64>,
            rhs_l: ndarray::Array2<f64>,
        }

        impl LeverageScratch {
            fn new(total: usize, chunk_rows: usize) -> Self {
                Self {
                    rhs_t: ndarray::Array2::<f64>::zeros((total, chunk_rows)),
                    rhs_l: ndarray::Array2::<f64>::zeros((total, chunk_rows)),
                }
            }
        }

        let leverage_parts: Vec<(
            usize,
            ndarray::Array1<f64>,
            ndarray::Array1<f64>,
            ndarray::Array1<f64>,
        )> = (0..leverage_chunks)
            .into_par_iter()
            .map_init(
                || LeverageScratch::new(total, leverage_chunk_rows),
                |scratch, chunk_idx| {
                    let row_start = chunk_idx * leverage_chunk_rows;
                    let row_end = (row_start + leverage_chunk_rows).min(n);
                    let m = row_end - row_start;
                    let mut rhs_t = scratch.rhs_t.slice_mut(s![.., 0..m]);
                    let mut rhs_l = scratch.rhs_l.slice_mut(s![.., 0..m]);
                    rhs_t.fill(0.0);
                    rhs_l.fill(0.0);
                    for j in 0..m {
                        let i = row_start + j;
                        for c in 0..pt {
                            rhs_t[[c, j]] = x_t[[i, c]];
                        }
                        for c in 0..pls {
                            rhs_l[[pt + c, j]] = x_ls[[i, c]];
                        }
                    }
                    let q_t = factor.solve_mat(&rhs_t.to_owned());
                    let q_l = factor.solve_mat(&rhs_l.to_owned());
                    let mut chunk_00 = ndarray::Array1::<f64>::zeros(m);
                    let mut chunk_01 = ndarray::Array1::<f64>::zeros(m);
                    let mut chunk_11 = ndarray::Array1::<f64>::zeros(m);
                    for j in 0..m {
                        let i = row_start + j;
                        let mut l00 = 0.0;
                        let mut l11 = 0.0;
                        let mut l01 = 0.0;
                        for c in 0..pt {
                            l00 += x_t[[i, c]] * q_t[[c, j]];
                            l01 += x_t[[i, c]] * q_l[[c, j]];
                        }
                        for c in 0..pls {
                            l11 += x_ls[[i, c]] * q_l[[pt + c, j]];
                        }
                        chunk_00[j] = l00;
                        chunk_01[j] = l01;
                        chunk_11[j] = l11;
                    }
                    (row_start, chunk_00, chunk_01, chunk_11)
                },
            )
            .collect();

        // L00, L01, L11: per-row leverage entries.
        let mut leverage_00 = ndarray::Array1::<f64>::zeros(n);
        let mut leverage_01 = ndarray::Array1::<f64>::zeros(n);
        let mut leverage_11 = ndarray::Array1::<f64>::zeros(n);
        for (row_start, chunk_00, chunk_01, chunk_11) in leverage_parts {
            let row_end = row_start + chunk_00.len();
            leverage_00
                .slice_mut(s![row_start..row_end])
                .assign(&chunk_00);
            leverage_01
                .slice_mut(s![row_start..row_end])
                .assign(&chunk_01);
            leverage_11
                .slice_mut(s![row_start..row_end])
                .assign(&chunk_11);
        }

        // ── Step 5: per-coordinate accumulation.
        // Build (H^{-1})_{b,b} once per block; this amortizes
        // tr(H^{-1} A_k) across all penalties supported in block b.
        let h_inv_block_diag: Vec<ndarray::Array2<f64>> = (0..specs.len())
            .into_par_iter()
            .map(|b| {
                let (start, end) = ranges[b];
                let p_b = end - start;
                let mut rhs = ndarray::Array2::<f64>::zeros((total, p_b));
                for c in 0..p_b {
                    rhs[[start + c, c]] = 1.0;
                }
                let m_full = factor.solve_mat(&rhs);
                let mut block = ndarray::Array2::<f64>::zeros((p_b, p_b));
                for r in 0..p_b {
                    for c in 0..p_b {
                        block[[r, c]] = m_full[[start + r, c]];
                    }
                }
                block
            })
            .collect();

        // Pseudologdet helper for the penalty pseudo-inverse trace.
        let mut s_pseudologdet_blocks: Vec<
            crate::solver::estimate::reml::penalty_logdet::PenaltyPseudologdet,
        > = Vec::with_capacity(specs.len());
        for b in 0..specs.len() {
            let (start, end) = ranges[b];
            let p_b = end - start;
            let mut s_b = ndarray::Array2::<f64>::zeros((p_b, p_b));
            for (k, pen) in specs[b].penalties.iter().enumerate() {
                let lambda = per_block_rho[b][k].exp();
                pen.add_scaled_to(lambda, &mut s_b);
            }
            // No metadata-based structural-nullity hint: the
            // PenaltyPseudologdet classifier derives the positive eigenspace
            // from the assembled spectrum alone (issues #192/#318).
            s_pseudologdet_blocks.push(
                crate::solver::estimate::reml::penalty_logdet::PenaltyPseudologdet::from_assembled(
                    s_b, None,
                )?,
            );
        }

        // Cache the family core once: per-row scalars are independent of u_k.
        let core = binomial_location_scale_core(
            &self.y,
            &self.weights,
            &block_states[Self::BLOCK_T].eta,
            &block_states[Self::BLOCK_LOG_SIGMA].eta,
            None,
            &self.link_kind,
        )?;
        // Pre-compute per-row m1/m2/m3, r, s_factor, q.
        let mut row_m1 = ndarray::Array1::<f64>::zeros(n);
        let mut row_m2 = ndarray::Array1::<f64>::zeros(n);
        let mut row_m3 = ndarray::Array1::<f64>::zeros(n);
        let mut row_r = ndarray::Array1::<f64>::zeros(n);
        let mut row_s = ndarray::Array1::<f64>::zeros(n);
        let mut row_q = ndarray::Array1::<f64>::zeros(n);
        let row_scalars: Vec<(f64, f64, f64, f64, f64, f64)> = (0..n)
            .into_par_iter()
            .map(|i| {
                let q = core.q0[i];
                let r = 1.0 / core.sigma[i];
                let s_factor = core.dsigma_deta[i] / core.sigma[i];
                let (m1, m2, m3) = binomial_neglog_q_derivatives_dispatch(
                    self.y[i],
                    self.weights[i],
                    q,
                    core.mu[i],
                    core.dmu_dq[i],
                    core.d2mu_dq2[i],
                    core.d3mu_dq3[i],
                    &self.link_kind,
                );
                (m1, m2, m3, r, s_factor, q)
            })
            .collect();
        for (i, (m1, m2, m3, r, s_factor, q)) in row_scalars.into_iter().enumerate() {
            row_m1[i] = m1;
            row_m2[i] = m2;
            row_m3[i] = m3;
            row_r[i] = r;
            row_s[i] = s_factor;
            row_q[i] = q;
        }

        let mut objective_theta = ndarray::Array1::<f64>::zeros(total_pen);
        let mut trace_h_inv_hdot = ndarray::Array1::<f64>::zeros(total_pen);
        let mut trace_s_pinv_sdot = ndarray::Array1::<f64>::zeros(total_pen);

        const MIN_PARALLEL_PENALTY_COORDS: usize = 2;
        let mut penalty_coords = Vec::with_capacity(total_pen);
        let mut flat_idx = 0usize;
        for b in 0..specs.len() {
            for k_local in 0..specs[b].penalties.len() {
                penalty_coords.push((flat_idx, b, k_local));
                flat_idx += 1;
            }
        }
        let penalty_coord_chunk_size = if penalty_coords.len() >= MIN_PARALLEL_PENALTY_COORDS {
            1
        } else {
            penalty_coords.len().max(1)
        };

        struct PenaltyGradientPart {
            flat_idx: usize,
            objective_theta: f64,
            trace_h_inv_hdot: f64,
            trace_s_pinv_sdot: f64,
        }

        let penalty_parts: Vec<Result<Vec<PenaltyGradientPart>, String>> = penalty_coords
            .par_chunks(penalty_coord_chunk_size)
            .map(|chunk| {
                let mut chunk_parts = Vec::with_capacity(chunk.len());
                for &(flat_idx, b, k_local) in chunk {
                    let (start, end) = ranges[b];
                    let p_b = end - start;
                    let beta_b = beta_flat.slice(s![start..end]).to_owned();
                    let pen = &specs[b].penalties[k_local];
                    let lambda_k = per_block_rho[b][k_local].exp();
                    let mut s_k_local = ndarray::Array2::<f64>::zeros((p_b, p_b));
                    pen.add_scaled_to(lambda_k, &mut s_k_local);
                    let s_k_beta_local = s_k_local.dot(&beta_b);
                    let objective_theta = 0.5 * beta_b.dot(&s_k_beta_local);

                    // u_k = -H^{-1} (A_k β).
                    let mut a_k_beta_full = ndarray::Array1::<f64>::zeros(total);
                    a_k_beta_full
                        .slice_mut(s![start..end])
                        .assign(&s_k_beta_local);
                    let mut u_k = factor.solvevec(&a_k_beta_full);
                    u_k.mapv_inplace(|v| -v);

                    // tr(H^{-1} A_k) = tr( (H^{-1})_{b,b} · (λ_k S_k) ).
                    let m_block = &h_inv_block_diag[b];
                    let mut tr_pen = 0.0;
                    for r in 0..p_b {
                        for c in 0..p_b {
                            tr_pen += m_block[[r, c]] * s_k_local[[c, r]];
                        }
                    }

                    // Drift trace: Σ_i tr(C_i(u_k) · L_i).
                    let u_k_t = u_k.slice(s![0..pt]).to_owned();
                    let u_k_ls = u_k.slice(s![pt..total]).to_owned();
                    let d_eta_t = fast_av(&x_t, &u_k_t);
                    let d_eta_ls = fast_av(&x_ls, &u_k_ls);
                    let mut drift_trace = 0.0;
                    for i in 0..n {
                        let q = row_q[i];
                        let r_val = row_r[i];
                        let s_factor = row_s[i];
                        let m1 = row_m1[i];
                        let m2 = row_m2[i];
                        let m3 = row_m3[i];
                        let a_eta = d_eta_t[i];
                        let b_eta = d_eta_ls[i];
                        let sb = s_factor * b_eta;
                        let du = -r_val * a_eta - q * sb;
                        let c_tt = r_val * r_val * (m3 * du - 2.0 * m2 * sb);
                        let c_tl =
                            s_factor * r_val * (q * m3 * du + m2 * (2.0 * du - q * sb) - m1 * sb);
                        let c_ll = s_factor * s_factor * (m1 + 3.0 * q * m2 + q * q * m3) * du;
                        drift_trace += c_tt * leverage_00[i]
                            + 2.0 * c_tl * leverage_01[i]
                            + c_ll * leverage_11[i];
                    }

                    // Penalty pseudo-logdet derivative: tr(S^+ · λ_k S_k) (block-local).
                    let trace_s_pinv_sdot =
                        s_pseudologdet_blocks[b].tau_gradient_component(&s_k_local);

                    chunk_parts.push(PenaltyGradientPart {
                        flat_idx,
                        objective_theta,
                        trace_h_inv_hdot: tr_pen + drift_trace,
                        trace_s_pinv_sdot,
                    });
                }
                Ok(chunk_parts)
            })
            .collect();

        for chunk in penalty_parts {
            for part in chunk? {
                objective_theta[part.flat_idx] = part.objective_theta;
                trace_h_inv_hdot[part.flat_idx] = part.trace_h_inv_hdot;
                trace_s_pinv_sdot[part.flat_idx] = part.trace_s_pinv_sdot;
            }
        }

        Ok(Some(BatchedOuterGradientTerms {
            objective_theta,
            trace_h_inv_hdot,
            trace_s_pinv_sdot,
        }))
    }
}

impl CustomFamilyGenerative for BinomialLocationScaleFamily {
    /// Builds the generative specification at the supplied block states.
    ///
    /// For every row the sigma returned by `exp_sigma_from_eta_scalar` and the
    /// threshold predictor are combined by `binomial_location_scale_q0`; the
    /// row mean is the `mu` of the inverse-link jet at that value. The noise
    /// model is always `NoiseModel::Bernoulli`.
    ///
    /// # Errors
    /// Fails when `block_states` does not hold exactly two blocks, when either
    /// block's `eta` length differs from `self.y`, or when the inverse-link
    /// jet cannot be evaluated for some row.
    fn generativespec(
        &self,
        block_states: &[ParameterBlockState],
    ) -> Result<GenerativeSpec, String> {
        let block_count = block_states.len();
        if block_count != 2 {
            let mismatch = GamlssError::DimensionMismatch {
                reason: format!(
                    "BinomialLocationScaleFamily expects 2 blocks, got {}",
                    block_count
                ),
            };
            return Err(mismatch.into());
        }

        let n_obs = self.y.len();
        let threshold_eta = &block_states[Self::BLOCK_T].eta;
        let log_sigma_eta = &block_states[Self::BLOCK_LOG_SIGMA].eta;
        let lengths_agree = threshold_eta.len() == n_obs && log_sigma_eta.len() == n_obs;
        if !lengths_agree {
            let mismatch = GamlssError::DimensionMismatch {
                reason: "BinomialLocationScaleFamily generative size mismatch".to_string(),
            };
            return Err(mismatch.into());
        }

        // Row-wise mean: inverse link applied to the location-scale index.
        let mean = gamlss_rowwise_map_result(n_obs, |row| {
            let sigma = exp_sigma_from_eta_scalar(log_sigma_eta[row]);
            let q = binomial_location_scale_q0(threshold_eta[row], sigma);
            let jet = inverse_link_jet_for_inverse_link(&self.link_kind, q)
                .map_err(|e| format!("location-scale inverse-link evaluation failed: {e}"))?;
            Ok(jet.mu)
        })?;

        let noise = NoiseModel::Bernoulli;
        Ok(GenerativeSpec { mean, noise })
    }
}

/// Matrix-free joint-Hessian operator for the two-block binomial
/// location-scale family.
///
/// The dense joint Hessian is `H = [[X_t^T D_tt X_t, X_t^T D_tl X_ls],
///                                  [X_ls^T D_tl X_t, X_ls^T D_ll X_ls]]`
/// where `D_tt`, `D_tl`, `D_ll` are diagonal weight vectors derived from the
/// rowwise scalar-composition Hessian. For a flattened direction
/// `v = (v_t, v_ls)`, `H v` is computed as
///
///   u_t = X_t v_t,  u_ls = X_ls v_ls,
///   r_t = D_tt .* u_t + D_tl .* u_ls,
///   r_ls = D_tl .* u_t + D_ll .* u_ls,
///   H v = (X_t^T r_t, X_ls^T r_ls).
///
/// Cost is Θ(n (p_t + p_ls)) per matvec versus Θ(n (p_t + p_ls)^2) to form
/// the dense matrix. The same block-operator structure is used for first and
/// second directional derivatives.
struct BinomialLocationScaleHessianWorkspace {
    /// Family the workspace was built for; its `y`, `weights` and `link_kind`
    /// are passed to the directional-coefficient helpers.
    family: BinomialLocationScaleFamily,
    /// Design of the threshold block (`X_t` above).
    x_t: DesignMatrix,
    /// Design of the log-sigma block (`X_ls` above).
    x_ls: DesignMatrix,
    /// Row-wise family core, evaluated once in `new` from the block predictors.
    core: BinomialLocationScaleCore,
    /// Per-row weights `D_tt` (threshold/threshold block).
    coeff_tt: Array1<f64>,
    /// Per-row weights `D_tl` (threshold/log-sigma cross block).
    coeff_tl: Array1<f64>,
    /// Per-row weights `D_ll` (log-sigma/log-sigma block).
    coeff_ll: Array1<f64>,
    /// Cached `(X_t d_t, X_ls d_ls)` products, keyed by the direction's exact
    /// bit pattern.
    direction_eta_cache: Mutex<HashMap<BinomialDirectionKey, Arc<BinomialDirectionEta>>>,
    /// Cached first directional-derivative row coefficients, keyed the same way.
    first_coeff_cache: Mutex<HashMap<BinomialDirectionKey, Arc<BinomialRowCoeffTriple>>>,
    // No `second_coeff_cache` deliberately: see `second_coefficients` for why
    // the per-pair cache was a memory-only loss at biobank shape.
}

/// Hashable cache key for a coefficient-space direction.
///
/// Holds the raw `f64::to_bits` pattern of every entry, so two directions
/// share a key only when they are bitwise identical (e.g. `0.0` and `-0.0`
/// produce different keys).
#[derive(Clone, Eq, Hash, PartialEq)]
struct BinomialDirectionKey {
    /// Bit patterns of the direction entries, in order.
    bits: Vec<u64>,
}

impl BinomialDirectionKey {
    /// Builds the key for `v` by recording the exact bit pattern of each
    /// entry in iteration order.
    fn from_array(v: &Array1<f64>) -> Self {
        let mut bits = Vec::with_capacity(v.len());
        bits.extend(v.iter().copied().map(f64::to_bits));
        Self { bits }
    }
}

/// Linear-predictor increments induced by one flattened coefficient
/// direction, as produced by `BinomialLocationScaleHessianWorkspace::direction_eta`.
struct BinomialDirectionEta {
    /// `X_t` applied to the threshold part of the direction.
    t: Array1<f64>,
    /// `X_ls` applied to the log-sigma part of the direction.
    ls: Array1<f64>,
}

/// Three per-row coefficient vectors describing a two-block row-coefficient
/// operator; each vector is reference-counted so it can be handed to an
/// operator without copying.
struct BinomialRowCoeffTriple {
    /// Coefficients of the threshold/threshold block.
    tt: Arc<Array1<f64>>,
    /// Coefficients of the threshold/log-sigma cross block.
    tl: Arc<Array1<f64>>,
    /// Coefficients of the log-sigma/log-sigma block.
    ll: Arc<Array1<f64>>,
}

impl BinomialLocationScaleHessianWorkspace {
    /// Builds the workspace at the current block states.
    ///
    /// Evaluates the row-wise family core from the threshold and log-sigma
    /// predictors, then the three joint-Hessian row-coefficient vectors, and
    /// starts with empty direction caches. `block_states` is indexed directly
    /// at `BLOCK_T` and `BLOCK_LOG_SIGMA`, so both blocks must be present.
    ///
    /// # Errors
    /// Propagates failures from `binomial_location_scale_core` and from
    /// `exact_newton_joint_hessian_row_coefficients`.
    fn new(
        family: BinomialLocationScaleFamily,
        block_states: Vec<ParameterBlockState>,
        x_t: DesignMatrix,
        x_ls: DesignMatrix,
    ) -> Result<Self, String> {
        let threshold_state = &block_states[BinomialLocationScaleFamily::BLOCK_T];
        let log_sigma_state = &block_states[BinomialLocationScaleFamily::BLOCK_LOG_SIGMA];
        let core = binomial_location_scale_core(
            &family.y,
            &family.weights,
            &threshold_state.eta,
            &log_sigma_state.eta,
            None,
            &family.link_kind,
        )?;

        let row_coefficients = family.exact_newton_joint_hessian_row_coefficients(&block_states)?;
        let (coeff_tt, coeff_tl, coeff_ll) = row_coefficients;

        let direction_eta_cache = Mutex::new(HashMap::new());
        let first_coeff_cache = Mutex::new(HashMap::new());
        Ok(Self {
            family,
            x_t,
            x_ls,
            core,
            coeff_tt,
            coeff_tl,
            coeff_ll,
            direction_eta_cache,
            first_coeff_cache,
        })
    }

    /// Returns the linear-predictor increments for direction `d_beta`,
    /// computing and caching them under `key` on a miss.
    ///
    /// `d_beta[0..pt]` is multiplied by `x_t` and `d_beta[pt..total]` by
    /// `x_ls`. The products are computed without holding the cache lock; if
    /// another entry for `key` was inserted meanwhile, that entry is returned
    /// and the fresh value is discarded.
    ///
    /// # Panics
    /// Panics if the cache mutex is poisoned.
    fn direction_eta(
        &self,
        key: &BinomialDirectionKey,
        d_beta: &Array1<f64>,
        pt: usize,
        total: usize,
    ) -> Arc<BinomialDirectionEta> {
        // Fast path: the guard is released at the end of this statement.
        let cached = self
            .direction_eta_cache
            .lock()
            .expect("binomial direction eta cache lock poisoned")
            .get(key)
            .cloned();
        if let Some(hit) = cached {
            return hit;
        }

        let eta_t = self
            .x_t
            .matrixvectormultiply(&d_beta.slice(s![0..pt]).to_owned());
        let eta_ls = self
            .x_ls
            .matrixvectormultiply(&d_beta.slice(s![pt..total]).to_owned());
        let fresh = Arc::new(BinomialDirectionEta {
            t: eta_t,
            ls: eta_ls,
        });

        // Whichever value reaches the map first wins.
        let mut guard = self
            .direction_eta_cache
            .lock()
            .expect("binomial direction eta cache lock poisoned");
        guard.entry(key.clone()).or_insert(fresh).clone()
    }

    /// Returns the first directional-derivative row coefficients for the
    /// direction identified by `key`, computing and caching them on a miss.
    ///
    /// The coefficients come from
    /// `binomial_location_scale_first_directional_coefficients`, evaluated at
    /// the stored core and the supplied predictor increments `eta`. The work
    /// is done without holding the cache lock; if another entry for `key` was
    /// inserted meanwhile, that entry is returned instead.
    ///
    /// # Panics
    /// Panics if the cache mutex is poisoned.
    fn first_coefficients(
        &self,
        key: &BinomialDirectionKey,
        eta: &BinomialDirectionEta,
    ) -> Arc<BinomialRowCoeffTriple> {
        // Fast path: the guard is released at the end of this statement.
        let cached = self
            .first_coeff_cache
            .lock()
            .expect("binomial first coefficient cache lock poisoned")
            .get(key)
            .cloned();
        if let Some(hit) = cached {
            return hit;
        }

        let family = &self.family;
        let (tt, tl, ll) = binomial_location_scale_first_directional_coefficients(
            &family.y,
            &family.weights,
            &self.core,
            &eta.t,
            &eta.ls,
            &family.link_kind,
        );
        let fresh = Arc::new(BinomialRowCoeffTriple {
            tt: Arc::new(tt),
            tl: Arc::new(tl),
            ll: Arc::new(ll),
        });

        // Whichever value reaches the map first wins.
        let mut guard = self
            .first_coeff_cache
            .lock()
            .expect("binomial first coefficient cache lock poisoned");
        guard.entry(key.clone()).or_insert(fresh).clone()
    }

    /// Computes the second directional-derivative row coefficients for the
    /// direction pair `(eta_u, eta_v)`.
    ///
    /// Deliberately uncached: at biobank shape (n=320k, K=14 outer coords)
    /// the K² ≈ 196 unique direction pairs are each requested exactly once
    /// per outer Hessian evaluation, and a cached entry would hold 3·n f64s
    /// (~7.7 MB), i.e. ~1.5 GB peak per evaluation with no practical hit
    /// rate. Directions also move with ρ/ψ between outer evaluations, so
    /// cross-evaluation hits are nil. Recomputing is O(n) — under 10 ms at
    /// that scale, dwarfed by the (n × p²) trace work consuming the result.
    ///
    /// # Errors
    /// Propagates failures from
    /// `binomial_location_scalesecond_directional_coefficients`.
    fn second_coefficients(
        &self,
        eta_u: &BinomialDirectionEta,
        eta_v: &BinomialDirectionEta,
    ) -> Result<Arc<BinomialRowCoeffTriple>, String> {
        let family = &self.family;
        let row_coefficients = binomial_location_scalesecond_directional_coefficients(
            &family.y,
            &family.weights,
            &self.core,
            &eta_u.t,
            &eta_u.ls,
            &eta_v.t,
            &eta_v.ls,
            &family.link_kind,
        )?;
        let (tt, tl, ll) = row_coefficients;
        let triple = BinomialRowCoeffTriple {
            tt: Arc::new(tt),
            tl: Arc::new(tl),
            ll: Arc::new(ll),
        };
        Ok(Arc::new(triple))
    }

    /// Restricts the stored per-row Hessian coefficients to a
    /// Horvitz–Thompson outer-row subsample, in place.
    ///
    /// For each entry of `rows`, `coeff_*[index]` becomes the previous value
    /// times that entry's `weight` (the HT inverse-inclusion factor 1/π_i);
    /// every row not listed ends up zero. If an index appears more than once,
    /// the last occurrence determines the stored value. Because the
    /// downstream assemblies (`hessian_dense`, `hessian_matvec`,
    /// `hessian_diagonal`) are row-linear in these arrays via `Xᵀ diag(W) X`,
    /// the resulting joint Hessian is an unbiased estimator of the full-data
    /// joint Hessian.
    ///
    /// # Panics
    /// Panics if a row index is outside the coefficient arrays.
    fn apply_outer_subsample(
        &mut self,
        rows: &[crate::families::marginal_slope_shared::WeightedOuterRow],
    ) {
        let row_count = self.coeff_tt.len();
        let mut weighted_tt = Array1::<f64>::zeros(row_count);
        let mut weighted_tl = Array1::<f64>::zeros(row_count);
        let mut weighted_ll = Array1::<f64>::zeros(row_count);

        for sampled in rows {
            let row = sampled.index;
            weighted_tt[row] = self.coeff_tt[row] * sampled.weight;
            weighted_tl[row] = self.coeff_tl[row] * sampled.weight;
            weighted_ll[row] = self.coeff_ll[row] * sampled.weight;
        }

        // Swap in the masked arrays only after the whole sample was processed.
        self.coeff_tt = weighted_tt;
        self.coeff_tl = weighted_tl;
        self.coeff_ll = weighted_ll;
    }
}

impl ExactNewtonJointHessianWorkspace for BinomialLocationScaleHessianWorkspace {
    /// Assembles the dense joint Hessian from three weighted cross-products.
    ///
    /// # Errors
    /// Propagates failures from the `xt_diag_x_design` / `xt_diag_y_design`
    /// helpers.
    fn hessian_dense(&self) -> Result<Option<Array2<f64>>, String> {
        // Same structure as `hessian_matvec`, but formed once through
        //   H_tt = X_tᵀ diag(coeff_tt) X_t,
        //   H_tl = X_tᵀ diag(coeff_tl) X_ls,
        //   H_ll = X_lsᵀ diag(coeff_ll) X_ls,
        // rather than having `MatrixFreeSpdOperator::materialize_dense_operator`
        // rebuild the matrix from `total` canonical-basis Hessian-vector
        // products, which at biobank scale costs p_total full HVPs. The design
        // helpers stream row chunks, so the only dense object kept here is the
        // small p_total×p_total coefficient Hessian.
        let pt = self.x_t.ncols();
        let total = pt + self.x_ls.ncols();

        let block_tt = xt_diag_x_design(&self.x_t, &self.coeff_tt)?;
        let block_tl = xt_diag_y_design(&self.x_t, &self.coeff_tl, &self.x_ls)?;
        let block_ll = xt_diag_x_design(&self.x_ls, &self.coeff_ll)?;

        // Fill the upper triangle block-wise, then reflect it.
        let mut dense = Array2::<f64>::zeros((total, total));
        dense.slice_mut(s![..pt, ..pt]).assign(&block_tt);
        dense.slice_mut(s![..pt, pt..]).assign(&block_tl);
        dense.slice_mut(s![pt.., pt..]).assign(&block_ll);
        mirror_upper_to_lower(&mut dense);
        Ok(Some(dense))
    }

    /// Always `true`: this workspace implements `hessian_matvec`.
    fn hessian_matvec_available(&self) -> bool {
        true
    }

    /// Applies the joint Hessian to the flattened direction `v = (v_t, v_ls)`
    /// without forming the matrix.
    ///
    /// # Errors
    /// Returns a dimension-mismatch error when `v.len()` differs from the
    /// combined column count of the two designs.
    fn hessian_matvec(&self, v: &Array1<f64>) -> Result<Option<Array1<f64>>, String> {
        let pt = self.x_t.ncols();
        let total = pt + self.x_ls.ncols();
        if v.len() != total {
            let mismatch = GamlssError::DimensionMismatch {
                reason: format!(
                    "BinomialLocationScale matvec dimension mismatch: got {}, expected {}",
                    v.len(),
                    total
                ),
            };
            return Err(mismatch.into());
        }

        // Push the direction through each design: u_t = X_t v_t, u_ls = X_ls v_ls.
        let v_t = v.slice(s![..pt]).to_owned();
        let u_t = self.x_t.matrixvectormultiply(&v_t);
        let v_ls = v.slice(s![pt..]).to_owned();
        let u_ls = self.x_ls.matrixvectormultiply(&v_ls);

        // Row-wise 2×2 mixing:
        //   r_t = D_tt .* u_t + D_tl .* u_ls,  r_ls = D_tl .* u_t + D_ll .* u_ls.
        let r_t = &self.coeff_tt * &u_t + &self.coeff_tl * &u_ls;
        let r_ls = &self.coeff_tl * &u_t + &self.coeff_ll * &u_ls;

        // Pull back through the transposed designs and stack the halves.
        let hv_t = self.x_t.transpose_vector_multiply(&r_t);
        let hv_ls = self.x_ls.transpose_vector_multiply(&r_ls);
        let mut stacked = Array1::<f64>::zeros(total);
        stacked.slice_mut(s![..pt]).assign(&hv_t);
        stacked.slice_mut(s![pt..]).assign(&hv_ls);
        Ok(Some(stacked))
    }

    /// Returns the diagonal of the joint Hessian: the `coeff_tt`-weighted
    /// column squares of `x_t` followed by the `coeff_ll`-weighted column
    /// squares of `x_ls`.
    ///
    /// # Errors
    /// Propagates failures from `design_weighted_column_squares`.
    fn hessian_diagonal(&self) -> Result<Option<Array1<f64>>, String> {
        let pt = self.x_t.ncols();
        let threshold_part = design_weighted_column_squares(&self.x_t, &self.coeff_tt)?;
        let log_sigma_part = design_weighted_column_squares(&self.x_ls, &self.coeff_ll)?;

        let mut diagonal = Array1::<f64>::zeros(pt + self.x_ls.ncols());
        diagonal.slice_mut(s![..pt]).assign(&threshold_part);
        diagonal.slice_mut(s![pt..]).assign(&log_sigma_part);
        Ok(Some(diagonal))
    }

    /// Dense form of the Hessian's first directional derivative along
    /// `d_beta_flat`, obtained by densifying the operator returned by
    /// `directional_derivative_operator`.
    ///
    /// # Errors
    /// Propagates any error from `directional_derivative_operator`.
    fn directional_derivative(
        &self,
        d_beta_flat: &Array1<f64>,
    ) -> Result<Option<Array2<f64>>, String> {
        let Some(operator) = self.directional_derivative_operator(d_beta_flat)? else {
            return Ok(None);
        };
        Ok(Some(operator.to_dense()))
    }

    /// Builds the operator for the Hessian's first directional derivative
    /// along `d_beta_flat`.
    ///
    /// The direction's predictor increments and first-order row coefficients
    /// are fetched from (or inserted into) the per-direction caches, then
    /// wrapped in a two-block design row-coefficient operator.
    ///
    /// # Errors
    /// Returns an invalid-input error when `d_beta_flat.len()` differs from
    /// the combined column count of the designs, and propagates failures from
    /// `make_two_block_design_row_coeff_operator`.
    fn directional_derivative_operator(
        &self,
        d_beta_flat: &Array1<f64>,
    ) -> Result<Option<Arc<dyn crate::solver::estimate::reml::unified::HyperOperator>>, String>
    {
        let pt = self.x_t.ncols();
        let total = pt + self.x_ls.ncols();
        if d_beta_flat.len() != total {
            let invalid = GamlssError::InvalidInput {
                reason: format!(
                    "BinomialLocationScale dH operator: d_beta length {} != {}",
                    d_beta_flat.len(),
                    total
                ),
            };
            return Err(invalid.into());
        }

        let key = BinomialDirectionKey::from_array(d_beta_flat);
        let eta = self.direction_eta(&key, d_beta_flat, pt, total);
        let coeffs = self.first_coefficients(&key, &eta);

        let operator = make_two_block_design_row_coeff_operator(
            self.x_t.clone(),
            self.x_ls.clone(),
            Arc::clone(&coeffs.tt),
            Arc::clone(&coeffs.tl),
            Arc::clone(&coeffs.ll),
        )?;
        Ok(Some(Arc::new(operator)))
    }

    /// Dense form of the Hessian's second directional derivative along the
    /// pair `(d_beta_u_flat, d_beta_v_flat)`, obtained by densifying the
    /// operator returned by `second_directional_derivative_operator`.
    ///
    /// # Errors
    /// Propagates any error from `second_directional_derivative_operator`.
    fn second_directional_derivative(
        &self,
        d_beta_u_flat: &Array1<f64>,
        d_beta_v_flat: &Array1<f64>,
    ) -> Result<Option<Array2<f64>>, String> {
        let Some(operator) =
            self.second_directional_derivative_operator(d_beta_u_flat, d_beta_v_flat)?
        else {
            return Ok(None);
        };
        Ok(Some(operator.to_dense()))
    }

    /// Builds the operator for the Hessian's second directional derivative
    /// along the pair `(d_beta_u, d_beta_v)`.
    ///
    /// Predictor increments for both directions go through the per-direction
    /// cache; the second-order row coefficients are recomputed on every call
    /// and wrapped in a two-block design row-coefficient operator.
    ///
    /// # Errors
    /// Returns an invalid-input error when either direction's length differs
    /// from the combined column count of the designs, and propagates failures
    /// from `second_coefficients` and
    /// `make_two_block_design_row_coeff_operator`.
    fn second_directional_derivative_operator(
        &self,
        d_beta_u: &Array1<f64>,
        d_beta_v: &Array1<f64>,
    ) -> Result<Option<Arc<dyn crate::solver::estimate::reml::unified::HyperOperator>>, String>
    {
        let pt = self.x_t.ncols();
        let total = pt + self.x_ls.ncols();
        let lengths_ok = d_beta_u.len() == total && d_beta_v.len() == total;
        if !lengths_ok {
            let invalid = GamlssError::InvalidInput {
                reason: format!(
                    "BinomialLocationScale d2H operator: d_beta_{{u,v}} length {}/{} != {}",
                    d_beta_u.len(),
                    d_beta_v.len(),
                    total
                ),
            };
            return Err(invalid.into());
        }

        let key_u = BinomialDirectionKey::from_array(d_beta_u);
        let key_v = BinomialDirectionKey::from_array(d_beta_v);
        let eta_u = self.direction_eta(&key_u, d_beta_u, pt, total);
        let eta_v = self.direction_eta(&key_v, d_beta_v, pt, total);
        let coeffs = self.second_coefficients(&eta_u, &eta_v)?;

        let operator = make_two_block_design_row_coeff_operator(
            self.x_t.clone(),
            self.x_ls.clone(),
            Arc::clone(&coeffs.tt),
            Arc::clone(&coeffs.tl),
            Arc::clone(&coeffs.ll),
        )?;
        Ok(Some(Arc::new(operator)))
    }
}

/// Built-in binomial location-scale family with a configurable inverse link and learnable wiggle on q.
///
/// Block structure:
/// - Block 0: threshold T(covariates)
/// - Block 1: log sigma(covariates)
/// - Block 2: wiggle(q) represented by B-spline coefficients on q
#[derive(Clone)]
pub struct BinomialLocationScaleWiggleFamily {
    /// Per-row responses.
    /// NOTE(review): presumably Bernoulli outcomes, as in the two-block
    /// family — confirm against callers.
    pub y: Array1<f64>,
    /// Per-row weights.
    /// NOTE(review): presumably observation/prior weights — confirm.
    pub weights: Array1<f64>,
    /// Inverse link selected for the family.
    pub link_kind: InverseLink,
    /// Design of the threshold block, when available. `exact_joint_supported`
    /// requires both this and `log_sigma_design` to be present.
    pub threshold_design: Option<DesignMatrix>,
    /// Design of the log-sigma block, when available.
    pub log_sigma_design: Option<DesignMatrix>,
    /// Knot vector passed to the monotone wiggle basis builder.
    pub wiggle_knots: Array1<f64>,
    /// Spline degree passed to the monotone wiggle basis builder.
    pub wiggle_degree: usize,
    /// Resource policy threaded into PsiDesignMap construction (and any other
    /// per-call materialization decision) made during exact-Newton joint psi
    /// derivative evaluation. Defaults to `ResourcePolicy::default_library()`
    /// when the family is built without an explicit policy.
    pub policy: crate::resource::ResourcePolicy,
}

impl BinomialLocationScaleWiggleFamily {
    /// Index of the threshold block in per-block slices such as `block_states`.
    pub const BLOCK_T: usize = 0;
    /// Index of the log-sigma block in per-block slices such as `block_states`.
    pub const BLOCK_LOG_SIGMA: usize = 1;
    /// Index of the wiggle block in per-block slices such as `block_states`.
    pub const BLOCK_WIGGLE: usize = 2;

    /// Names of the three parameter blocks, listed in block-index order.
    pub fn parameternames() -> &'static [&'static str] {
        static BLOCK_NAMES: [&str; 3] = ["threshold", "log_sigma", "wiggle"];
        &BLOCK_NAMES
    }

    /// Link descriptors of the three parameter blocks, listed in block-index
    /// order (threshold, log-sigma, wiggle).
    pub fn parameter_links() -> &'static [ParameterLink] {
        const BLOCK_LINKS: [ParameterLink; 3] = [
            ParameterLink::InverseLink,
            ParameterLink::Log,
            ParameterLink::Wiggle,
        ];
        &BLOCK_LINKS
    }

    /// Static family descriptor: the family name plus the block names and
    /// block links reported by `parameternames` and `parameter_links`.
    pub fn metadata() -> FamilyMetadata {
        let parameternames = Self::parameternames();
        let parameter_links = Self::parameter_links();
        FamilyMetadata {
            name: "binomial_location_scalewiggle",
            parameternames,
            parameter_links,
        }
    }

    /// True exactly when the family carries both a threshold design and a
    /// log-sigma design.
    fn exact_joint_supported(&self) -> bool {
        matches!(
            (&self.threshold_design, &self.log_sigma_design),
            (Some(_), Some(_))
        )
    }

    /// Build a wiggle knot vector from seed values of q.
    ///
    /// Forwards all three arguments unchanged to
    /// `initializewiggle_knots_from_seed` and returns its result.
    pub fn initializewiggle_knots_from_q(
        q_seed: ArrayView1<'_, f64>,
        degree: usize,
        num_internal_knots: usize,
    ) -> Result<Array1<f64>, String> {
        let knots = initializewiggle_knots_from_seed(q_seed, degree, num_internal_knots)?;
        Ok(knots)
    }

    /// Evaluate the monotone wiggle basis at `q0` using this family's knots
    /// and degree.
    ///
    /// Only `basis_options.derivative_order` is consulted; it selects which
    /// derivative of the basis is returned.
    fn wiggle_basiswith_options(
        &self,
        q0: ArrayView1<'_, f64>,
        basis_options: BasisOptions,
    ) -> Result<Array2<f64>, String> {
        let order = basis_options.derivative_order;
        monotone_wiggle_basis_with_derivative_order(
            q0,
            &self.wiggle_knots,
            self.wiggle_degree,
            order,
        )
    }

    /// Wiggle basis at `q0`, evaluated with `BasisOptions::value()`.
    fn wiggle_design(&self, q0: ArrayView1<'_, f64>) -> Result<Array2<f64>, String> {
        let value_options = BasisOptions::value();
        self.wiggle_basiswith_options(q0, value_options)
    }

    /// Per-row `dq/dq0 = 1 + B'(q0) · beta_link_wiggle`.
    ///
    /// # Errors
    /// Returns a `DimensionMismatch` message when the first-derivative basis
    /// has a different number of columns than `beta_link_wiggle` has entries,
    /// and propagates any basis-evaluation error.
    fn wiggle_dq_dq0(
        &self,
        q0: ArrayView1<'_, f64>,
        beta_link_wiggle: ArrayView1<'_, f64>,
    ) -> Result<Array1<f64>, String> {
        let slope_basis = self.wiggle_basiswith_options(q0, BasisOptions::first_derivative())?;
        let (basis_cols, coef_len) = (slope_basis.ncols(), beta_link_wiggle.len());
        if basis_cols == coef_len {
            // The identity part of q = q0 + wiggle(q0) contributes the +1.
            return Ok(slope_basis.dot(&beta_link_wiggle) + 1.0);
        }
        Err(GamlssError::DimensionMismatch {
            reason: format!(
                "wiggle derivative col mismatch: got {}, expected {}",
                basis_cols, coef_len
            ),
        }
        .into())
    }

    /// Per-row `d²q/dq0² = B''(q0) · beta_link_wiggle`.
    ///
    /// # Errors
    /// Returns a `DimensionMismatch` message when the second-derivative basis
    /// has a different number of columns than `beta_link_wiggle` has entries,
    /// and propagates any basis-evaluation error.
    fn wiggle_d2q_dq02(
        &self,
        q0: ArrayView1<'_, f64>,
        beta_link_wiggle: ArrayView1<'_, f64>,
    ) -> Result<Array1<f64>, String> {
        let curvature_basis =
            self.wiggle_basiswith_options(q0, BasisOptions::second_derivative())?;
        let (basis_cols, coef_len) = (curvature_basis.ncols(), beta_link_wiggle.len());
        if basis_cols == coef_len {
            return Ok(curvature_basis.dot(&beta_link_wiggle));
        }
        Err(GamlssError::DimensionMismatch {
            reason: format!(
                "wiggle second-derivative col mismatch: got {}, expected {}",
                basis_cols, coef_len
            ),
        }
        .into())
    }

    /// Per-row `d³q/dq0³ = B'''(q0) · beta_link_wiggle`.
    ///
    /// # Errors
    /// Returns a `DimensionMismatch` message when the third-derivative basis
    /// has a different number of columns than `beta_link_wiggle` has entries,
    /// and propagates any basis-evaluation error.
    fn wiggle_d3q_dq03(
        &self,
        q0: ArrayView1<'_, f64>,
        beta_link_wiggle: ArrayView1<'_, f64>,
    ) -> Result<Array1<f64>, String> {
        let third_basis = self.wiggle_d3basis_constrained(q0)?;
        let (basis_cols, coef_len) = (third_basis.ncols(), beta_link_wiggle.len());
        if basis_cols == coef_len {
            return Ok(third_basis.dot(&beta_link_wiggle));
        }
        Err(GamlssError::DimensionMismatch {
            reason: format!(
                "wiggle third-derivative col mismatch: got {}, expected {}",
                basis_cols, coef_len
            ),
        }
        .into())
    }

    /// Monotone wiggle basis at `q0` evaluated with derivative order 3.
    fn wiggle_d3basis_constrained(&self, q0: ArrayView1<'_, f64>) -> Result<Array2<f64>, String> {
        let order = 3;
        monotone_wiggle_basis_with_derivative_order(
            q0,
            &self.wiggle_knots,
            self.wiggle_degree,
            order,
        )
    }

    /// Per-row `d⁴q/dq0⁴`: the order-4 wiggle basis at `q0` contracted with
    /// `beta_link_wiggle`.
    ///
    /// # Errors
    /// Returns a `DimensionMismatch` message when the fourth-derivative basis
    /// has a different number of columns than `beta_link_wiggle` has entries,
    /// and propagates any basis-evaluation error.
    fn wiggle_d4q_dq04(
        &self,
        q0: ArrayView1<'_, f64>,
        beta_link_wiggle: ArrayView1<'_, f64>,
    ) -> Result<Array1<f64>, String> {
        let order = 4;
        let fourth_basis = monotone_wiggle_basis_with_derivative_order(
            q0,
            &self.wiggle_knots,
            self.wiggle_degree,
            order,
        )?;
        let (basis_cols, coef_len) = (fourth_basis.ncols(), beta_link_wiggle.len());
        if basis_cols == coef_len {
            return Ok(fourth_basis.dot(&beta_link_wiggle));
        }
        Err(GamlssError::DimensionMismatch {
            reason: format!(
                "wiggle fourth-derivative col mismatch: got {}, expected {}",
                basis_cols, coef_len
            ),
        }
        .into())
    }

    /// Dense (threshold, log-sigma) designs resolved from the designs stored
    /// on the family, via `dense_locscale_block_designs_cached` under this
    /// family's material policy.
    fn dense_block_designs(&self) -> Result<(Cow<'_, Array2<f64>>, Cow<'_, Array2<f64>>), String> {
        let material = self.policy.material_policy();
        let threshold = self.threshold_design.as_ref();
        let log_sigma = self.log_sigma_design.as_ref();
        dense_locscale_block_designs_cached(
            threshold,
            log_sigma,
            "BinomialLocationScaleWiggleFamily",
            "BinomialLocationScaleWiggle",
            "threshold",
            &material,
        )
    }

    /// Dense (threshold, log-sigma) designs resolved from block `specs`, via
    /// `dense_locscale_block_designs_fromspecs` with a block count of 3 and
    /// this family's block indices and material policy.
    fn dense_block_designs_fromspecs<'a>(
        &self,
        specs: &'a [ParameterBlockSpec],
    ) -> Result<(Cow<'a, Array2<f64>>, Cow<'a, Array2<f64>>), String> {
        let material = self.policy.material_policy();
        let block_count = 3;
        dense_locscale_block_designs_fromspecs(
            specs,
            block_count,
            "BinomialLocationScaleWiggleFamily",
            "BinomialLocationScaleWiggle",
            Self::BLOCK_T,
            Self::BLOCK_LOG_SIGMA,
            "threshold",
            &material,
        )
    }

    /// Resolve the dense (threshold, log-sigma) designs for exact-joint work.
    ///
    /// Preference order: the family's own designs when both are present, then
    /// the designs carried by `specs` when supplied, otherwise `Ok(None)`.
    fn exact_joint_dense_block_designs<'a>(
        &'a self,
        specs: Option<&'a [ParameterBlockSpec]>,
    ) -> Result<Option<(Cow<'a, Array2<f64>>, Cow<'a, Array2<f64>>)>, String> {
        let own_designs_present =
            self.threshold_design.is_some() && self.log_sigma_design.is_some();
        let resolved = if own_designs_present {
            Some(self.dense_block_designs()?)
        } else if let Some(block_specs) = specs {
            Some(self.dense_block_designs_fromspecs(block_specs)?)
        } else {
            None
        };
        Ok(resolved)
    }

    fn shadow_with_exact_joint_designs(
        &self,
        specs: &[ParameterBlockSpec],
    ) -> Result<Option<Self>, String> {
        let Some((x_t, x_ls)) = self.exact_joint_dense_block_designs(Some(specs))? else {
            return Ok(None);
        };
        Ok(Some(Self {
            y: self.y.clone(),
            weights: self.weights.clone(),
            link_kind: self.link_kind.clone(),
            threshold_design: Some(DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(
                x_t.into_owned(),
            ))),
            log_sigma_design: Some(DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(
                x_ls.into_owned(),
            ))),
            wiggle_knots: self.wiggle_knots.clone(),
            wiggle_degree: self.wiggle_degree,
            policy: self.policy.clone(),
        }))
    }

    /// First-order joint psi terms, with the dense designs resolved from the
    /// family or from `specs`.
    ///
    /// Returns `Ok(None)` when no designs can be resolved; otherwise forwards
    /// to `exact_newton_joint_psi_terms_from_designs`.
    fn exact_newton_joint_psi_terms_for_specs(
        &self,
        block_states: &[ParameterBlockState],
        specs: &[ParameterBlockSpec],
        derivative_blocks: &[Vec<crate::custom_family::CustomFamilyBlockPsiDerivative>],
        psi_index: usize,
    ) -> Result<Option<crate::custom_family::ExactNewtonJointPsiTerms>, String> {
        match self.exact_joint_dense_block_designs(Some(specs))? {
            None => Ok(None),
            Some((threshold_x, log_sigma_x)) => self.exact_newton_joint_psi_terms_from_designs(
                block_states,
                derivative_blocks,
                psi_index,
                &threshold_x,
                &log_sigma_x,
            ),
        }
    }

    /// Second-order joint psi terms for the pair (`psi_i`, `psi_j`), with the
    /// dense designs resolved from the family or from `specs`.
    ///
    /// Returns `Ok(None)` when no designs can be resolved; otherwise forwards
    /// to `exact_newton_joint_psisecond_order_terms_from_designs`.
    fn exact_newton_joint_psisecond_order_terms_for_specs(
        &self,
        block_states: &[ParameterBlockState],
        specs: &[ParameterBlockSpec],
        derivative_blocks: &[Vec<crate::custom_family::CustomFamilyBlockPsiDerivative>],
        psi_i: usize,
        psi_j: usize,
    ) -> Result<Option<crate::custom_family::ExactNewtonJointPsiSecondOrderTerms>, String> {
        match self.exact_joint_dense_block_designs(Some(specs))? {
            None => Ok(None),
            Some((threshold_x, log_sigma_x)) => self
                .exact_newton_joint_psisecond_order_terms_from_designs(
                    block_states,
                    derivative_blocks,
                    psi_i,
                    psi_j,
                    &threshold_x,
                    &log_sigma_x,
                ),
        }
    }

    /// Directional derivative of the psi Hessian along `d_beta_flat`, with the
    /// dense designs resolved from the family or from `specs`.
    ///
    /// Returns `Ok(None)` when no designs can be resolved; otherwise forwards
    /// to `exact_newton_joint_psihessian_directional_derivative_from_designs`.
    fn exact_newton_joint_psihessian_directional_derivative_for_specs(
        &self,
        block_states: &[ParameterBlockState],
        specs: &[ParameterBlockSpec],
        derivative_blocks: &[Vec<crate::custom_family::CustomFamilyBlockPsiDerivative>],
        psi_index: usize,
        d_beta_flat: &Array1<f64>,
    ) -> Result<Option<Array2<f64>>, String> {
        match self.exact_joint_dense_block_designs(Some(specs))? {
            None => Ok(None),
            Some((threshold_x, log_sigma_x)) => self
                .exact_newton_joint_psihessian_directional_derivative_from_designs(
                    block_states,
                    derivative_blocks,
                    psi_index,
                    d_beta_flat,
                    &threshold_x,
                    &log_sigma_x,
                ),
        }
    }

    /// Resolve hyperparameter `psi_index` into a
    /// `LocationScaleJointPsiDirection` for this three-block family.
    ///
    /// Delegates to `locscale_joint_psi_direction_parts` and repackages its
    /// output field by field; `Ok(None)` from the helper is passed through.
    fn exact_newton_joint_psi_direction(
        &self,
        block_states: &[ParameterBlockState],
        derivative_blocks: &[Vec<crate::custom_family::CustomFamilyBlockPsiDerivative>],
        psi_index: usize,
        x_t: &Array2<f64>,
        x_ls: &Array2<f64>,
        policy: &crate::resource::ResourcePolicy,
    ) -> Result<Option<LocationScaleJointPsiDirection>, String> {
        let n_rows = self.y.len();
        let resolved = locscale_joint_psi_direction_parts(
            block_states,
            derivative_blocks,
            psi_index,
            n_rows,
            x_t.ncols(),
            x_ls.ncols(),
            Self::BLOCK_T,
            Self::BLOCK_LOG_SIGMA,
            3,
            "BinomialLocationScaleWiggleFamily",
            "threshold",
            policy,
        )?;
        Ok(resolved.map(|parts| LocationScaleJointPsiDirection {
            block_idx: parts.block_idx,
            local_idx: parts.local_idx,
            z_primary_psi: parts.primary_z,
            z_ls_psi: parts.log_sigma_z,
            x_primary_psi: parts.primary_psi,
            x_ls_psi: parts.log_sigma_psi,
        }))
    }

    /// Second-order design drifts for the direction pair (`psi_a`, `psi_b`).
    ///
    /// Fills a `LocScalePsiDriftConfig` with this family's row count, design
    /// widths, block indices, labels and policy, then delegates to
    /// `locscale_joint_psisecond_design_drifts`.
    fn exact_newton_joint_psisecond_design_drifts(
        &self,
        block_states: &[ParameterBlockState],
        derivative_blocks: &[Vec<crate::custom_family::CustomFamilyBlockPsiDerivative>],
        psi_a: &LocationScaleJointPsiDirection,
        psi_b: &LocationScaleJointPsiDirection,
        x_t: &Array2<f64>,
        x_ls: &Array2<f64>,
    ) -> Result<LocationScaleJointPsiSecondDrifts, String> {
        let config = LocScalePsiDriftConfig {
            n: self.y.len(),
            p_primary: x_t.ncols(),
            p_log_sigma: x_ls.ncols(),
            primary_block_idx: Self::BLOCK_T,
            log_sigma_block_idx: Self::BLOCK_LOG_SIGMA,
            family_name: "BinomialLocationScaleWiggleFamily",
            primary_label: "threshold",
            policy: &self.policy,
        };
        locscale_joint_psisecond_design_drifts(
            block_states,
            derivative_blocks,
            psi_a,
            psi_b,
            config,
        )
    }

    /// Likelihood-only first-order psi terms (objective, score and Hessian
    /// derivative) for hyperparameter `psi_index`, given dense threshold and
    /// log-sigma designs.
    ///
    /// # Parameters
    /// - `block_states`: per-block state, indexed by `BLOCK_T`,
    ///   `BLOCK_LOG_SIGMA` and `BLOCK_WIGGLE`; their `eta` (and the wiggle
    ///   block's `beta`) are read.
    /// - `derivative_blocks`: per-block psi derivative descriptors, forwarded
    ///   to the direction resolver.
    /// - `psi_index`: index of the hyperparameter direction.
    /// - `x_t`, `x_ls`: dense threshold and log-sigma designs.
    ///
    /// # Returns
    /// - `Ok(None)` when the psi direction cannot be resolved.
    /// - Otherwise `Ok(Some(terms))`. When either design exposes a first
    ///   psi action, the Hessian derivative is returned as an operator
    ///   (`hessian_psi_operator`) and `hessian_psi` is an empty `0 x 0`
    ///   matrix; otherwise a dense symmetric `hessian_psi` is assembled and
    ///   the operator is `None`.
    ///
    /// # Errors
    /// Propagates errors from the direction resolver, the likelihood core,
    /// the wiggle basis evaluators and the dense cross-product helpers.
    fn exact_newton_joint_psi_terms_from_designs(
        &self,
        block_states: &[ParameterBlockState],
        derivative_blocks: &[Vec<crate::custom_family::CustomFamilyBlockPsiDerivative>],
        psi_index: usize,
        x_t: &Array2<f64>,
        x_ls: &Array2<f64>,
    ) -> Result<Option<crate::custom_family::ExactNewtonJointPsiTerms>, String> {
        // Resolve the realized hyperdirection exactly once, up front: an
        // unresolvable direction short-circuits before any likelihood work,
        // and the resolved value is reused for the row kernel below.
        let Some(dir_a) = self.exact_newton_joint_psi_direction(
            block_states,
            derivative_blocks,
            psi_index,
            x_t,
            x_ls,
            &self.policy,
        )?
        else {
            return Ok(None);
        };
        let n = self.y.len();
        let eta_t = &block_states[Self::BLOCK_T].eta;
        let eta_ls = &block_states[Self::BLOCK_LOG_SIGMA].eta;
        let etaw = &block_states[Self::BLOCK_WIGGLE].eta;
        let betaw = &block_states[Self::BLOCK_WIGGLE].beta;
        // `core` includes the wiggle contribution; `base_core` is evaluated
        // without it and supplies the un-wiggled q0 at which the basis lives.
        let core = binomial_location_scale_core(
            &self.y,
            &self.weights,
            eta_t,
            eta_ls,
            Some(etaw),
            &self.link_kind,
        )?;
        let base_core = binomial_location_scale_core(
            &self.y,
            &self.weights,
            eta_t,
            eta_ls,
            None,
            &self.link_kind,
        )?;
        let b0 = self.wiggle_design(base_core.q0.view())?;
        let d0 =
            self.wiggle_basiswith_options(base_core.q0.view(), BasisOptions::first_derivative())?;
        let dd0 =
            self.wiggle_basiswith_options(base_core.q0.view(), BasisOptions::second_derivative())?;
        let g3 = self.wiggle_d3q_dq03(base_core.q0.view(), betaw.view())?;
        let m = d0.dot(betaw) + 1.0;
        let g2 = self.wiggle_d2q_dq02(base_core.q0.view(), betaw.view())?;
        let (sigma, ..) = exp_sigma_derivs_up_to_third(eta_ls.view());

        let pt = x_t.ncols();
        let pls = x_ls.ncols();
        let pw = b0.ncols();
        let total = pt + pls + pw;
        let (z_t_psi, z_ls_psi) = (&dir_a.z_primary_psi, &dir_a.z_ls_psi);
        let mut objective_psi = 0.0;

        let mut score_t_xa = Array1::<f64>::zeros(n);
        let mut score_t_x = Array1::<f64>::zeros(n);
        let mut score_ls_xa = Array1::<f64>::zeros(n);
        let mut score_ls_x = Array1::<f64>::zeros(n);
        let mut score_w_b = Array1::<f64>::zeros(n);
        let mut score_w_d1 = Array1::<f64>::zeros(n);

        let mut coeff_tt_w = Array1::<f64>::zeros(n);
        let mut coeff_tt_d = Array1::<f64>::zeros(n);
        let mut coeff_tl_w = Array1::<f64>::zeros(n);
        let mut coeff_tl_d = Array1::<f64>::zeros(n);
        let mut coeff_ll_w = Array1::<f64>::zeros(n);
        let mut coeff_ll_d = Array1::<f64>::zeros(n);
        let mut coeff_tw_b_w = Array1::<f64>::zeros(n);
        let mut coeff_tw_b_d = Array1::<f64>::zeros(n);
        let mut coeff_tw_d1_w = Array1::<f64>::zeros(n);
        let mut coeff_tw_d1_d = Array1::<f64>::zeros(n);
        let mut coeff_tw_d2_d = Array1::<f64>::zeros(n);
        let mut coeff_lw_b_w = Array1::<f64>::zeros(n);
        let mut coeff_lw_b_d = Array1::<f64>::zeros(n);
        let mut coeff_lw_d1_w = Array1::<f64>::zeros(n);
        let mut coeff_lw_d1_d = Array1::<f64>::zeros(n);
        let mut coeff_lw_d2_d = Array1::<f64>::zeros(n);
        let mut coeff_ww_bb = Array1::<f64>::zeros(n);
        let mut coeff_ww_db = Array1::<f64>::zeros(n);

        // Exact likelihood-only joint psi terms for the binomial
        // location-scale wiggle family (the inverse link is dispatched
        // through `self.link_kind`).
        //
        // This helper is intentionally the same generic rowwise kernel as the
        // non-wiggle family. The only difference is the location-side row:
        //
        //   gamma = [beta_t; betaw],
        //   delta = beta_ls,
        //   z_r   = [x_{t,r}; B_r(q0)],
        //   x_r   = x_{ls,r},
        //   a_r   = z_r^T gamma,
        //   ell_r = x_r^T delta,
        //   q_r   = -a_r * exp(-ell_r).
        //
        // In this wiggle family we realize the same kernel through the chain
        //
        //   q = q0 + betaw^T B(q0),
        //   q0 = -eta_t * exp(-eta_ls),
        //   m  = dq/dq0   = 1 + betaw^T B'(q0),
        //   g2 = d²q/dq0² = betaw^T B''(q0),
        //   g3 = d³q/dq0³ = betaw^T B'''(q0).
        //
        // For a realized hyperdirection psi_a:
        //
        //   h_a     = q_{psi_a},
        //   c_a     = q_{beta psi_a},
        //   R_a     = q_{beta beta psi_a},
        //
        // and the generic scalar-loss identities are
        //
        //   D_a            = sum_r r_r h_{r,a},
        //   D_{beta a}     = sum_r [ w_r h_{r,a} b_r + r_r c_{r,a} ],
        //   D_{beta beta a}
        //                  = sum_r [ nu_r h_{r,a} b_r b_r^T
        //                              + w_r(c_{r,a} b_r^T + b_r c_{r,a}^T + h_{r,a} Q_r)
        //                              + r_r R_{r,a} ].
        //
        // Generic exact-joint code adds all realized penalty motion S_a after
        // the fact, so this family hook must stay likelihood-only.
        //
        // The rowwise objects below are the wiggle specialization of the same
        // q_r = -a_r exp(-ell_r) kernel. All wiggle-specific complexity is
        // localized to the realized row B_r(q0) and its q0-derivatives.
        for row in 0..n {
            let q0 = base_core.q0[row];
            let q = q0 + etaw[row];
            let q0_geom = nonwiggle_q_derivs(eta_t[row], sigma[row]);
            let r_sigma = 1.0 / sigma[row];
            // psi-derivatives of q0 and of its first/second eta-derivatives.
            let q0_a = -r_sigma * z_t_psi[row] - q0 * z_ls_psi[row];
            let q0_t_a = q0_geom.q_tl * z_ls_psi[row];
            let q0_ls_a = q0_geom.q_tl * z_t_psi[row] + q0_geom.q_ll * z_ls_psi[row];
            let q0_tl_a = q0_geom.q_tl_ls * z_ls_psi[row];
            let q0_ll_a = q0_geom.q_tl_ls * z_t_psi[row] + q0_geom.q_ll_ls * z_ls_psi[row];

            // Chain rule through q = q0 + wiggle(q0).
            let q_t = m[row] * q0_geom.q_t;
            let q_ls = m[row] * q0_geom.q_ls;
            let q_tt = g2[row] * q0_geom.q_t * q0_geom.q_t;
            let q_tl = g2[row] * q0_geom.q_t * q0_geom.q_ls + m[row] * q0_geom.q_tl;
            let q_ll = g2[row] * q0_geom.q_ls * q0_geom.q_ls + m[row] * q0_geom.q_ll;
            let q_t_a = g2[row] * q0_a * q0_geom.q_t + m[row] * q0_t_a;
            let q_ls_a = g2[row] * q0_a * q0_geom.q_ls + m[row] * q0_ls_a;
            let q_tt_a =
                g3[row] * q0_a * q0_geom.q_t * q0_geom.q_t + g2[row] * (2.0 * q0_geom.q_t * q0_t_a);
            let q_tl_a = g3[row] * q0_a * q0_geom.q_t * q0_geom.q_ls
                + g2[row] * (q0_t_a * q0_geom.q_ls + q0_geom.q_t * q0_ls_a + q0_a * q0_geom.q_tl)
                + m[row] * q0_tl_a;
            let q_ll_a = g3[row] * q0_a * q0_geom.q_ls * q0_geom.q_ls
                + g2[row] * (2.0 * q0_geom.q_ls * q0_ls_a + q0_a * q0_geom.q_ll)
                + m[row] * q0_ll_a;

            let (loss_1, loss_2, loss_3) = binomial_neglog_q_derivatives_dispatch(
                self.y[row],
                self.weights[row],
                q,
                core.mu[row],
                core.dmu_dq[row],
                core.d2mu_dq2[row],
                core.d3mu_dq3[row],
                &self.link_kind,
            );
            // alpha = dq/dpsi_a at fixed coefficients.
            let alpha = m[row] * q0_a;
            objective_psi += loss_1 * alpha;

            score_t_xa[row] = loss_1 * q_t;
            score_t_x[row] = loss_2 * alpha * q_t + loss_1 * q_t_a;
            score_ls_xa[row] = loss_1 * q_ls;
            score_ls_x[row] = loss_2 * alpha * q_ls + loss_1 * q_ls_a;
            score_w_b[row] = loss_2 * alpha;
            score_w_d1[row] = loss_1 * q0_a;

            coeff_tt_w[row] = loss_2 * q_t * q_t + loss_1 * q_tt;
            coeff_tt_d[row] = loss_3 * alpha * q_t * q_t
                + 2.0 * loss_2 * q_t * q_t_a
                + loss_2 * alpha * q_tt
                + loss_1 * q_tt_a;
            coeff_tl_w[row] = loss_2 * q_t * q_ls + loss_1 * q_tl;
            coeff_tl_d[row] = loss_3 * alpha * q_t * q_ls
                + loss_2 * (q_t_a * q_ls + q_t * q_ls_a)
                + loss_2 * alpha * q_tl
                + loss_1 * q_tl_a;
            coeff_ll_w[row] = loss_2 * q_ls * q_ls + loss_1 * q_ll;
            coeff_ll_d[row] = loss_3 * alpha * q_ls * q_ls
                + 2.0 * loss_2 * q_ls * q_ls_a
                + loss_2 * alpha * q_ll
                + loss_1 * q_ll_a;

            coeff_tw_b_w[row] = loss_2 * q_t;
            coeff_tw_b_d[row] = loss_3 * alpha * q_t + loss_2 * q_t_a;
            coeff_tw_d1_w[row] = loss_1 * q0_geom.q_t;
            coeff_tw_d1_d[row] = loss_2 * (q_t * q0_a + alpha * q0_geom.q_t) + loss_1 * q0_t_a;
            coeff_tw_d2_d[row] = loss_1 * q0_a * q0_geom.q_t;

            coeff_lw_b_w[row] = loss_2 * q_ls;
            coeff_lw_b_d[row] = loss_3 * alpha * q_ls + loss_2 * q_ls_a;
            coeff_lw_d1_w[row] = loss_1 * q0_geom.q_ls;
            coeff_lw_d1_d[row] = loss_2 * (q_ls * q0_a + alpha * q0_geom.q_ls) + loss_1 * q0_ls_a;
            coeff_lw_d2_d[row] = loss_1 * q0_a * q0_geom.q_ls;

            coeff_ww_bb[row] = loss_3 * alpha;
            coeff_ww_db[row] = loss_2 * q0_a;
        }
        let x_t_map = dir_a.x_primary_psi.as_linear_map_ref();
        let x_ls_map = dir_a.x_ls_psi.as_linear_map_ref();
        let score_t = x_t_map.transpose_mul(score_t_xa.view()) + fast_atv(x_t, &score_t_x);
        let score_ls = x_ls_map.transpose_mul(score_ls_xa.view()) + fast_atv(x_ls, &score_ls_x);
        let score_w = fast_atv(&b0, &score_w_b) + fast_atv(&d0, &score_w_d1);
        let mut score_psi = Array1::<f64>::zeros(total);
        score_psi.slice_mut(s![0..pt]).assign(&score_t);
        score_psi.slice_mut(s![pt..pt + pls]).assign(&score_ls);
        score_psi.slice_mut(s![pt + pls..total]).assign(&score_w);

        let x_t_action_opt = dir_a.x_primary_psi.cloned_first_action();
        let x_ls_action_opt = dir_a.x_ls_psi.cloned_first_action();
        if x_t_action_opt.is_some() || x_ls_action_opt.is_some() {
            // Operator form. This branch always returns, so the basis
            // matrices and coefficient vectors are moved into the operator
            // on their last use instead of being deep-copied.
            let basis_arc = Arc::new(b0);
            let basis_d1_arc = Arc::new(d0);
            let basis_d2_arc = Arc::new(dd0);
            let zeros = Array1::<f64>::zeros(n);
            // Channels: 0 = threshold design, 1 = log-sigma design,
            // 2 = wiggle basis B, 3 = B', 4 = B''.
            let operator = CustomFamilyJointPsiOperator::new(
                total,
                vec![
                    CustomFamilyJointDesignChannel::new(
                        0..pt,
                        shared_dense_arc(x_t),
                        x_t_action_opt,
                    ),
                    CustomFamilyJointDesignChannel::new(
                        pt..pt + pls,
                        shared_dense_arc(x_ls),
                        x_ls_action_opt,
                    ),
                    CustomFamilyJointDesignChannel::new(
                        pt + pls..total,
                        Arc::clone(&basis_arc),
                        None,
                    ),
                    CustomFamilyJointDesignChannel::new(
                        pt + pls..total,
                        Arc::clone(&basis_d1_arc),
                        None,
                    ),
                    CustomFamilyJointDesignChannel::new(
                        pt + pls..total,
                        Arc::clone(&basis_d2_arc),
                        None,
                    ),
                ],
                vec![
                    CustomFamilyJointDesignPairContribution::new(0, 0, coeff_tt_w, coeff_tt_d),
                    CustomFamilyJointDesignPairContribution::new(
                        0,
                        1,
                        coeff_tl_w.clone(),
                        coeff_tl_d.clone(),
                    ),
                    CustomFamilyJointDesignPairContribution::new(1, 0, coeff_tl_w, coeff_tl_d),
                    CustomFamilyJointDesignPairContribution::new(1, 1, coeff_ll_w, coeff_ll_d),
                    CustomFamilyJointDesignPairContribution::new(
                        0,
                        2,
                        coeff_tw_b_w.clone(),
                        coeff_tw_b_d.clone(),
                    ),
                    CustomFamilyJointDesignPairContribution::new(2, 0, coeff_tw_b_w, coeff_tw_b_d),
                    CustomFamilyJointDesignPairContribution::new(
                        0,
                        3,
                        coeff_tw_d1_w.clone(),
                        coeff_tw_d1_d.clone(),
                    ),
                    CustomFamilyJointDesignPairContribution::new(
                        3,
                        0,
                        coeff_tw_d1_w,
                        coeff_tw_d1_d,
                    ),
                    CustomFamilyJointDesignPairContribution::new(
                        0,
                        4,
                        zeros.clone(),
                        coeff_tw_d2_d.clone(),
                    ),
                    CustomFamilyJointDesignPairContribution::new(
                        4,
                        0,
                        zeros.clone(),
                        coeff_tw_d2_d,
                    ),
                    CustomFamilyJointDesignPairContribution::new(
                        1,
                        2,
                        coeff_lw_b_w.clone(),
                        coeff_lw_b_d.clone(),
                    ),
                    CustomFamilyJointDesignPairContribution::new(2, 1, coeff_lw_b_w, coeff_lw_b_d),
                    CustomFamilyJointDesignPairContribution::new(
                        1,
                        3,
                        coeff_lw_d1_w.clone(),
                        coeff_lw_d1_d.clone(),
                    ),
                    CustomFamilyJointDesignPairContribution::new(
                        3,
                        1,
                        coeff_lw_d1_w,
                        coeff_lw_d1_d,
                    ),
                    CustomFamilyJointDesignPairContribution::new(
                        1,
                        4,
                        zeros.clone(),
                        coeff_lw_d2_d.clone(),
                    ),
                    CustomFamilyJointDesignPairContribution::new(
                        4,
                        1,
                        zeros.clone(),
                        coeff_lw_d2_d,
                    ),
                    CustomFamilyJointDesignPairContribution::new(
                        2,
                        2,
                        zeros.clone(),
                        coeff_ww_bb,
                    ),
                    CustomFamilyJointDesignPairContribution::new(
                        3,
                        2,
                        zeros.clone(),
                        coeff_ww_db.clone(),
                    ),
                    CustomFamilyJointDesignPairContribution::new(2, 3, zeros, coeff_ww_db),
                ],
            );
            return Ok(Some(crate::custom_family::ExactNewtonJointPsiTerms {
                objective_psi,
                score_psi,
                hessian_psi: Array2::zeros((0, 0)),
                hessian_psi_operator: Some(std::sync::Arc::new(operator)),
            }));
        }
        // Dense form: assemble the upper-triangular blocks explicitly, then
        // mirror them into the lower triangle.
        let h_tt_block = weighted_crossprod_psi_maps(
            x_t_map,
            coeff_tt_w.view(),
            CustomFamilyPsiLinearMapRef::Dense(x_t),
        )? + &weighted_crossprod_psi_maps(
            CustomFamilyPsiLinearMapRef::Dense(x_t),
            coeff_tt_w.view(),
            x_t_map,
        )? + &xt_diag_x_dense(x_t, &coeff_tt_d)?;
        let h_tl_block = weighted_crossprod_psi_maps(
            x_t_map,
            coeff_tl_w.view(),
            CustomFamilyPsiLinearMapRef::Dense(x_ls),
        )? + &weighted_crossprod_psi_maps(
            CustomFamilyPsiLinearMapRef::Dense(x_t),
            coeff_tl_w.view(),
            x_ls_map,
        )? + &xt_diag_y_dense(x_t, &coeff_tl_d, x_ls)?;
        let h_ll_block = weighted_crossprod_psi_maps(
            x_ls_map,
            coeff_ll_w.view(),
            CustomFamilyPsiLinearMapRef::Dense(x_ls),
        )? + &weighted_crossprod_psi_maps(
            CustomFamilyPsiLinearMapRef::Dense(x_ls),
            coeff_ll_w.view(),
            x_ls_map,
        )? + &xt_diag_x_dense(x_ls, &coeff_ll_d)?;
        let h_tw = weighted_crossprod_psi_maps(
            x_t_map,
            coeff_tw_b_w.view(),
            CustomFamilyPsiLinearMapRef::Dense(&b0),
        )? + &xt_diag_y_dense(x_t, &coeff_tw_b_d, &b0)?
            + &weighted_crossprod_psi_maps(
                x_t_map,
                coeff_tw_d1_w.view(),
                CustomFamilyPsiLinearMapRef::Dense(&d0),
            )?
            + &xt_diag_y_dense(x_t, &coeff_tw_d1_d, &d0)?
            + &xt_diag_y_dense(x_t, &coeff_tw_d2_d, &dd0)?;
        let h_lw = weighted_crossprod_psi_maps(
            x_ls_map,
            coeff_lw_b_w.view(),
            CustomFamilyPsiLinearMapRef::Dense(&b0),
        )? + &xt_diag_y_dense(x_ls, &coeff_lw_b_d, &b0)?
            + &weighted_crossprod_psi_maps(
                x_ls_map,
                coeff_lw_d1_w.view(),
                CustomFamilyPsiLinearMapRef::Dense(&d0),
            )?
            + &xt_diag_y_dense(x_ls, &coeff_lw_d1_d, &d0)?
            + &xt_diag_y_dense(x_ls, &coeff_lw_d2_d, &dd0)?;
        let a_ww = xt_diag_y_dense(&d0, &coeff_ww_db, &b0)?;
        let h_ww = xt_diag_x_dense(&b0, &coeff_ww_bb)? + &a_ww + a_ww.t();

        let mut hessian_psi = Array2::<f64>::zeros((total, total));
        hessian_psi.slice_mut(s![0..pt, 0..pt]).assign(&h_tt_block);
        hessian_psi
            .slice_mut(s![0..pt, pt..pt + pls])
            .assign(&h_tl_block);
        hessian_psi
            .slice_mut(s![pt..pt + pls, pt..pt + pls])
            .assign(&h_ll_block);
        hessian_psi
            .slice_mut(s![0..pt, pt + pls..total])
            .assign(&h_tw);
        hessian_psi
            .slice_mut(s![pt..pt + pls, pt + pls..total])
            .assign(&h_lw);
        hessian_psi
            .slice_mut(s![pt + pls..total, pt + pls..total])
            .assign(&h_ww);
        mirror_upper_to_lower(&mut hessian_psi);

        Ok(Some(crate::custom_family::ExactNewtonJointPsiTerms {
            objective_psi,
            score_psi,
            hessian_psi,
            hessian_psi_operator: None,
        }))
    }

    /// Second-order joint psi terms for the pair (`psi_i`, `psi_j`) given
    /// dense threshold and log-sigma designs.
    ///
    /// Validates that exactly three block states and three derivative block
    /// lists were supplied, resolves both psi directions in order (`psi_i`
    /// first), and forwards to
    /// `exact_newton_joint_psisecond_order_terms_from_parts`.
    ///
    /// Returns `Ok(None)` as soon as either direction cannot be resolved.
    ///
    /// # Errors
    /// A `DimensionMismatch` message on a wrong block count, plus anything
    /// propagated from the direction resolver or the parts kernel.
    fn exact_newton_joint_psisecond_order_terms_from_designs(
        &self,
        block_states: &[ParameterBlockState],
        derivative_blocks: &[Vec<crate::custom_family::CustomFamilyBlockPsiDerivative>],
        psi_i: usize,
        psi_j: usize,
        x_t: &Array2<f64>,
        x_ls: &Array2<f64>,
    ) -> Result<Option<crate::custom_family::ExactNewtonJointPsiSecondOrderTerms>, String> {
        let (n_states, n_derivative_lists) = (block_states.len(), derivative_blocks.len());
        if !(n_states == 3 && n_derivative_lists == 3) {
            return Err(GamlssError::DimensionMismatch {
                reason: format!(
                    "BinomialLocationScaleWiggleFamily joint psi second-order terms expect 3 blocks and 3 derivative block lists, got {} and {}",
                    n_states, n_derivative_lists
                ),
            }
            .into());
        }
        let resolve_direction = |psi: usize| {
            self.exact_newton_joint_psi_direction(
                block_states,
                derivative_blocks,
                psi,
                x_t,
                x_ls,
                &self.policy,
            )
        };
        let first = match resolve_direction(psi_i)? {
            Some(direction) => direction,
            None => return Ok(None),
        };
        let second = match resolve_direction(psi_j)? {
            Some(direction) => direction,
            None => return Ok(None),
        };
        let terms = self.exact_newton_joint_psisecond_order_terms_from_parts(
            block_states,
            derivative_blocks,
            &first,
            &second,
            x_t,
            x_ls,
        )?;
        Ok(Some(terms))
    }

    /// Assembles the likelihood-side second-order psi/psi terms (objective
    /// scalar, score vector, and dense Hessian matrix) for the wiggle family
    /// from two already-resolved psi directions `dir_a` and `dir_b`.
    ///
    /// All outputs are laid out over the flattened coefficient vector
    /// `[beta_t; beta_ls; betaw]` of length `x_t.ncols() + x_ls.ncols() + pw`,
    /// where `pw` is the column count of the wiggle design evaluated at the
    /// wiggle-free predictor `q0`. The returned `hessian_psi_psi_operator` is
    /// always `None`; the Hessian term is returned densely.
    ///
    /// `block_states` is indexed with `Self::BLOCK_T`, `Self::BLOCK_LOG_SIGMA`
    /// and `Self::BLOCK_WIGGLE` without a length check here.
    ///
    /// # Errors
    /// Forwards failures from the second-order design drifts, both core
    /// evaluations, the wiggle basis builders, the per-row psi linear maps and
    /// the fourth-derivative dispatch. Returns a dimension-mismatch error when
    /// any wiggle basis column count differs from `betaw.len()`.
    fn exact_newton_joint_psisecond_order_terms_from_parts(
        &self,
        block_states: &[ParameterBlockState],
        derivative_blocks: &[Vec<crate::custom_family::CustomFamilyBlockPsiDerivative>],
        dir_a: &LocationScaleJointPsiDirection,
        dir_b: &LocationScaleJointPsiDirection,
        x_t: &Array2<f64>,
        x_ls: &Array2<f64>,
    ) -> Result<crate::custom_family::ExactNewtonJointPsiSecondOrderTerms, String> {
        // Second-order (psi_a, psi_b) drifts: supplies z_primary_ab, z_ls_ab and
        // the optional x_primary_ab / x_ls_ab design pieces consumed below.
        let second_drifts = self.exact_newton_joint_psisecond_design_drifts(
            block_states,
            derivative_blocks,
            dir_a,
            dir_b,
            x_t,
            x_ls,
        )?;
        let n = self.y.len();
        let eta_t = &block_states[Self::BLOCK_T].eta;
        let eta_ls = &block_states[Self::BLOCK_LOG_SIGMA].eta;
        let etaw = &block_states[Self::BLOCK_WIGGLE].eta;
        let betaw = &block_states[Self::BLOCK_WIGGLE].beta;
        // `core` is evaluated with the wiggle predictor and supplies mu and its
        // q-derivatives per row; `base_core` is evaluated without it and
        // supplies q0.
        let core = binomial_location_scale_core(
            &self.y,
            &self.weights,
            eta_t,
            eta_ls,
            Some(etaw),
            &self.link_kind,
        )?;
        let base_core = binomial_location_scale_core(
            &self.y,
            &self.weights,
            eta_t,
            eta_ls,
            None,
            &self.link_kind,
        )?;
        // Wiggle basis at q0 (B) and its derivative bases (B', B'', B''');
        // d3q / d4q take betaw and are used below as per-row scalars g3 / g4.
        let b0 = self.wiggle_design(base_core.q0.view())?;
        let d0 =
            self.wiggle_basiswith_options(base_core.q0.view(), BasisOptions::first_derivative())?;
        let dd0 =
            self.wiggle_basiswith_options(base_core.q0.view(), BasisOptions::second_derivative())?;
        let d3_basis = self.wiggle_d3basis_constrained(base_core.q0.view())?;
        let d3q = self.wiggle_d3q_dq03(base_core.q0.view(), betaw.view())?;
        let d4q = self.wiggle_d4q_dq04(base_core.q0.view(), betaw.view())?;
        // Every basis must have one column per wiggle coefficient before the
        // dot products and row slices below are formed.
        if b0.ncols() != betaw.len()
            || d0.ncols() != betaw.len()
            || dd0.ncols() != betaw.len()
            || d3_basis.ncols() != betaw.len()
        {
            return Err(GamlssError::DimensionMismatch { reason: format!(
                "wiggle derivative/beta mismatch in joint psi psi terms: B={} B'={} B''={} B'''={} betaw={}",
                b0.ncols(),
                d0.ncols(),
                dd0.ncols(),
                d3_basis.ncols(),
                betaw.len()
            ) }.into());
        }
        // Per-row scalars: m = 1 + B'(q0) . betaw and g2 = B''(q0) . betaw;
        // g3 and g4 are the next two orders as returned by the helpers above.
        let m = d0.dot(betaw) + 1.0;
        let g2 = dd0.dot(betaw);
        let g3 = d3q;
        let g4 = d4q;
        let (sigma, ds, d2s, d3s) = exp_sigma_derivs_up_to_third(eta_ls.view());

        // Block sizes of the flattened layout [beta_t; beta_ls; betaw].
        let pt = x_t.ncols();
        let pls = x_ls.ncols();
        let pw = b0.ncols();
        let total = pt + pls + pw;
        // Row-addressable views of the first-order psi design drifts for each
        // direction, and of the second-order (a, b) design drifts.
        let x_t_a_map = dir_a.x_primary_psi.as_linear_map_ref();
        let x_t_b_map = dir_b.x_primary_psi.as_linear_map_ref();
        let x_ls_a_map = dir_a.x_ls_psi.as_linear_map_ref();
        let x_ls_b_map = dir_b.x_ls_psi.as_linear_map_ref();
        let x_t_ab_map = second_psi_linear_map(
            second_drifts.x_primary_ab_action.as_ref(),
            second_drifts.x_primary_ab.as_ref(),
            n,
            pt,
        );
        let x_ls_ab_map = second_psi_linear_map(
            second_drifts.x_ls_ab_action.as_ref(),
            second_drifts.x_ls_ab.as_ref(),
            n,
            pls,
        );
        // Accumulators summed over rows.
        let mut objective_psi_psi = 0.0;
        let mut score_psi_psi = Array1::<f64>::zeros(total);
        let mut hessian_psi_psi = Array2::<f64>::zeros((total, total));

        // Likelihood-only exact psi/psi terms for the wiggle family.
        //
        // This is the same generic second-order kernel as the non-wiggle path,
        // still over the flattened coefficients beta = [beta_t; beta_ls; betaw].
        // The family provides only the likelihood-side fixed-beta objects
        //
        //   D_ab, D_{beta ab}, D_{beta beta ab},
        //
        // while generic exact-joint code in custom_family.rs adds all realized
        // penalty motion S_ab.
        //
        // Using the generic rowwise notation
        //
        //   h_a   = q_{psi_a},      h_b   = q_{psi_b},
        //   h_ab  = q_{psi_a psi_b},
        //   c_a   = q_{beta psi_a}, c_b   = q_{beta psi_b},
        //   c_ab  = q_{beta psi_a psi_b},
        //   R_a   = q_{beta beta psi_a},
        //   R_b   = q_{beta beta psi_b},
        //   R_ab  = q_{beta beta psi_a psi_b},
        //
        // the exact scalar-loss kernel is
        //
        //   D_ab
        //   = sum_r [ w_r h_{r,a} h_{r,b} + r_r h_{r,ab} ],
        //
        //   D_{beta ab}
        //   = sum_r [
        //       r_r c_{r,ab}
        //       + w_r h_{r,b} c_{r,a}
        //       + w_r h_{r,a} c_{r,b}
        //       + (w_r h_{r,ab} + nu_r h_{r,a} h_{r,b}) b_r
        //     ],
        //
        //   D_{beta beta ab}
        //   = sum_r [
        //       r_r R_{r,ab}
        //       + w_r h_{r,b} R_{r,a}
        //       + w_r h_{r,a} R_{r,b}
        //       + w_r(c_{r,ab} b_r^T + b_r c_{r,ab}^T
        //             + c_{r,a} c_{r,b}^T + c_{r,b} c_{r,a}^T
        //             + h_{r,ab} Q_r)
        //       + nu_r h_{r,b}(c_{r,a} b_r^T + b_r c_{r,a}^T)
        //       + nu_r h_{r,a}(c_{r,b} b_r^T + b_r c_{r,b}^T)
        //       + nu_r h_{r,a} h_{r,b} Q_r
        //       + (tau_r h_{r,a} h_{r,b} + nu_r h_{r,ab}) b_r b_r^T
        //     ].
        //
        // The wiggle specialization enters only through the rowwise q-objects
        // built below from the combined location-side row z_r = [x_{t,r}; B_r(q0)].
        //
        // Mapping to the code in the row loop: r_r = loss_1, w_r = loss_2,
        // nu_r = loss_3, tau_r = loss_4; h_a/h_b/h_ab = q_a/q_b/q_ab;
        // b_r = b; Q_r = q_mat; R_a/R_b/R_ab = r_a/r_b/r_ab.
        //
        // Per-row scratch buffers; each one is reset (fill/assign) inside the
        // loop before it is read.
        let mut b = Array1::<f64>::zeros(total);
        let mut c_a = Array1::<f64>::zeros(total);
        let mut c_b = Array1::<f64>::zeros(total);
        let mut c_ab = Array1::<f64>::zeros(total);
        let mut q_mat = Array2::<f64>::zeros((total, total));
        let mut r_a = Array2::<f64>::zeros((total, total));
        let mut r_b = Array2::<f64>::zeros((total, total));
        let mut r_ab = Array2::<f64>::zeros((total, total));
        let mut qw_a = Array1::<f64>::zeros(pw);
        let mut qw_b = Array1::<f64>::zeros(pw);
        let mut qw_ab = Array1::<f64>::zeros(pw);
        let mut q_tw_a = Array1::<f64>::zeros(pw);
        let mut q_tw_b = Array1::<f64>::zeros(pw);
        let mut q_lw_a = Array1::<f64>::zeros(pw);
        let mut q_lw_b = Array1::<f64>::zeros(pw);
        let mut d0_ab = Array1::<f64>::zeros(pw);
        let mut q_tw_ab = Array1::<f64>::zeros(pw);
        let mut q_lw_ab = Array1::<f64>::zeros(pw);
        for row in 0..n {
            // q is the full predictor (q0 plus the wiggle eta). q0_geom is
            // presumably the set of q0 partials w.r.t. (eta_t, eta_ls) —
            // confirm against nonwiggle_q_derivs.
            let q0 = base_core.q0[row];
            let q = q0 + etaw[row];
            let q0_geom = nonwiggle_q_derivs(eta_t[row], sigma[row]);
            let s_safe = sigma[row];
            let s2 = s_safe * s_safe;
            let s3 = s2 * s_safe;
            let s4 = s3 * s_safe;
            // NOTE(review): looks like d(q_tl_ls)/d(eta_ls) written out through
            // the sigma derivatives (one order above what q0_geom exposes) —
            // confirm against nonwiggle_q_derivs.
            let q0_tl_ls_ls =
                d3s[row] / s2 - 6.0 * ds[row] * d2s[row] / s3 + 6.0 * ds[row].powi(3) / s4;
            let r_sigma = 1.0 / s_safe;

            // First and second psi derivatives of q0 at fixed beta, driven by
            // the predictor drifts z_primary_* and z_ls_*.
            let q0_a = -r_sigma * dir_a.z_primary_psi[row] - q0 * dir_a.z_ls_psi[row];
            let q0_b = -r_sigma * dir_b.z_primary_psi[row] - q0 * dir_b.z_ls_psi[row];
            let q0_ab = -r_sigma * second_drifts.z_primary_ab[row]
                + r_sigma
                    * (dir_a.z_primary_psi[row] * dir_b.z_ls_psi[row]
                        + dir_b.z_primary_psi[row] * dir_a.z_ls_psi[row])
                + q0 * (dir_a.z_ls_psi[row] * dir_b.z_ls_psi[row] - second_drifts.z_ls_ab[row]);

            // psi drifts of the q0 geometry terms (suffix _a / _b / _ab).
            let q0_t_a = q0_geom.q_tl * dir_a.z_ls_psi[row];
            let q0_t_b = q0_geom.q_tl * dir_b.z_ls_psi[row];
            let q0_t_ab = q0_geom.q_tl_ls * dir_a.z_ls_psi[row] * dir_b.z_ls_psi[row]
                + q0_geom.q_tl * second_drifts.z_ls_ab[row];
            let q0_ls_a =
                q0_geom.q_tl * dir_a.z_primary_psi[row] + q0_geom.q_ll * dir_a.z_ls_psi[row];
            let q0_ls_b =
                q0_geom.q_tl * dir_b.z_primary_psi[row] + q0_geom.q_ll * dir_b.z_ls_psi[row];
            // NOTE(review): reuses q0_ab with a sign flip instead of expanding
            // through q0_geom; presumably relies on q_ls = -q0 under the exp
            // sigma link — confirm.
            let q0_ls_ab = -q0_ab;
            let q0_tl_a = q0_geom.q_tl_ls * dir_a.z_ls_psi[row];
            let q0_tl_b = q0_geom.q_tl_ls * dir_b.z_ls_psi[row];
            let q0_tl_ab = q0_tl_ls_ls * dir_a.z_ls_psi[row] * dir_b.z_ls_psi[row]
                + q0_geom.q_tl_ls * second_drifts.z_ls_ab[row];
            let q0_ll_a =
                q0_geom.q_tl_ls * dir_a.z_primary_psi[row] + q0_geom.q_ll_ls * dir_a.z_ls_psi[row];
            let q0_ll_b =
                q0_geom.q_tl_ls * dir_b.z_primary_psi[row] + q0_geom.q_ll_ls * dir_b.z_ls_psi[row];
            // NOTE(review): same shortcut as q0_ls_ab; presumably relies on
            // q_ll = q0 under the exp sigma link — confirm.
            let q0_ll_ab = q0_ab;

            // psi drifts of m and g2, which depend on psi only through q0.
            let m_a = g2[row] * q0_a;
            let m_b = g2[row] * q0_b;
            let m_ab = g3[row] * q0_a * q0_b + g2[row] * q0_ab;
            let g2_a = g3[row] * q0_a;
            let g2_b = g3[row] * q0_b;
            let g2_ab = g4[row] * q0_a * q0_b + g3[row] * q0_ab;

            // Scalar q derivatives for the (t, ls) blocks: plain, then with one
            // psi drift (_a / _b), then with both (_ab), combining m, g2 and
            // the q0 geometry terms by the product rule.
            let q_a = m[row] * q0_a;
            let q_b = m[row] * q0_b;
            let q_ab = m[row] * q0_ab + g2[row] * q0_a * q0_b;
            let q_t = m[row] * q0_geom.q_t;
            let q_ls = m[row] * q0_geom.q_ls;
            let q_tt = g2[row] * q0_geom.q_t * q0_geom.q_t;
            let q_tl = g2[row] * q0_geom.q_t * q0_geom.q_ls + m[row] * q0_geom.q_tl;
            let q_ll = g2[row] * q0_geom.q_ls * q0_geom.q_ls + m[row] * q0_geom.q_ll;
            let q_t_a = m_a * q0_geom.q_t + m[row] * q0_t_a;
            let q_t_b = m_b * q0_geom.q_t + m[row] * q0_t_b;
            let q_ls_a = m_a * q0_geom.q_ls + m[row] * q0_ls_a;
            let q_ls_b = m_b * q0_geom.q_ls + m[row] * q0_ls_b;
            let q_t_ab = m_ab * q0_geom.q_t + m_a * q0_t_b + m_b * q0_t_a + m[row] * q0_t_ab;
            let q_ls_ab = m_ab * q0_geom.q_ls + m_a * q0_ls_b + m_b * q0_ls_a + m[row] * q0_ls_ab;
            let q_tt_a = g2_a * q0_geom.q_t * q0_geom.q_t + g2[row] * 2.0 * q0_geom.q_t * q0_t_a;
            let q_tt_b = g2_b * q0_geom.q_t * q0_geom.q_t + g2[row] * 2.0 * q0_geom.q_t * q0_t_b;
            let q_tt_ab = g2_ab * q0_geom.q_t * q0_geom.q_t
                + g2_a * 2.0 * q0_geom.q_t * q0_t_b
                + g2_b * 2.0 * q0_geom.q_t * q0_t_a
                + g2[row] * (2.0 * q0_t_a * q0_t_b + 2.0 * q0_geom.q_t * q0_t_ab);
            let q_tl_a = g2_a * q0_geom.q_t * q0_geom.q_ls
                + g2[row] * (q0_t_a * q0_geom.q_ls + q0_geom.q_t * q0_ls_a)
                + m_a * q0_geom.q_tl
                + m[row] * q0_tl_a;
            let q_tl_b = g2_b * q0_geom.q_t * q0_geom.q_ls
                + g2[row] * (q0_t_b * q0_geom.q_ls + q0_geom.q_t * q0_ls_b)
                + m_b * q0_geom.q_tl
                + m[row] * q0_tl_b;
            let q_tl_ab = g2_ab * q0_geom.q_t * q0_geom.q_ls
                + g2_a * (q0_t_b * q0_geom.q_ls + q0_geom.q_t * q0_ls_b)
                + g2_b * (q0_t_a * q0_geom.q_ls + q0_geom.q_t * q0_ls_a)
                + g2[row]
                    * (q0_t_ab * q0_geom.q_ls
                        + q0_t_a * q0_ls_b
                        + q0_t_b * q0_ls_a
                        + q0_geom.q_t * q0_ls_ab)
                + m_ab * q0_geom.q_tl
                + m_a * q0_tl_b
                + m_b * q0_tl_a
                + m[row] * q0_tl_ab;
            let q_ll_a = g2_a * q0_geom.q_ls * q0_geom.q_ls
                + g2[row] * 2.0 * q0_geom.q_ls * q0_ls_a
                + m_a * q0_geom.q_ll
                + m[row] * q0_ll_a;
            let q_ll_b = g2_b * q0_geom.q_ls * q0_geom.q_ls
                + g2[row] * 2.0 * q0_geom.q_ls * q0_ls_b
                + m_b * q0_geom.q_ll
                + m[row] * q0_ll_b;
            let q_ll_ab = g2_ab * q0_geom.q_ls * q0_geom.q_ls
                + g2_a * 2.0 * q0_geom.q_ls * q0_ls_b
                + g2_b * 2.0 * q0_geom.q_ls * q0_ls_a
                + g2[row] * (2.0 * q0_ls_a * q0_ls_b + 2.0 * q0_geom.q_ls * q0_ls_ab)
                + m_ab * q0_geom.q_ll
                + m_a * q0_ll_b
                + m_b * q0_ll_a
                + m[row] * q0_ll_ab;

            // Wiggle-block row vectors (length pw), built from the basis rows
            // B, B', B'', B''' at this observation.
            let brow = b0.row(row);
            let drow = d0.row(row);
            let ddrow = dd0.row(row);
            let d3row = d3_basis.row(row);
            // qw_*: psi drifts of the basis row B(q0).
            qw_a.fill(0.0);
            qw_a.scaled_add(q0_a, &drow);
            qw_b.fill(0.0);
            qw_b.scaled_add(q0_b, &drow);
            qw_ab.fill(0.0);
            qw_ab.scaled_add(q0_a * q0_b, &ddrow);
            qw_ab.scaled_add(q0_ab, &drow);
            // q_tw_* / q_lw_*: psi drifts of the (t, w) and (ls, w) cross
            // factors q0_geom.q_t * B' and q0_geom.q_ls * B'.
            q_tw_a.fill(0.0);
            q_tw_a.scaled_add(q0_a * q0_geom.q_t, &ddrow);
            q_tw_a.scaled_add(q0_t_a, &drow);
            q_tw_b.fill(0.0);
            q_tw_b.scaled_add(q0_b * q0_geom.q_t, &ddrow);
            q_tw_b.scaled_add(q0_t_b, &drow);
            q_lw_a.fill(0.0);
            q_lw_a.scaled_add(q0_a * q0_geom.q_ls, &ddrow);
            q_lw_a.scaled_add(q0_ls_a, &drow);
            q_lw_b.fill(0.0);
            q_lw_b.scaled_add(q0_b * q0_geom.q_ls, &ddrow);
            q_lw_b.scaled_add(q0_ls_b, &drow);
            // d0_ab: second psi drift of the B' row, shared by both *_ab
            // cross factors below.
            d0_ab.fill(0.0);
            d0_ab.scaled_add(q0_a * q0_b, &d3row);
            d0_ab.scaled_add(q0_ab, &ddrow);
            q_tw_ab.fill(0.0);
            q_tw_ab.scaled_add(q0_geom.q_t, &d0_ab);
            q_tw_ab.scaled_add(q0_b * q0_t_a, &ddrow);
            q_tw_ab.scaled_add(q0_a * q0_t_b, &ddrow);
            q_tw_ab.scaled_add(q0_t_ab, &drow);
            q_lw_ab.fill(0.0);
            q_lw_ab.scaled_add(q0_geom.q_ls, &d0_ab);
            q_lw_ab.scaled_add(q0_b * q0_ls_a, &ddrow);
            q_lw_ab.scaled_add(q0_a * q0_ls_b, &ddrow);
            q_lw_ab.scaled_add(q0_ls_ab, &drow);

            // Row loss derivatives w.r.t. q: orders 1..3 come from one
            // dispatch, order 4 from a separate fallible dispatch.
            let (loss_1, loss_2, loss_3) = binomial_neglog_q_derivatives_dispatch(
                self.y[row],
                self.weights[row],
                q,
                core.mu[row],
                core.dmu_dq[row],
                core.d2mu_dq2[row],
                core.d3mu_dq3[row],
                &self.link_kind,
            );
            let loss_4 = binomial_neglog_q_fourth_derivative_dispatch(
                self.y[row],
                self.weights[row],
                q,
                core.mu[row],
                core.dmu_dq[row],
                core.d2mu_dq2[row],
                core.d3mu_dq3[row],
                &self.link_kind,
            )?;
            // D_ab contribution of this row.
            objective_psi_psi += loss_2 * q_a * q_b + loss_1 * q_ab;

            // Design rows and their psi-drifted counterparts for this row.
            let xtr = x_t.row(row);
            let xlsr = x_ls.row(row);
            let xta = x_t_a_map.row_vector(row)?;
            let xtb = x_t_b_map.row_vector(row)?;
            let xlsa = x_ls_a_map.row_vector(row)?;
            let xlsb = x_ls_b_map.row_vector(row)?;
            let xtab = x_t_ab_map.row_vector(row)?;
            let xlsab = x_ls_ab_map.row_vector(row)?;

            // b = q_beta, and c_a / c_b / c_ab = its psi drifts, each stacked
            // as [t block; ls block; wiggle block].
            b.fill(0.0);
            b.slice_mut(s![0..pt]).scaled_add(q_t, &xtr);
            b.slice_mut(s![pt..pt + pls]).scaled_add(q_ls, &xlsr);
            b.slice_mut(s![pt + pls..]).assign(&brow);
            c_a.fill(0.0);
            c_a.slice_mut(s![0..pt]).scaled_add(q_t_a, &xtr);
            c_a.slice_mut(s![0..pt]).scaled_add(q_t, &xta.view());
            c_a.slice_mut(s![pt..pt + pls]).scaled_add(q_ls_a, &xlsr);
            c_a.slice_mut(s![pt..pt + pls])
                .scaled_add(q_ls, &xlsa.view());
            c_a.slice_mut(s![pt + pls..]).assign(&qw_a);
            c_b.fill(0.0);
            c_b.slice_mut(s![0..pt]).scaled_add(q_t_b, &xtr);
            c_b.slice_mut(s![0..pt]).scaled_add(q_t, &xtb.view());
            c_b.slice_mut(s![pt..pt + pls]).scaled_add(q_ls_b, &xlsr);
            c_b.slice_mut(s![pt..pt + pls])
                .scaled_add(q_ls, &xlsb.view());
            c_b.slice_mut(s![pt + pls..]).assign(&qw_b);
            c_ab.fill(0.0);
            c_ab.slice_mut(s![0..pt]).scaled_add(q_t_ab, &xtr);
            c_ab.slice_mut(s![0..pt]).scaled_add(q_t_b, &xta.view());
            c_ab.slice_mut(s![0..pt]).scaled_add(q_t_a, &xtb.view());
            c_ab.slice_mut(s![0..pt]).scaled_add(q_t, &xtab.view());
            c_ab.slice_mut(s![pt..pt + pls]).scaled_add(q_ls_ab, &xlsr);
            c_ab.slice_mut(s![pt..pt + pls])
                .scaled_add(q_ls_b, &xlsa.view());
            c_ab.slice_mut(s![pt..pt + pls])
                .scaled_add(q_ls_a, &xlsb.view());
            c_ab.slice_mut(s![pt..pt + pls])
                .scaled_add(q_ls, &xlsab.view());
            c_ab.slice_mut(s![pt + pls..]).assign(&qw_ab);

            // D_{beta ab} contribution of this row.
            score_psi_psi.scaled_add(loss_1, &c_ab);
            score_psi_psi.scaled_add(loss_2 * q_b, &c_a);
            score_psi_psi.scaled_add(loss_2 * q_a, &c_b);
            score_psi_psi.scaled_add(loss_2 * q_ab + loss_3 * q_a * q_b, &b);

            // Q_r = q_{beta beta}. The (t,t), (t,ls), (ls,ls), (t,w) and (ls,w)
            // blocks are written and then mirror_upper_to_lower is applied; the
            // (w,w) block is never written and stays zero.
            q_mat.fill(0.0);
            r_a.fill(0.0);
            r_b.fill(0.0);
            r_ab.fill(0.0);
            scaled_outer_add(q_mat.slice_mut(s![0..pt, 0..pt]), q_tt, xtr, xtr);
            scaled_outer_add(q_mat.slice_mut(s![0..pt, pt..pt + pls]), q_tl, xtr, xlsr);
            scaled_outer_add(
                q_mat.slice_mut(s![pt..pt + pls, pt..pt + pls]),
                q_ll,
                xlsr,
                xlsr,
            );
            scaled_outer_add(
                q_mat.slice_mut(s![0..pt, pt + pls..]),
                q0_geom.q_t,
                xtr,
                drow,
            );
            scaled_outer_add(
                q_mat.slice_mut(s![pt..pt + pls, pt + pls..]),
                q0_geom.q_ls,
                xlsr,
                drow,
            );
            mirror_upper_to_lower(&mut q_mat);

            // R_a: psi_a drift of Q_r, block by block (scalar drift plus the
            // drift of each design-row factor).
            scaled_outer_add(r_a.slice_mut(s![0..pt, 0..pt]), q_tt_a, xtr, xtr);
            scaled_outer_add(r_a.slice_mut(s![0..pt, 0..pt]), q_tt, xta.view(), xtr);
            scaled_outer_add(r_a.slice_mut(s![0..pt, 0..pt]), q_tt, xtr, xta.view());
            scaled_outer_add(r_a.slice_mut(s![0..pt, pt..pt + pls]), q_tl_a, xtr, xlsr);
            scaled_outer_add(
                r_a.slice_mut(s![0..pt, pt..pt + pls]),
                q_tl,
                xta.view(),
                xlsr,
            );
            scaled_outer_add(
                r_a.slice_mut(s![0..pt, pt..pt + pls]),
                q_tl,
                xtr,
                xlsa.view(),
            );
            scaled_outer_add(
                r_a.slice_mut(s![pt..pt + pls, pt..pt + pls]),
                q_ll_a,
                xlsr,
                xlsr,
            );
            scaled_outer_add(
                r_a.slice_mut(s![pt..pt + pls, pt..pt + pls]),
                q_ll,
                xlsa.view(),
                xlsr,
            );
            scaled_outer_add(
                r_a.slice_mut(s![pt..pt + pls, pt..pt + pls]),
                q_ll,
                xlsr,
                xlsa.view(),
            );
            scaled_outer_add(
                r_a.slice_mut(s![0..pt, pt + pls..]),
                q0_geom.q_t,
                xta.view(),
                drow,
            );
            scaled_outer_add(
                r_a.slice_mut(s![0..pt, pt + pls..]),
                1.0,
                xtr,
                q_tw_a.view(),
            );
            scaled_outer_add(
                r_a.slice_mut(s![pt..pt + pls, pt + pls..]),
                q0_geom.q_ls,
                xlsa.view(),
                drow,
            );
            scaled_outer_add(
                r_a.slice_mut(s![pt..pt + pls, pt + pls..]),
                1.0,
                xlsr,
                q_lw_a.view(),
            );
            mirror_upper_to_lower(&mut r_a);

            // R_b: same construction as R_a with the psi_b quantities.
            scaled_outer_add(r_b.slice_mut(s![0..pt, 0..pt]), q_tt_b, xtr, xtr);
            scaled_outer_add(r_b.slice_mut(s![0..pt, 0..pt]), q_tt, xtb.view(), xtr);
            scaled_outer_add(r_b.slice_mut(s![0..pt, 0..pt]), q_tt, xtr, xtb.view());
            scaled_outer_add(r_b.slice_mut(s![0..pt, pt..pt + pls]), q_tl_b, xtr, xlsr);
            scaled_outer_add(
                r_b.slice_mut(s![0..pt, pt..pt + pls]),
                q_tl,
                xtb.view(),
                xlsr,
            );
            scaled_outer_add(
                r_b.slice_mut(s![0..pt, pt..pt + pls]),
                q_tl,
                xtr,
                xlsb.view(),
            );
            scaled_outer_add(
                r_b.slice_mut(s![pt..pt + pls, pt..pt + pls]),
                q_ll_b,
                xlsr,
                xlsr,
            );
            scaled_outer_add(
                r_b.slice_mut(s![pt..pt + pls, pt..pt + pls]),
                q_ll,
                xlsb.view(),
                xlsr,
            );
            scaled_outer_add(
                r_b.slice_mut(s![pt..pt + pls, pt..pt + pls]),
                q_ll,
                xlsr,
                xlsb.view(),
            );
            scaled_outer_add(
                r_b.slice_mut(s![0..pt, pt + pls..]),
                q0_geom.q_t,
                xtb.view(),
                drow,
            );
            scaled_outer_add(
                r_b.slice_mut(s![0..pt, pt + pls..]),
                1.0,
                xtr,
                q_tw_b.view(),
            );
            scaled_outer_add(
                r_b.slice_mut(s![pt..pt + pls, pt + pls..]),
                q0_geom.q_ls,
                xlsb.view(),
                drow,
            );
            scaled_outer_add(
                r_b.slice_mut(s![pt..pt + pls, pt + pls..]),
                1.0,
                xlsr,
                q_lw_b.view(),
            );
            mirror_upper_to_lower(&mut r_b);

            // R_ab: mixed (psi_a, psi_b) drift of Q_r. (t,t) block first.
            scaled_outer_add(r_ab.slice_mut(s![0..pt, 0..pt]), q_tt_ab, xtr, xtr);
            scaled_outer_add(r_ab.slice_mut(s![0..pt, 0..pt]), q_tt_b, xta.view(), xtr);
            scaled_outer_add(r_ab.slice_mut(s![0..pt, 0..pt]), q_tt_b, xtr, xta.view());
            scaled_outer_add(r_ab.slice_mut(s![0..pt, 0..pt]), q_tt_a, xtb.view(), xtr);
            scaled_outer_add(r_ab.slice_mut(s![0..pt, 0..pt]), q_tt_a, xtr, xtb.view());
            scaled_outer_add(r_ab.slice_mut(s![0..pt, 0..pt]), q_tt, xtab.view(), xtr);
            scaled_outer_add(r_ab.slice_mut(s![0..pt, 0..pt]), q_tt, xtr, xtab.view());
            scaled_outer_add(
                r_ab.slice_mut(s![0..pt, 0..pt]),
                q_tt,
                xta.view(),
                xtb.view(),
            );
            scaled_outer_add(
                r_ab.slice_mut(s![0..pt, 0..pt]),
                q_tt,
                xtb.view(),
                xta.view(),
            );

            // R_ab (t,ls) block.
            scaled_outer_add(r_ab.slice_mut(s![0..pt, pt..pt + pls]), q_tl_ab, xtr, xlsr);
            scaled_outer_add(
                r_ab.slice_mut(s![0..pt, pt..pt + pls]),
                q_tl_b,
                xta.view(),
                xlsr,
            );
            scaled_outer_add(
                r_ab.slice_mut(s![0..pt, pt..pt + pls]),
                q_tl_b,
                xtr,
                xlsa.view(),
            );
            scaled_outer_add(
                r_ab.slice_mut(s![0..pt, pt..pt + pls]),
                q_tl_a,
                xtb.view(),
                xlsr,
            );
            scaled_outer_add(
                r_ab.slice_mut(s![0..pt, pt..pt + pls]),
                q_tl_a,
                xtr,
                xlsb.view(),
            );
            scaled_outer_add(
                r_ab.slice_mut(s![0..pt, pt..pt + pls]),
                q_tl,
                xtab.view(),
                xlsr,
            );
            scaled_outer_add(
                r_ab.slice_mut(s![0..pt, pt..pt + pls]),
                q_tl,
                xtr,
                xlsab.view(),
            );
            scaled_outer_add(
                r_ab.slice_mut(s![0..pt, pt..pt + pls]),
                q_tl,
                xta.view(),
                xlsb.view(),
            );
            scaled_outer_add(
                r_ab.slice_mut(s![0..pt, pt..pt + pls]),
                q_tl,
                xtb.view(),
                xlsa.view(),
            );

            // R_ab (ls,ls) block.
            scaled_outer_add(
                r_ab.slice_mut(s![pt..pt + pls, pt..pt + pls]),
                q_ll_ab,
                xlsr,
                xlsr,
            );
            scaled_outer_add(
                r_ab.slice_mut(s![pt..pt + pls, pt..pt + pls]),
                q_ll_b,
                xlsa.view(),
                xlsr,
            );
            scaled_outer_add(
                r_ab.slice_mut(s![pt..pt + pls, pt..pt + pls]),
                q_ll_b,
                xlsr,
                xlsa.view(),
            );
            scaled_outer_add(
                r_ab.slice_mut(s![pt..pt + pls, pt..pt + pls]),
                q_ll_a,
                xlsb.view(),
                xlsr,
            );
            scaled_outer_add(
                r_ab.slice_mut(s![pt..pt + pls, pt..pt + pls]),
                q_ll_a,
                xlsr,
                xlsb.view(),
            );
            scaled_outer_add(
                r_ab.slice_mut(s![pt..pt + pls, pt..pt + pls]),
                q_ll,
                xlsab.view(),
                xlsr,
            );
            scaled_outer_add(
                r_ab.slice_mut(s![pt..pt + pls, pt..pt + pls]),
                q_ll,
                xlsr,
                xlsab.view(),
            );
            scaled_outer_add(
                r_ab.slice_mut(s![pt..pt + pls, pt..pt + pls]),
                q_ll,
                xlsa.view(),
                xlsb.view(),
            );
            scaled_outer_add(
                r_ab.slice_mut(s![pt..pt + pls, pt..pt + pls]),
                q_ll,
                xlsb.view(),
                xlsa.view(),
            );
            // R_ab (t,w) and (ls,w) cross blocks.
            scaled_outer_add(
                r_ab.slice_mut(s![0..pt, pt + pls..]),
                q0_geom.q_t,
                xtab.view(),
                drow,
            );
            scaled_outer_add(
                r_ab.slice_mut(s![0..pt, pt + pls..]),
                1.0,
                xta.view(),
                q_tw_b.view(),
            );
            scaled_outer_add(
                r_ab.slice_mut(s![0..pt, pt + pls..]),
                1.0,
                xtb.view(),
                q_tw_a.view(),
            );
            scaled_outer_add(
                r_ab.slice_mut(s![0..pt, pt + pls..]),
                1.0,
                xtr,
                q_tw_ab.view(),
            );
            scaled_outer_add(
                r_ab.slice_mut(s![pt..pt + pls, pt + pls..]),
                q0_geom.q_ls,
                xlsab.view(),
                drow,
            );
            scaled_outer_add(
                r_ab.slice_mut(s![pt..pt + pls, pt + pls..]),
                1.0,
                xlsa.view(),
                q_lw_b.view(),
            );
            scaled_outer_add(
                r_ab.slice_mut(s![pt..pt + pls, pt + pls..]),
                1.0,
                xlsb.view(),
                q_lw_a.view(),
            );
            scaled_outer_add(
                r_ab.slice_mut(s![pt..pt + pls, pt + pls..]),
                1.0,
                xlsr,
                q_lw_ab.view(),
            );
            mirror_upper_to_lower(&mut r_ab);

            // D_{beta beta ab} contribution of this row, in the same term
            // order as the kernel written out before the loop.
            hessian_psi_psi.scaled_add(loss_1, &r_ab);
            hessian_psi_psi.scaled_add(loss_2 * q_b, &r_a);
            hessian_psi_psi.scaled_add(loss_2 * q_a, &r_b);
            scaled_outer_add(hessian_psi_psi.view_mut(), loss_2, c_ab.view(), b.view());
            scaled_outer_add(hessian_psi_psi.view_mut(), loss_2, b.view(), c_ab.view());
            scaled_outer_add(hessian_psi_psi.view_mut(), loss_2, c_a.view(), c_b.view());
            scaled_outer_add(hessian_psi_psi.view_mut(), loss_2, c_b.view(), c_a.view());
            hessian_psi_psi.scaled_add(loss_2 * q_ab, &q_mat);
            scaled_outer_add(
                hessian_psi_psi.view_mut(),
                loss_3 * q_b,
                c_a.view(),
                b.view(),
            );
            scaled_outer_add(
                hessian_psi_psi.view_mut(),
                loss_3 * q_b,
                b.view(),
                c_a.view(),
            );
            scaled_outer_add(
                hessian_psi_psi.view_mut(),
                loss_3 * q_a,
                c_b.view(),
                b.view(),
            );
            scaled_outer_add(
                hessian_psi_psi.view_mut(),
                loss_3 * q_a,
                b.view(),
                c_b.view(),
            );
            hessian_psi_psi.scaled_add(loss_3 * q_a * q_b, &q_mat);
            scaled_outer_add(
                hessian_psi_psi.view_mut(),
                loss_4 * q_a * q_b + loss_3 * q_ab,
                b.view(),
                b.view(),
            );
        }

        // Dense result only; no operator form is produced by this path.
        Ok(crate::custom_family::ExactNewtonJointPsiSecondOrderTerms {
            objective_psi_psi,
            score_psi_psi,
            hessian_psi_psi,
            hessian_psi_psi_operator: None,
        })
    }

    /// Computes the psi-Hessian directional derivative for hyperparameter
    /// `psi_index` along the flattened coefficient direction `d_beta_flat`,
    /// starting from the realized designs `x_t` and `x_ls`.
    ///
    /// The joint psi direction is resolved first; when none exists the result
    /// is `Ok(None)`. Otherwise the direction is passed to
    /// `exact_newton_joint_psihessian_directional_derivative_from_parts` and
    /// its dense matrix is returned wrapped in `Some`.
    ///
    /// # Errors
    /// Forwards any error from resolving the direction or from the
    /// `_from_parts` computation.
    fn exact_newton_joint_psihessian_directional_derivative_from_designs(
        &self,
        block_states: &[ParameterBlockState],
        derivative_blocks: &[Vec<crate::custom_family::CustomFamilyBlockPsiDerivative>],
        psi_index: usize,
        d_beta_flat: &Array1<f64>,
        x_t: &Array2<f64>,
        x_ls: &Array2<f64>,
    ) -> Result<Option<Array2<f64>>, String> {
        let maybe_direction = self.exact_newton_joint_psi_direction(
            block_states,
            derivative_blocks,
            psi_index,
            x_t,
            x_ls,
            &self.policy,
        )?;
        // Only run the heavy computation when a direction exists, then flip
        // Option<Result<..>> into Result<Option<..>>.
        maybe_direction
            .map(|direction| {
                self.exact_newton_joint_psihessian_directional_derivative_from_parts(
                    block_states,
                    &direction,
                    d_beta_flat,
                    x_t,
                    x_ls,
                )
            })
            .transpose()
    }

    fn exact_newton_joint_psihessian_directional_derivative_from_parts(
        &self,
        block_states: &[ParameterBlockState],
        dir_a: &LocationScaleJointPsiDirection,
        d_beta_flat: &Array1<f64>,
        x_t: &Array2<f64>,
        x_ls: &Array2<f64>,
    ) -> Result<Array2<f64>, String> {
        let pt = x_t.ncols();
        let pls = x_ls.ncols();
        let n = self.y.len();
        let eta_t = &block_states[Self::BLOCK_T].eta;
        let eta_ls = &block_states[Self::BLOCK_LOG_SIGMA].eta;
        let etaw = &block_states[Self::BLOCK_WIGGLE].eta;
        let betaw = &block_states[Self::BLOCK_WIGGLE].beta;
        let core = binomial_location_scale_core(
            &self.y,
            &self.weights,
            eta_t,
            eta_ls,
            Some(etaw),
            &self.link_kind,
        )?;
        let base_core = binomial_location_scale_core(
            &self.y,
            &self.weights,
            eta_t,
            eta_ls,
            None,
            &self.link_kind,
        )?;
        let b0 = self.wiggle_design(base_core.q0.view())?;
        let d0 =
            self.wiggle_basiswith_options(base_core.q0.view(), BasisOptions::first_derivative())?;
        let dd0 =
            self.wiggle_basiswith_options(base_core.q0.view(), BasisOptions::second_derivative())?;
        let d3_basis = self.wiggle_d3basis_constrained(base_core.q0.view())?;
        let d4q = self.wiggle_d4q_dq04(base_core.q0.view(), betaw.view())?;
        let pw = b0.ncols();
        let layout = GamlssBetaLayout::withwiggle(pt, pls, pw);
        let (u_t, u_ls, uw) = layout.split_three(
            d_beta_flat,
            "wiggle joint psi hessian directional derivative",
        )?;
        let total = pt + pls + pw;
        if d0.ncols() != betaw.len()
            || dd0.ncols() != betaw.len()
            || d3_basis.ncols() != betaw.len()
        {
            return Err(GamlssError::DimensionMismatch { reason: format!(
                "wiggle derivative/beta mismatch in joint psi mixed drift: B'={} B''={} B'''={} betaw={}",
                d0.ncols(),
                dd0.ncols(),
                d3_basis.ncols(),
                betaw.len()
            ) }.into());
        }
        let xi_t = x_t.dot(&u_t);
        let xi_ls = x_ls.dot(&u_ls);
        let x_t_map = dir_a.x_primary_psi.as_linear_map_ref();
        let x_ls_map = dir_a.x_ls_psi.as_linear_map_ref();
        let m = d0.dot(betaw) + 1.0;
        let g2 = dd0.dot(betaw);
        let g3 = self.wiggle_d3q_dq03(base_core.q0.view(), betaw.view())?;
        let g4 = d4q;
        let (sigma, ds, d2s, d3s, d4s) = exp_sigma_derivs_up_to_fourth_array(eta_ls.view());

        // Exact likelihood-side mixed drift T_a[u] = D_beta H_{psi_a}^{(D)}[u].
        //
        // The unified outer Hessian in custom_family.rs uses
        //   ddot H_ij = H_ij + T_i[beta_j] + T_j[beta_i]
        //             + D_beta H[beta_ij] + D_beta^2 H[beta_i, beta_j].
        //
        // For wiggle we still use the same scalar-loss row kernel as non-wiggle;
        // only the location-side row changes to z_r = [x_{t,r}; B_r(q0)] with
        // q = q0 + betaw^T B(q0), q0 = -eta_t * exp(-eta_ls).
        let mut out = Array2::<f64>::zeros((total, total));
        let mut b = Array1::<f64>::zeros(total);
        let mut c_a = Array1::<f64>::zeros(total);
        let mut gamma = Array1::<f64>::zeros(total);
        let mut gamma_a = Array1::<f64>::zeros(total);
        let mut q_mat = Array2::<f64>::zeros((total, total));
        let mut r_a = Array2::<f64>::zeros((total, total));
        let mut c_u = Array2::<f64>::zeros((total, total));
        let mut delta_a = Array2::<f64>::zeros((total, total));
        let mut q_tw = Array1::<f64>::zeros(pw);
        let mut q_lw = Array1::<f64>::zeros(pw);
        let mut qw_a = Array1::<f64>::zeros(pw);
        let mut q_tw_a = Array1::<f64>::zeros(pw);
        let mut q_lw_a = Array1::<f64>::zeros(pw);
        let mut dq_tw_u = Array1::<f64>::zeros(pw);
        let mut dq_lw_u = Array1::<f64>::zeros(pw);
        let mut dq_tw_a_u = Array1::<f64>::zeros(pw);
        let mut dq_lw_a_u = Array1::<f64>::zeros(pw);
        for row in 0..n {
            let q = core.q0[row] + etaw[row];
            let (loss_1, loss_2, loss_3) = binomial_neglog_q_derivatives_dispatch(
                self.y[row],
                self.weights[row],
                q,
                core.mu[row],
                core.dmu_dq[row],
                core.d2mu_dq2[row],
                core.d3mu_dq3[row],
                &self.link_kind,
            );
            let loss_4 = binomial_neglog_q_fourth_derivative_dispatch(
                self.y[row],
                self.weights[row],
                q,
                core.mu[row],
                core.dmu_dq[row],
                core.d2mu_dq2[row],
                core.d3mu_dq3[row],
                &self.link_kind,
            )?;
            let q0 = nonwiggle_q_derivs(eta_t[row], sigma[row]);
            let s_safe = sigma[row];
            let s2 = s_safe * s_safe;
            let s3 = s2 * s_safe;
            let s4 = s3 * s_safe;
            let s5 = s4 * s_safe;
            let q0_tl_ls_ls = d3s[row] / s2 - 6.0 * ds[row] * d2s[row] / s3
                + 6.0 * ds[row] * ds[row] * ds[row] / s4;
            let q0_tl_ls_ls_ls =
                d4s[row] / s2 - 8.0 * ds[row] * d3s[row] / s3 - 6.0 * d2s[row] * d2s[row] / s3
                    + 36.0 * ds[row] * ds[row] * d2s[row] / s4
                    - 24.0 * ds[row] * ds[row] * ds[row] * ds[row] / s5;
            let q0_ll_ls_ls = eta_t[row] * q0_tl_ls_ls_ls;

            let xtr = x_t.row(row);
            let xlsr = x_ls.row(row);
            let xta = x_t_map.row_vector(row)?;
            let xlsa = x_ls_map.row_vector(row)?;
            let br = b0.row(row);
            let dr = d0.row(row);
            let ddr = dd0.row(row);
            let d3r = d3_basis.row(row);

            let xi_t_i = xi_t[row];
            let xi_ls_i = xi_ls[row];
            let xi_ta_i = xta.dot(&u_t);
            let xi_lsa_i = xlsa.dot(&u_ls);
            let d_dot_u = dr.dot(&uw);
            let dd_dot_u = ddr.dot(&uw);
            let d3_dot_u = d3r.dot(&uw);

            let dq0_u = q0.q_t * xi_t_i + q0.q_ls * xi_ls_i;
            let dq0_t_u = q0.q_tl * xi_ls_i;
            let dq0_ls_u = q0.q_tl * xi_t_i + q0.q_ll * xi_ls_i;
            let dq0_tl_u = q0.q_tl_ls * xi_ls_i;
            let dq0_ll_u = q0.q_tl_ls * xi_t_i + q0.q_ll_ls * xi_ls_i;
            let dq0_tl_ls_u = q0_tl_ls_ls * xi_ls_i;
            let dq0_ll_ls_u = q0_tl_ls_ls * xi_t_i + q0_ll_ls_ls * xi_ls_i;

            let q0_a = -q0.q_t * dir_a.z_primary_psi[row] - q0.q_ls * dir_a.z_ls_psi[row];
            let q0_t_a = q0.q_tl_ls * dir_a.z_ls_psi[row];
            let q0_ls_a = q0.q_tl_ls * dir_a.z_primary_psi[row] + q0.q_ll_ls * dir_a.z_ls_psi[row];
            let q0_tl_a = q0.q_tl_ls * dir_a.z_ls_psi[row];
            let q0_ll_a = q0.q_tl_ls * dir_a.z_primary_psi[row] + q0.q_ll_ls * dir_a.z_ls_psi[row];
            let dq0_a_u = q0_t_a * xi_t_i + q0_ls_a * xi_ls_i;
            let dq0_t_a_u = dq0_tl_ls_u * dir_a.z_ls_psi[row];
            let dq0_ls_a_u =
                dq0_tl_ls_u * dir_a.z_primary_psi[row] + dq0_ll_ls_u * dir_a.z_ls_psi[row];
            let dq0_tl_a_u = dq0_tl_ls_u * dir_a.z_ls_psi[row];
            let dq0_ll_a_u =
                dq0_tl_ls_u * dir_a.z_primary_psi[row] + dq0_ll_ls_u * dir_a.z_ls_psi[row];

            let q_t = m[row] * q0.q_t;
            let q_ls = m[row] * q0.q_ls;
            let q_tt = g2[row] * q0.q_t * q0.q_t;
            let q_tl = g2[row] * q0.q_t * q0.q_ls + m[row] * q0.q_tl;
            let q_ll = g2[row] * q0.q_ls * q0.q_ls + m[row] * q0.q_ll;
            q_tw.fill(0.0);
            q_tw.scaled_add(q0.q_t, &dr);
            q_lw.fill(0.0);
            q_lw.scaled_add(q0.q_ls, &dr);

            let dm_u = g2[row] * dq0_u + d_dot_u;
            let dg2_u = g3[row] * dq0_u + dd_dot_u;
            let dg3_u = g4[row] * dq0_u + d3_dot_u;

            let q_a = m[row] * q0_a;
            let q_t_a = g2[row] * q0_a * q0.q_t + m[row] * q0_t_a;
            let q_ls_a = g2[row] * q0_a * q0.q_ls + m[row] * q0_ls_a;
            let q_tt_a = g3[row] * q0_a * q0.q_t * q0.q_t + g2[row] * (2.0 * q0.q_t * q0_t_a);
            let q_tl_a = g3[row] * q0_a * q0.q_t * q0.q_ls
                + g2[row] * (q0_t_a * q0.q_ls + q0.q_t * q0_ls_a + q0_a * q0.q_tl)
                + m[row] * q0_tl_a;
            let q_ll_a = g3[row] * q0_a * q0.q_ls * q0.q_ls
                + g2[row] * (2.0 * q0.q_ls * q0_ls_a + q0_a * q0.q_ll)
                + m[row] * q0_ll_a;
            qw_a.fill(0.0);
            qw_a.scaled_add(q0_a, &dr);
            q_tw_a.fill(0.0);
            q_tw_a.scaled_add(q0_a * q0.q_t, &ddr);
            q_tw_a.scaled_add(q0_t_a, &dr);
            q_lw_a.fill(0.0);
            q_lw_a.scaled_add(q0_a * q0.q_ls, &ddr);
            q_lw_a.scaled_add(q0_ls_a, &dr);

            let dq_tt_u = dg2_u * q0.q_t * q0.q_t + g2[row] * (2.0 * q0.q_t * dq0_t_u);
            let dq_tl_u = dg2_u * q0.q_t * q0.q_ls
                + g2[row] * (dq0_t_u * q0.q_ls + q0.q_t * dq0_ls_u)
                + dm_u * q0.q_tl
                + m[row] * dq0_tl_u;
            let dq_ll_u = dg2_u * q0.q_ls * q0.q_ls
                + g2[row] * (2.0 * q0.q_ls * dq0_ls_u)
                + dm_u * q0.q_ll
                + m[row] * dq0_ll_u;
            dq_tw_u.fill(0.0);
            dq_tw_u.scaled_add(dq0_u * q0.q_t, &ddr);
            dq_tw_u.scaled_add(dq0_t_u, &dr);
            dq_lw_u.fill(0.0);
            dq_lw_u.scaled_add(dq0_u * q0.q_ls, &ddr);
            dq_lw_u.scaled_add(dq0_ls_u, &dr);

            let dq_tt_a_u = dg3_u * q0_a * q0.q_t * q0.q_t
                + g3[row] * (dq0_a_u * q0.q_t * q0.q_t + 2.0 * q0_a * q0.q_t * dq0_t_u)
                + dg2_u * (2.0 * q0.q_t * q0_t_a)
                + g2[row] * (2.0 * dq0_t_u * q0_t_a + 2.0 * q0.q_t * dq0_t_a_u);
            let dq_tl_a_u = dg3_u * q0_a * q0.q_t * q0.q_ls
                + g3[row]
                    * (dq0_a_u * q0.q_t * q0.q_ls
                        + q0_a * dq0_t_u * q0.q_ls
                        + q0_a * q0.q_t * dq0_ls_u)
                + dg2_u * (q0_t_a * q0.q_ls + q0.q_t * q0_ls_a + q0_a * q0.q_tl)
                + g2[row]
                    * (dq0_t_a_u * q0.q_ls
                        + q0_t_a * dq0_ls_u
                        + dq0_t_u * q0_ls_a
                        + q0.q_t * dq0_ls_a_u
                        + dq0_a_u * q0.q_tl
                        + q0_a * dq0_tl_u)
                + dm_u * q0_tl_a
                + m[row] * dq0_tl_a_u;
            let dq_ll_a_u = dg3_u * q0_a * q0.q_ls * q0.q_ls
                + g3[row] * (dq0_a_u * q0.q_ls * q0.q_ls + 2.0 * q0_a * q0.q_ls * dq0_ls_u)
                + dg2_u * (2.0 * q0.q_ls * q0_ls_a + q0_a * q0.q_ll)
                + g2[row]
                    * (2.0 * dq0_ls_u * q0_ls_a
                        + 2.0 * q0.q_ls * dq0_ls_a_u
                        + dq0_a_u * q0.q_ll
                        + q0_a * dq0_ll_u)
                + dm_u * q0_ll_a
                + m[row] * dq0_ll_a_u;
            dq_tw_a_u.fill(0.0);
            dq_tw_a_u.scaled_add(dq0_u * q0_a * q0.q_t, &d3r);
            dq_tw_a_u.scaled_add(dq0_a_u * q0.q_t + q0_a * dq0_t_u + dq0_u * q0_t_a, &ddr);
            dq_tw_a_u.scaled_add(dq0_t_a_u, &dr);
            dq_lw_a_u.fill(0.0);
            dq_lw_a_u.scaled_add(dq0_u * q0_a * q0.q_ls, &d3r);
            dq_lw_a_u.scaled_add(dq0_a_u * q0.q_ls + q0_a * dq0_ls_u + dq0_u * q0_ls_a, &ddr);
            dq_lw_a_u.scaled_add(dq0_ls_a_u, &dr);

            b.fill(0.0);
            b.slice_mut(s![0..pt]).scaled_add(q_t, &xtr);
            b.slice_mut(s![pt..pt + pls]).scaled_add(q_ls, &xlsr);
            b.slice_mut(s![pt + pls..]).assign(&br);

            c_a.fill(0.0);
            c_a.slice_mut(s![0..pt]).scaled_add(q_t_a, &xtr);
            c_a.slice_mut(s![0..pt]).scaled_add(q_t, &xta.view());
            c_a.slice_mut(s![pt..pt + pls]).scaled_add(q_ls_a, &xlsr);
            c_a.slice_mut(s![pt..pt + pls])
                .scaled_add(q_ls, &xlsa.view());
            c_a.slice_mut(s![pt + pls..]).assign(&qw_a);

            gamma.fill(0.0);
            gamma
                .slice_mut(s![0..pt])
                .scaled_add(q_tt * xi_t_i + q_tl * xi_ls_i + q0.q_t * d_dot_u, &xtr);
            gamma
                .slice_mut(s![pt..pt + pls])
                .scaled_add(q_tl * xi_t_i + q_ll * xi_ls_i + q0.q_ls * d_dot_u, &xlsr);
            gamma.slice_mut(s![pt + pls..]).scaled_add(dq0_u, &dr);

            let q_tw_a_dot_u = q_tw_a.dot(&uw);
            let q_lw_a_dot_u = q_lw_a.dot(&uw);
            gamma_a.fill(0.0);
            gamma_a.slice_mut(s![0..pt]).scaled_add(
                q_tt_a * xi_t_i
                    + q_tt * xi_ta_i
                    + q_tl_a * xi_ls_i
                    + q_tl * xi_lsa_i
                    + q_tw_a_dot_u,
                &xtr,
            );
            gamma_a.slice_mut(s![0..pt]).scaled_add(
                q_tt * xi_t_i + q_tl * xi_ls_i + q0.q_t * d_dot_u,
                &xta.view(),
            );
            gamma_a.slice_mut(s![pt..pt + pls]).scaled_add(
                q_tl_a * xi_t_i
                    + q_tl * xi_ta_i
                    + q_ll_a * xi_ls_i
                    + q_ll * xi_lsa_i
                    + q_lw_a_dot_u,
                &xlsr,
            );
            gamma_a.slice_mut(s![pt..pt + pls]).scaled_add(
                q_tl * xi_t_i + q_ll * xi_ls_i + q0.q_ls * d_dot_u,
                &xlsa.view(),
            );
            gamma_a
                .slice_mut(s![pt + pls..])
                .scaled_add(xi_t_i, &q_tw_a);
            gamma_a.slice_mut(s![pt + pls..]).scaled_add(xi_ta_i, &q_tw);
            gamma_a
                .slice_mut(s![pt + pls..])
                .scaled_add(xi_ls_i, &q_lw_a);
            gamma_a
                .slice_mut(s![pt + pls..])
                .scaled_add(xi_lsa_i, &q_lw);

            let alpha = b.dot(d_beta_flat);
            let alpha_a = c_a.dot(d_beta_flat);

            q_mat.fill(0.0);
            scaled_outer_add(q_mat.slice_mut(s![0..pt, 0..pt]), q_tt, xtr, xtr);
            scaled_outer_add(q_mat.slice_mut(s![0..pt, pt..pt + pls]), q_tl, xtr, xlsr);
            scaled_outer_add(
                q_mat.slice_mut(s![pt..pt + pls, pt..pt + pls]),
                q_ll,
                xlsr,
                xlsr,
            );
            scaled_outer_add(
                q_mat.slice_mut(s![0..pt, pt + pls..]),
                1.0,
                xtr,
                q_tw.view(),
            );
            scaled_outer_add(
                q_mat.slice_mut(s![pt..pt + pls, pt + pls..]),
                1.0,
                xlsr,
                q_lw.view(),
            );
            mirror_upper_to_lower(&mut q_mat);

            r_a.fill(0.0);
            scaled_outer_add(r_a.slice_mut(s![0..pt, 0..pt]), q_tt_a, xtr, xtr);
            scaled_outer_add(r_a.slice_mut(s![0..pt, 0..pt]), q_tt, xta.view(), xtr);
            scaled_outer_add(r_a.slice_mut(s![0..pt, 0..pt]), q_tt, xtr, xta.view());
            scaled_outer_add(r_a.slice_mut(s![0..pt, pt..pt + pls]), q_tl_a, xtr, xlsr);
            scaled_outer_add(
                r_a.slice_mut(s![0..pt, pt..pt + pls]),
                q_tl,
                xta.view(),
                xlsr,
            );
            scaled_outer_add(
                r_a.slice_mut(s![0..pt, pt..pt + pls]),
                q_tl,
                xtr,
                xlsa.view(),
            );
            scaled_outer_add(
                r_a.slice_mut(s![pt..pt + pls, pt..pt + pls]),
                q_ll_a,
                xlsr,
                xlsr,
            );
            scaled_outer_add(
                r_a.slice_mut(s![pt..pt + pls, pt..pt + pls]),
                q_ll,
                xlsa.view(),
                xlsr,
            );
            scaled_outer_add(
                r_a.slice_mut(s![pt..pt + pls, pt..pt + pls]),
                q_ll,
                xlsr,
                xlsa.view(),
            );
            scaled_outer_add(
                r_a.slice_mut(s![0..pt, pt + pls..]),
                1.0,
                xta.view(),
                q_tw.view(),
            );
            scaled_outer_add(
                r_a.slice_mut(s![0..pt, pt + pls..]),
                1.0,
                xtr,
                q_tw_a.view(),
            );
            scaled_outer_add(
                r_a.slice_mut(s![pt..pt + pls, pt + pls..]),
                1.0,
                xlsa.view(),
                q_lw.view(),
            );
            scaled_outer_add(
                r_a.slice_mut(s![pt..pt + pls, pt + pls..]),
                1.0,
                xlsr,
                q_lw_a.view(),
            );
            mirror_upper_to_lower(&mut r_a);

            c_u.fill(0.0);
            scaled_outer_add(c_u.slice_mut(s![0..pt, 0..pt]), dq_tt_u, xtr, xtr);
            scaled_outer_add(c_u.slice_mut(s![0..pt, pt..pt + pls]), dq_tl_u, xtr, xlsr);
            scaled_outer_add(
                c_u.slice_mut(s![pt..pt + pls, pt..pt + pls]),
                dq_ll_u,
                xlsr,
                xlsr,
            );
            scaled_outer_add(
                c_u.slice_mut(s![0..pt, pt + pls..]),
                1.0,
                xtr,
                dq_tw_u.view(),
            );
            scaled_outer_add(
                c_u.slice_mut(s![pt..pt + pls, pt + pls..]),
                1.0,
                xlsr,
                dq_lw_u.view(),
            );
            mirror_upper_to_lower(&mut c_u);

            delta_a.fill(0.0);
            scaled_outer_add(delta_a.slice_mut(s![0..pt, 0..pt]), dq_tt_a_u, xtr, xtr);
            scaled_outer_add(
                delta_a.slice_mut(s![0..pt, 0..pt]),
                dq_tt_u,
                xta.view(),
                xtr,
            );
            scaled_outer_add(
                delta_a.slice_mut(s![0..pt, 0..pt]),
                dq_tt_u,
                xtr,
                xta.view(),
            );
            scaled_outer_add(
                delta_a.slice_mut(s![0..pt, pt..pt + pls]),
                dq_tl_a_u,
                xtr,
                xlsr,
            );
            scaled_outer_add(
                delta_a.slice_mut(s![0..pt, pt..pt + pls]),
                dq_tl_u,
                xta.view(),
                xlsr,
            );
            scaled_outer_add(
                delta_a.slice_mut(s![0..pt, pt..pt + pls]),
                dq_tl_u,
                xtr,
                xlsa.view(),
            );
            scaled_outer_add(
                delta_a.slice_mut(s![pt..pt + pls, pt..pt + pls]),
                dq_ll_a_u,
                xlsr,
                xlsr,
            );
            scaled_outer_add(
                delta_a.slice_mut(s![pt..pt + pls, pt..pt + pls]),
                dq_ll_u,
                xlsa.view(),
                xlsr,
            );
            scaled_outer_add(
                delta_a.slice_mut(s![pt..pt + pls, pt..pt + pls]),
                dq_ll_u,
                xlsr,
                xlsa.view(),
            );
            scaled_outer_add(
                delta_a.slice_mut(s![0..pt, pt + pls..]),
                1.0,
                xta.view(),
                dq_tw_u.view(),
            );
            scaled_outer_add(
                delta_a.slice_mut(s![0..pt, pt + pls..]),
                1.0,
                xtr,
                dq_tw_a_u.view(),
            );
            scaled_outer_add(
                delta_a.slice_mut(s![pt..pt + pls, pt + pls..]),
                1.0,
                xlsa.view(),
                dq_lw_u.view(),
            );
            scaled_outer_add(
                delta_a.slice_mut(s![pt..pt + pls, pt + pls..]),
                1.0,
                xlsr,
                dq_lw_a_u.view(),
            );
            mirror_upper_to_lower(&mut delta_a);

            out.scaled_add(loss_1, &delta_a);
            out.scaled_add(loss_2 * alpha, &r_a);
            out.scaled_add(loss_2 * q_a, &c_u);
            scaled_outer_add(out.view_mut(), loss_2, gamma_a.view(), b.view());
            scaled_outer_add(out.view_mut(), loss_2, b.view(), gamma_a.view());
            scaled_outer_add(out.view_mut(), loss_2, gamma.view(), c_a.view());
            scaled_outer_add(out.view_mut(), loss_2, c_a.view(), gamma.view());
            out.scaled_add(loss_2 * alpha_a, &q_mat);
            scaled_outer_add(out.view_mut(), loss_3 * alpha * q_a, b.view(), b.view());
            scaled_outer_add(out.view_mut(), loss_3 * q_a, gamma.view(), b.view());
            scaled_outer_add(out.view_mut(), loss_3 * q_a, b.view(), gamma.view());
            scaled_outer_add(out.view_mut(), loss_3 * alpha, c_a.view(), b.view());
            scaled_outer_add(out.view_mut(), loss_3 * alpha, b.view(), c_a.view());
            out.scaled_add(loss_3 * alpha * q_a, &q_mat);
            scaled_outer_add(
                out.view_mut(),
                loss_4 * alpha * q_a + loss_3 * alpha_a,
                b.view(),
                b.view(),
            );
        }
        mirror_upper_to_lower(&mut out);
        Ok(out)
    }

    /// Turnkey constructor for a wiggle block: derives the knot vector from
    /// the q-seed values and then builds the block input on those knots.
    ///
    /// Returns the block input together with the knot vector that was
    /// generated for it, so callers can reuse the same knots later.
    ///
    /// # Errors
    /// Propagates any failure from knot initialization or from the
    /// knot-based block construction.
    pub fn buildwiggle_block_input(
        q_seed: ArrayView1<'_, f64>,
        degree: usize,
        num_internal_knots: usize,
        penalty_order: usize,
        double_penalty: bool,
    ) -> Result<(ParameterBlockInput, Array1<f64>), String> {
        let knot_vector = Self::initializewiggle_knots_from_q(q_seed, degree, num_internal_knots)?;
        // The tuple's first element is fully evaluated (and its borrow of the
        // knots released) before the knot vector itself is moved into place.
        Ok((
            buildwiggle_block_input_from_knots(
                q_seed,
                &knot_vector,
                degree,
                penalty_order,
                double_penalty,
            )?,
            knot_vector,
        ))
    }

    /// Compute the rowwise pieces (diagonal weights + B/B' basis arrays) used
    /// to assemble the joint Hessian for the 3-block wiggle family. Both the
    /// dense Hessian path and the matrix-free workspace consume these pieces
    /// without recomputing the per-row scalar derivatives.
    ///
    /// `block_states` must hold exactly the threshold, log-sigma and wiggle
    /// blocks (indexed by `BLOCK_T`, `BLOCK_LOG_SIGMA`, `BLOCK_WIGGLE`).
    ///
    /// # Errors
    /// Returns a `DimensionMismatch` message when the block count is not 3,
    /// when any linear predictor or the weight vector does not have one entry
    /// per observation, or when the wiggle basis column counts disagree with
    /// the wiggle coefficient length. Failures from the core evaluation and
    /// from the basis builders are propagated unchanged.
    fn wiggle_hessian_row_pieces(
        &self,
        block_states: &[ParameterBlockState],
    ) -> Result<BinomialLocationScaleWiggleHessianRowPieces, String> {
        if block_states.len() != 3 {
            return Err(GamlssError::DimensionMismatch {
                reason: format!(
                    "BinomialLocationScaleWiggleFamily expects 3 blocks, got {}",
                    block_states.len()
                ),
            }
            .into());
        }
        let n = self.y.len();
        let eta_t = &block_states[Self::BLOCK_T].eta;
        let eta_ls = &block_states[Self::BLOCK_LOG_SIGMA].eta;
        let etaw = &block_states[Self::BLOCK_WIGGLE].eta;
        if eta_t.len() != n || eta_ls.len() != n || etaw.len() != n || self.weights.len() != n {
            return Err(GamlssError::DimensionMismatch {
                reason: "BinomialLocationScaleWiggleFamily input size mismatch".to_string(),
            }
            .into());
        }

        // The wiggle coefficients are only read here, so borrow them instead
        // of cloning the whole vector.
        let betaw0 = &block_states[Self::BLOCK_WIGGLE].beta;
        let core0 = binomial_location_scale_core(
            &self.y,
            &self.weights,
            eta_t,
            eta_ls,
            Some(etaw),
            &self.link_kind,
        )?;
        // Wiggle basis values and its first/second derivatives, all evaluated
        // at the q0 reported by the core.
        let b0 = self.wiggle_design(core0.q0.view())?;
        let d0 =
            self.wiggle_basiswith_options(core0.q0.view(), BasisOptions::first_derivative())?;
        let dd0 =
            self.wiggle_basiswith_options(core0.q0.view(), BasisOptions::second_derivative())?;
        if b0.ncols() != betaw0.len() || d0.ncols() != betaw0.len() || dd0.ncols() != betaw0.len() {
            return Err(GamlssError::DimensionMismatch {
                reason: format!(
                    "wiggle basis/beta mismatch in exact joint Hessian: B={} B'={} B''={} betaw={}",
                    b0.ncols(),
                    d0.ncols(),
                    dd0.ncols(),
                    betaw0.len()
                ),
            }
            .into());
        }
        // m = 1 + B'(q0) betaw and g2 = B''(q0) betaw, one entry per row.
        let m = d0.dot(betaw0) + 1.0;
        let g2 = dd0.dot(betaw0);
        let (sigma, ..) = exp_sigma_derivs_up_to_third(eta_ls.view());
        let mut coeff_tt = Array1::<f64>::zeros(n);
        let mut coeff_tl = Array1::<f64>::zeros(n);
        let mut coeff_ll = Array1::<f64>::zeros(n);
        let mut coeff_tw_b = Array1::<f64>::zeros(n);
        let mut coeff_tw_d = Array1::<f64>::zeros(n);
        let mut coeff_lw_b = Array1::<f64>::zeros(n);
        let mut coeff_lw_d = Array1::<f64>::zeros(n);
        let mut coeffww = Array1::<f64>::zeros(n);
        for i in 0..n {
            // Full predictor for this row: base q0 plus the wiggle contribution.
            let q_i = core0.q0[i] + etaw[i];
            // Only the first two loss derivatives in q are needed for the Hessian.
            let (m1, m2, _) = binomial_neglog_q_derivatives_dispatch(
                self.y[i],
                self.weights[i],
                q_i,
                core0.mu[i],
                core0.dmu_dq[i],
                core0.d2mu_dq2[i],
                core0.d3mu_dq3[i],
                &self.link_kind,
            );
            let q0 = nonwiggle_q_derivs(eta_t[i], sigma[i]);

            // Chain the q0 derivatives through the wiggle map using m and g2.
            let q_t = m[i] * q0.q_t;
            let q_ls = m[i] * q0.q_ls;
            let q_tt = g2[i] * q0.q_t * q0.q_t;
            let q_tl = g2[i] * q0.q_t * q0.q_ls + m[i] * q0.q_tl;
            let q_ll = g2[i] * q0.q_ls * q0.q_ls + m[i] * q0.q_ll;

            coeff_tt[i] = hessian_coeff_fromobjective_q_terms(m1, m2, q_t, q_t, q_tt);
            coeff_tl[i] = hessian_coeff_fromobjective_q_terms(m1, m2, q_t, q_ls, q_tl);
            coeff_ll[i] = hessian_coeff_fromobjective_q_terms(m1, m2, q_ls, q_ls, q_ll);
            // Cross terms with the wiggle block split into a part weighting B
            // (`*_b`) and a part weighting B' (`*_d`).
            coeff_tw_b[i] = m2 * q_t;
            coeff_tw_d[i] = m1 * q0.q_t;
            coeff_lw_b[i] = m2 * q_ls;
            coeff_lw_d[i] = m1 * q0.q_ls;
            coeffww[i] = m2;
        }
        Ok(BinomialLocationScaleWiggleHessianRowPieces {
            coeff_tt,
            coeff_tl,
            coeff_ll,
            coeff_tw_b,
            coeff_tw_d,
            coeff_lw_b,
            coeff_lw_d,
            coeffww,
            b0,
            d0,
        })
    }
}

/// Per-row pieces of the 3-block wiggle joint Hessian.
///
/// `coeff_*` are diagonal weights (length n). `b0` and `d0` are the realized
/// wiggle basis values and first-derivative values at the current q0
/// (n × p_w). The dense Hessian path assembles these into a (p_t+p_ls+p_w)²
/// matrix; the matrix-free workspace applies the operator
///
///   r_t = D_tt u_t + D_tl u_ls + D_tw_b (B v_w) + D_tw_d (B' v_w),
///   r_ls = D_tl u_t + D_ll u_ls + D_lw_b (B v_w) + D_lw_d (B' v_w),
///   r_b = D_tw_b u_t + D_lw_b u_ls + D_ww (B v_w),
///   r_d = D_tw_d u_t + D_lw_d u_ls,
///
/// and combines `out_w = B^T r_b + (B')^T r_d` to form `H v` directly.
struct BinomialLocationScaleWiggleHessianRowPieces {
    // Row weights of the threshold × threshold block (D_tt above).
    coeff_tt: Array1<f64>,
    // Row weights of the threshold × log-sigma block (D_tl above).
    coeff_tl: Array1<f64>,
    // Row weights of the log-sigma × log-sigma block (D_ll above).
    coeff_ll: Array1<f64>,
    // Threshold × wiggle weights paired with the basis values `b0` (D_tw_b).
    coeff_tw_b: Array1<f64>,
    // Threshold × wiggle weights paired with the basis derivative `d0` (D_tw_d).
    coeff_tw_d: Array1<f64>,
    // Log-sigma × wiggle weights paired with the basis values `b0` (D_lw_b).
    coeff_lw_b: Array1<f64>,
    // Log-sigma × wiggle weights paired with the basis derivative `d0` (D_lw_d).
    coeff_lw_d: Array1<f64>,
    // Row weights of the wiggle × wiggle block (D_ww above).
    coeffww: Array1<f64>,
    // Wiggle basis values B at the current q0.
    b0: Array2<f64>,
    // Wiggle basis first-derivative values B' at the current q0.
    d0: Array2<f64>,
}

impl BinomialLocationScaleWiggleHessianRowPieces {
    /// Materialize the full joint Hessian as one dense symmetric matrix.
    ///
    /// Coefficients are laid out as `[threshold | log-sigma | wiggle]`. Only
    /// the diagonal and upper off-diagonal blocks are written explicitly; the
    /// lower triangle is filled by `mirror_upper_to_lower`.
    ///
    /// # Errors
    /// Propagates any failure from the weighted cross-product helpers.
    fn assemble_dense(&self, x_t: &Array2<f64>, x_ls: &Array2<f64>) -> Result<Array2<f64>, String> {
        // Start offsets of the log-sigma and wiggle blocks, and the joint size.
        let ls_off = x_t.ncols();
        let w_off = ls_off + x_ls.ncols();
        let dim = w_off + self.b0.ncols();

        let block_tt = xt_diag_x_dense(x_t, &self.coeff_tt)?;
        let block_tl = xt_diag_y_dense(x_t, &self.coeff_tl, x_ls)?;
        let block_ll = xt_diag_x_dense(x_ls, &self.coeff_ll)?;
        // Each wiggle cross block is the sum of a B-weighted and a B'-weighted term.
        let tw_from_b = xt_diag_y_dense(x_t, &self.coeff_tw_b, &self.b0)?;
        let tw_from_d = xt_diag_y_dense(x_t, &self.coeff_tw_d, &self.d0)?;
        let block_tw = tw_from_b + &tw_from_d;
        let lw_from_b = xt_diag_y_dense(x_ls, &self.coeff_lw_b, &self.b0)?;
        let lw_from_d = xt_diag_y_dense(x_ls, &self.coeff_lw_d, &self.d0)?;
        let block_lw = lw_from_b + &lw_from_d;
        let block_ww = xt_diag_x_dense(&self.b0, &self.coeffww)?;

        let mut joint = Array2::<f64>::zeros((dim, dim));
        joint.slice_mut(s![0..ls_off, 0..ls_off]).assign(&block_tt);
        joint
            .slice_mut(s![0..ls_off, ls_off..w_off])
            .assign(&block_tl);
        joint
            .slice_mut(s![ls_off..w_off, ls_off..w_off])
            .assign(&block_ll);
        joint.slice_mut(s![0..ls_off, w_off..dim]).assign(&block_tw);
        joint
            .slice_mut(s![ls_off..w_off, w_off..dim])
            .assign(&block_lw);
        joint.slice_mut(s![w_off..dim, w_off..dim]).assign(&block_ww);
        mirror_upper_to_lower(&mut joint);
        Ok(joint)
    }

    /// Return only the three diagonal blocks `(h_tt, h_ll, h_ww)`; the cross
    /// blocks are never formed. Intended for populating per-block working
    /// sets in `evaluate()`.
    ///
    /// # Errors
    /// Propagates any failure from the weighted cross-product helper.
    fn assemble_block_diagonals(
        &self,
        x_t: &Array2<f64>,
        x_ls: &Array2<f64>,
    ) -> Result<(Array2<f64>, Array2<f64>, Array2<f64>), String> {
        let threshold_block = xt_diag_x_dense(x_t, &self.coeff_tt)?;
        let log_sigma_block = xt_diag_x_dense(x_ls, &self.coeff_ll)?;
        let wiggle_block = xt_diag_x_dense(&self.b0, &self.coeffww)?;
        Ok((threshold_block, log_sigma_block, wiggle_block))
    }
}

/// Per-row coefficient arrays for the BLS Wiggle joint first-directional
/// Hessian derivative `D_β H_L[u]`, shared by the dense `_directional_derivative`
/// assembly and the matrix-free `bls_wiggle_directional_operator`.
///
/// Every array has one entry per observation and is filled by
/// `binomial_wiggle_dh_row_coeffs`. The suffix names which wiggle basis
/// derivative the weight is presumably paired with during assembly
/// (`_b`: B, `_d`: B', `_dd`: B''); the assembly code is the authority on this.
struct BinomialWiggleDhRowCoeffs {
    // Directional weight of the threshold × threshold block.
    coeff_tt: Array1<f64>,
    // Directional weight of the threshold × log-sigma block.
    coeff_tl: Array1<f64>,
    // Directional weight of the log-sigma × log-sigma block.
    coeff_ll: Array1<f64>,
    // Threshold × wiggle weight: m3 * delta_q * q_t + m2 * delta_q_t.
    coeff_tw_b: Array1<f64>,
    // Threshold × wiggle weight collecting the m2 and m1 first-derivative terms.
    coeff_tw_d: Array1<f64>,
    // Threshold × wiggle weight: m1 * delta_q0 * q0_t.
    coeff_tw_dd: Array1<f64>,
    // Log-sigma × wiggle weight: m3 * delta_q * q_ls + m2 * delta_q_ls.
    coeff_lw_b: Array1<f64>,
    // Log-sigma × wiggle weight collecting the m2 and m1 first-derivative terms.
    coeff_lw_d: Array1<f64>,
    // Log-sigma × wiggle weight: m1 * delta_q0 * q0_ls.
    coeff_lw_dd: Array1<f64>,
    // Wiggle × wiggle weight: m3 * delta_q.
    coeffww_bb: Array1<f64>,
    // Wiggle × wiggle weight: m2 * delta_q0.
    coeffww_db: Array1<f64>,
}

/// All references needed to evaluate [`BinomialWiggleDhRowCoeffs`].
///
/// Every field is a shared borrow, so the bundle can be destructured by copy
/// inside the per-row loop.
struct BinomialWiggleDhRowInputs<'a> {
    // Core evaluation supplying q0, mu and the mu derivatives in q per row.
    core0: &'a BinomialLocationScaleCore,
    // Threshold linear predictor.
    eta_t: &'a Array1<f64>,
    // Wiggle linear predictor; added to q0 to form the full q per row.
    etaw: &'a Array1<f64>,
    // Per-row sigma values passed to `nonwiggle_q_derivs`.
    sigma: &'a Array1<f64>,
    // Per-row wiggle chain factors; presumably m = 1 + B'·betaw, g2 = B''·betaw
    // and g3 the third-derivative analogue. Verify against the caller.
    m: &'a Array1<f64>,
    g2: &'a Array1<f64>,
    g3: &'a Array1<f64>,
    // Wiggle basis values and first/second derivative values, one row per observation.
    b0: &'a Array2<f64>,
    d0: &'a Array2<f64>,
    dd0: &'a Array2<f64>,
    // Wiggle-block component of the direction u; dotted with each basis row.
    uw: &'a Array1<f64>,
    // Directional changes of the threshold and log-sigma predictors,
    // forwarded to `nonwiggle_q_directional`.
    d_eta_t: &'a Array1<f64>,
    d_eta_ls: &'a Array1<f64>,
}

impl BinomialLocationScaleWiggleFamily {
    /// Row-by-row evaluation of the 11 coefficient arrays behind the joint
    /// first-directional Hessian derivative. The dense and operator paths
    /// both obtain their weights from this single routine, so they share the
    /// same canonical directional-q formulas.
    ///
    /// `n` is the number of rows to process; every array reachable through
    /// `inputs` is indexed at `0..n`.
    fn binomial_wiggle_dh_row_coeffs(
        &self,
        n: usize,
        inputs: &BinomialWiggleDhRowInputs<'_>,
    ) -> BinomialWiggleDhRowCoeffs {
        let blank = || Array1::<f64>::zeros(n);
        let mut acc = BinomialWiggleDhRowCoeffs {
            coeff_tt: blank(),
            coeff_tl: blank(),
            coeff_ll: blank(),
            coeff_tw_b: blank(),
            coeff_tw_d: blank(),
            coeff_tw_dd: blank(),
            coeff_lw_b: blank(),
            coeff_lw_d: blank(),
            coeff_lw_dd: blank(),
            coeffww_bb: blank(),
            coeffww_db: blank(),
        };

        for row in 0..n {
            // Full predictor for this row and the loss derivatives in q.
            let q_row = inputs.core0.q0[row] + inputs.etaw[row];
            let (loss_1, loss_2, loss_3) = binomial_neglog_q_derivatives_dispatch(
                self.y[row],
                self.weights[row],
                q_row,
                inputs.core0.mu[row],
                inputs.core0.dmu_dq[row],
                inputs.core0.d2mu_dq2[row],
                inputs.core0.d3mu_dq3[row],
                &self.link_kind,
            );
            let base = nonwiggle_q_derivs(inputs.eta_t[row], inputs.sigma[row]);
            let dir = nonwiggle_q_directional(base, inputs.d_eta_t[row], inputs.d_eta_ls[row]);

            let basis_row = inputs.b0.row(row);
            let d1_row = inputs.d0.row(row);
            let d2_row = inputs.dd0.row(row);
            let d1_dot_u = d1_row.dot(inputs.uw);
            let d2_dot_u = d2_row.dot(inputs.uw);

            let g2_row = inputs.g2[row];
            let g3_row = inputs.g3[row];
            let m_row = inputs.m[row];

            // Directional changes of the wiggle chain factors m and g2.
            let delta_m = g2_row * dir.delta_q + d1_dot_u;
            let delta_g2 = g3_row * dir.delta_q + d2_dot_u;

            // First and second derivatives of q through the wiggle map.
            let q_t = m_row * base.q_t;
            let q_ls = m_row * base.q_ls;
            let q_tt = g2_row * base.q_t * base.q_t;
            let q_tl = g2_row * base.q_t * base.q_ls + m_row * base.q_tl;
            let q_ll = g2_row * base.q_ls * base.q_ls + m_row * base.q_ll;

            // Their directional derivatives along u.
            let delta_q_t = delta_m * base.q_t + m_row * dir.delta_q_t;
            let delta_q_ls = delta_m * base.q_ls + m_row * dir.delta_q_ls;
            let delta_q_tt = delta_g2 * base.q_t * base.q_t + g2_row * 2.0 * base.q_t * dir.delta_q_t;
            let delta_q_tl = delta_g2 * base.q_t * base.q_ls
                + g2_row * (dir.delta_q_t * base.q_ls + base.q_t * dir.delta_q_ls)
                + delta_m * base.q_tl
                + m_row * dir.delta_q_tl;
            let delta_q_ll = delta_g2 * base.q_ls * base.q_ls
                + g2_row * 2.0 * base.q_ls * dir.delta_q_ls
                + delta_m * base.q_ll
                + m_row * dir.delta_q_ll;

            // Directional change of q itself, including the direct wiggle part.
            let delta_q = m_row * dir.delta_q + basis_row.dot(inputs.uw);

            acc.coeff_tt[row] = directionalhessian_coeff_fromobjective_q_terms(
                loss_1, loss_2, loss_3, delta_q, q_t, q_t, q_tt, delta_q_t, delta_q_t, delta_q_tt,
            );
            acc.coeff_tl[row] = directionalhessian_coeff_fromobjective_q_terms(
                loss_1, loss_2, loss_3, delta_q, q_t, q_ls, q_tl, delta_q_t, delta_q_ls, delta_q_tl,
            );
            acc.coeff_ll[row] = directionalhessian_coeff_fromobjective_q_terms(
                loss_1,
                loss_2,
                loss_3,
                delta_q,
                q_ls,
                q_ls,
                q_ll,
                delta_q_ls,
                delta_q_ls,
                delta_q_ll,
            );
            acc.coeff_tw_b[row] = loss_3 * delta_q * q_t + loss_2 * delta_q_t;
            acc.coeff_tw_d[row] =
                loss_2 * (q_t * dir.delta_q + delta_q * base.q_t) + loss_1 * dir.delta_q_t;
            acc.coeff_tw_dd[row] = loss_1 * dir.delta_q * base.q_t;
            acc.coeff_lw_b[row] = loss_3 * delta_q * q_ls + loss_2 * delta_q_ls;
            acc.coeff_lw_d[row] =
                loss_2 * (q_ls * dir.delta_q + delta_q * base.q_ls) + loss_1 * dir.delta_q_ls;
            acc.coeff_lw_dd[row] = loss_1 * dir.delta_q * base.q_ls;
            acc.coeffww_bb[row] = loss_3 * delta_q;
            acc.coeffww_db[row] = loss_2 * dir.delta_q;
        }

        acc
    }

    /// Construct the [`BlockEffectiveJacobian`] of block `block_idx`.
    ///
    /// The map has two outputs, (η_threshold, η_log_sigma). The wiggle block
    /// acts on the combined linear predictor through the nonlinear inverse
    /// link, so its effective linear Jacobian is zero.
    ///
    /// - block 0 (threshold):  output 0 = design rows, output 1 = zeros
    /// - block 1 (log_sigma):  output 0 = zeros, output 1 = design rows
    /// - block 2 (wiggle):     all zeros (nonlinear link modulation)
    ///
    /// # Errors
    /// Propagates whatever the shared additive/wiggle layout helper reports
    /// for the given `specs` and `block_idx`.
    pub fn block_effective_jacobian(
        specs: &[ParameterBlockSpec],
        block_idx: usize,
    ) -> Result<Box<dyn BlockEffectiveJacobian>, String> {
        // Describe this family's block structure once, then let the shared
        // layout helper build the Jacobian for the requested block.
        let layout = crate::util::block_jacobian::AdditiveWiggleBlockLayout {
            family: "BinomialLocationScaleWiggleFamily",
            n_outputs: 2,
            additive_blocks: &[Self::BLOCK_T, Self::BLOCK_LOG_SIGMA],
            wiggle_block: Some(Self::BLOCK_WIGGLE),
        };
        layout.block_effective_jacobian(specs, block_idx)
    }
}

impl CustomFamily for BinomialLocationScaleWiggleFamily {
    /// Reports whether the exact-Newton joint Hessian changes with β.
    ///
    /// Always `true` for this family: the Binomial location-scale-wiggle joint
    /// Hessian involves the nonlinear link function evaluated at the combined
    /// predictor, which changes with all three coefficient blocks.
    fn exact_newton_joint_hessian_beta_dependent(&self) -> bool {
        true
    }

    /// Cost estimate for working with the coefficient Hessian of this family.
    ///
    /// Delegates to the shared location-scale cost helper, passing the number
    /// of observations (`self.y.len()`) and the block specs.
    fn coefficient_hessian_cost(&self, specs: &[ParameterBlockSpec]) -> u64 {
        // Operator-aware: the matrix-free workspace applies the joint Hv at
        // O(n · (p_t + p_ℓ + p_w)); the dense build cost is only the fallback
        // when `use_joint_matrix_free_path` declines the operator path.
        let n_obs = self.y.len() as u64;
        crate::families::location_scale_engine::location_scale_coefficient_hessian_cost(
            n_obs, specs,
        )
    }

    /// The wiggle family carries a structural null-space direction: the
    /// threshold β_t and the overall wiggle-intercept combination
    /// `β_w^⊤ B(q₀)` both shift q = q₀ + B^⊤ β_w additively, which makes the
    /// penalized joint Hessian H = H_L + S near-singular along that
    /// direction (σ_min ≈ ridge_floor ≈ 1e-10).  Under the default `Smooth`
    /// regularization this null direction contributes a first-order
    /// component to `d log|H|/dρ` via `φ'(σ_min) · dσ_min/dρ` that cannot
    /// be matched by the analytic `u^⊤ (dH/dρ) u` formula — the
    /// eigenvector `u` for a near-zero σ is numerically arbitrary inside
    /// the null space, so first-order perturbation theory breaks down.
    /// `HardPseudo` excludes σ ≤ ε from BOTH log|H| and its gradient
    /// consistently, so the null direction drops out of the analytic geometry.
    fn pseudo_logdet_mode(&self) -> crate::custom_family::PseudoLogdetMode {
        crate::custom_family::PseudoLogdetMode::HardPseudo
    }

    /// Linear inequality constraints for block `block_idx`.
    ///
    /// Only the wiggle block is constrained: its constraints come from
    /// `monotone_wiggle_nonnegative_constraints`, sized by the number of
    /// columns in the block's design. Every other block yields `Ok(None)`.
    fn block_linear_constraints(
        &self,
        block_states: &[ParameterBlockState],
        block_idx: usize,
        spec: &ParameterBlockSpec,
    ) -> Result<Option<LinearInequalityConstraints>, String> {
        // NOTE(review): this bound appears to exist only to reference the
        // otherwise unused `block_states` parameter; confirm intent.
        assert!(block_states.len() <= isize::MAX as usize);
        match block_idx {
            Self::BLOCK_WIGGLE => Ok(monotone_wiggle_nonnegative_constraints(
                spec.design.ncols(),
            )),
            _ => Ok(None),
        }
    }

    /// Post-update hook applied to a freshly updated block coefficient vector.
    ///
    /// For the wiggle block, `beta` is checked with
    /// `validate_monotone_wiggle_beta_nonnegative` and any validation error is
    /// propagated. For every other block the vector is returned untouched.
    /// `beta` itself is never modified.
    ///
    /// # Panics
    ///
    /// Panics if `block_spec.name` is empty.
    fn post_update_block_beta(
        &self,
        block_states: &[ParameterBlockState],
        block_idx: usize,
        block_spec: &ParameterBlockSpec,
        beta: Array1<f64>,
    ) -> Result<Array1<f64>, String> {
        // NOTE(review): the first bound appears to exist only to reference the
        // otherwise unused `block_states` parameter; confirm intent.
        assert!(block_states.len() <= isize::MAX as usize);
        assert!(!block_spec.name.is_empty());
        if block_idx == Self::BLOCK_WIGGLE {
            validate_monotone_wiggle_beta_nonnegative(
                &beta,
                "BinomialLocationScaleWiggleFamily post-update",
            )?;
        }
        Ok(beta)
    }

    /// Evaluate the family at the current block states.
    ///
    /// Returns the log-likelihood together with one exact-Newton working set
    /// (gradient + dense block-diagonal Hessian) for each of the three blocks
    /// (threshold, log-sigma, wiggle).
    ///
    /// # Errors
    ///
    /// Fails when the number of blocks is not 3, when any per-row vector does
    /// not match `self.y.len()`, when the threshold/log-sigma designs or the
    /// joint dense block designs are unavailable, or when any helper fails.
    fn evaluate(&self, block_states: &[ParameterBlockState]) -> Result<FamilyEvaluation, String> {
        let n_blocks = block_states.len();
        if n_blocks != 3 {
            return Err(GamlssError::DimensionMismatch {
                reason: format!(
                    "BinomialLocationScaleWiggleFamily expects 3 blocks, got {}",
                    n_blocks
                ),
            }
            .into());
        }
        let wiggle_state = &block_states[Self::BLOCK_WIGGLE];
        let eta_t = &block_states[Self::BLOCK_T].eta;
        let eta_ls = &block_states[Self::BLOCK_LOG_SIGMA].eta;
        let etaw = &wiggle_state.eta;
        let n = self.y.len();
        let lengths_agree = eta_t.len() == n
            && eta_ls.len() == n
            && etaw.len() == n
            && self.weights.len() == n;
        if !lengths_agree {
            return Err(GamlssError::DimensionMismatch {
                reason: "BinomialLocationScaleWiggleFamily input size mismatch".to_string(),
            }
            .into());
        }

        let core = binomial_location_scale_core(
            &self.y,
            &self.weights,
            eta_t,
            eta_ls,
            Some(etaw),
            &self.link_kind,
        )?;
        let wiggle_design = self.wiggle_design(core.q0.view())?;
        let dq_dq0 = self.wiggle_dq_dq0(core.q0.view(), wiggle_state.beta.view())?;
        let threshold_design = self.threshold_design.as_ref().ok_or_else(|| {
            "BinomialLocationScaleWiggleFamily exact-newton path is missing threshold design"
                .to_string()
        })?;
        let log_sigma_design = self.log_sigma_design.as_ref().ok_or_else(|| {
            "BinomialLocationScaleWiggleFamily exact-newton path is missing log-sigma design"
                .to_string()
        })?;

        // Row-wise scores in eta space, later pulled back through each design.
        //
        //   q = q0 + w(q0), a = dq/dq0
        //   score_q = -m1   (m1 = dF/dq, F = -ℓ)
        //   score_t[i]  = score_q * a * q0_t
        //   score_ls[i] = score_q * a * q0_ls
        //   score_w[i]  = score_q          (wiggle basis acts on q)
        let mut score_t = Array1::<f64>::zeros(n);
        let mut score_ls = Array1::<f64>::zeros(n);
        let mut score_w = Array1::<f64>::zeros(n);
        for row in 0..n {
            let (m1, _, _) = binomial_neglog_q_derivatives_dispatch(
                self.y[row],
                self.weights[row],
                core.q0[row] + etaw[row],
                core.mu[row],
                core.dmu_dq[row],
                core.d2mu_dq2[row],
                core.d3mu_dq3[row],
                &self.link_kind,
            );
            let score_q = -m1;
            // Shared factor score_q * a; multiplying by the q0 partial afterwards
            // keeps the same left-to-right product as writing it out in full.
            let chain = score_q * dq_dq0[row];
            let q0_partials = nonwiggle_q_derivs(eta_t[row], core.sigma[row]);
            score_t[row] = chain * q0_partials.q_t;
            score_ls[row] = chain * q0_partials.q_ls;
            score_w[row] = score_q;
        }
        let gradient_t = threshold_design.transpose_vector_multiply(&score_t);
        let gradient_ls = log_sigma_design.transpose_vector_multiply(&score_ls);
        let gradient_w = fast_atv(&wiggle_design, &score_w);

        // Only the block diagonals are assembled: the shared row pieces expose
        // them directly, so the full p×p joint matrix and the cross blocks
        // (h_tl, h_tw, h_lw) are never formed here.
        let (x_t, x_ls) = self
            .exact_joint_dense_block_designs(None)?
            .ok_or("BinomialLocationScaleWiggleFamily: joint block designs unavailable")?;
        let row_pieces = self.wiggle_hessian_row_pieces(block_states)?;
        let (hess_t, hess_ls, hess_w) = row_pieces.assemble_block_diagonals(&x_t, &x_ls)?;

        let threshold_set = BlockWorkingSet::ExactNewton {
            gradient: gradient_t,
            hessian: SymmetricMatrix::Dense(hess_t),
        };
        let log_sigma_set = BlockWorkingSet::ExactNewton {
            gradient: gradient_ls,
            hessian: SymmetricMatrix::Dense(hess_ls),
        };
        let wiggle_set = BlockWorkingSet::ExactNewton {
            gradient: gradient_w,
            hessian: SymmetricMatrix::Dense(hess_w),
        };
        Ok(FamilyEvaluation {
            log_likelihood: core.log_likelihood,
            blockworking_sets: vec![threshold_set, log_sigma_set, wiggle_set],
        })
    }

    /// Full-data log-likelihood at the current block states, without
    /// gradients or Hessians.
    ///
    /// # Errors
    ///
    /// Fails when the number of blocks is not 3, when any per-row vector does
    /// not match `self.y.len()`, or when the likelihood helper fails.
    fn log_likelihood_only(&self, block_states: &[ParameterBlockState]) -> Result<f64, String> {
        let n_blocks = block_states.len();
        if n_blocks != 3 {
            return Err(GamlssError::DimensionMismatch {
                reason: format!(
                    "BinomialLocationScaleWiggleFamily expects 3 blocks, got {}",
                    n_blocks
                ),
            }
            .into());
        }
        let eta_t = &block_states[Self::BLOCK_T].eta;
        let eta_ls = &block_states[Self::BLOCK_LOG_SIGMA].eta;
        let etaw = &block_states[Self::BLOCK_WIGGLE].eta;
        let n = self.y.len();
        let lengths_agree = eta_t.len() == n
            && eta_ls.len() == n
            && etaw.len() == n
            && self.weights.len() == n;
        if !lengths_agree {
            return Err(GamlssError::DimensionMismatch {
                reason: "BinomialLocationScaleWiggleFamily input size mismatch".to_string(),
            }
            .into());
        }
        // The wiggle predictor is always supplied for this three-block family.
        let wiggle_eta = Some(etaw);
        binomial_location_scale_ll_only(
            &self.y,
            &self.weights,
            eta_t,
            eta_ls,
            wiggle_eta,
            &self.link_kind,
        )
    }

    /// Outer-only log-likelihood with optional row subsample.
    ///
    /// When `options.outer_score_subsample` is `Some`, only the sampled rows
    /// contribute; each row's per-row log-likelihood term is multiplied by
    /// `WeightedOuterRow.weight`, the Horvitz–Thompson inverse-inclusion
    /// factor 1/π_i (uniform or stratified sampling both supported), so the
    /// partial sum is an unbiased estimator of the full-data log-likelihood.
    /// When `None`, this returns the full-data `log_likelihood_only`. Inner
    /// PIRLS line searches never install the subsample option, so they
    /// continue to score the exact full-data log-likelihood.
    ///
    /// Rows whose observation weight is exactly zero are skipped.
    ///
    /// # Errors
    ///
    /// Fails when the number of blocks is not 3, when any per-row vector does
    /// not match `self.y.len()`, when a subsample row index is out of range
    /// for the data, or when the inverse-link / per-row likelihood evaluation
    /// fails.
    fn log_likelihood_only_with_options(
        &self,
        block_states: &[ParameterBlockState],
        options: &BlockwiseFitOptions,
    ) -> Result<f64, String> {
        let Some(subsample) = options.outer_score_subsample.as_ref() else {
            return self.log_likelihood_only(block_states);
        };
        if block_states.len() != 3 {
            return Err(GamlssError::DimensionMismatch {
                reason: format!(
                    "BinomialLocationScaleWiggleFamily expects 3 blocks, got {}",
                    block_states.len()
                ),
            }
            .into());
        }
        let n = self.y.len();
        let eta_t = &block_states[Self::BLOCK_T].eta;
        let eta_ls = &block_states[Self::BLOCK_LOG_SIGMA].eta;
        let etaw = &block_states[Self::BLOCK_WIGGLE].eta;
        if eta_t.len() != n || eta_ls.len() != n || etaw.len() != n || self.weights.len() != n {
            return Err(GamlssError::DimensionMismatch {
                reason: "BinomialLocationScaleWiggleFamily input size mismatch".to_string(),
            }
            .into());
        }
        use rayon::iter::ParallelIterator;
        let link_kind = &self.link_kind;
        subsample
            .rows
            .par_iter()
            .try_fold(
                || 0.0_f64,
                |acc, row| -> Result<f64, String> {
                    let i = row.index;
                    // The subsample is caller-supplied. Reject a stale or
                    // foreign row index as an error rather than letting the
                    // array indexing below panic inside a rayon worker.
                    if i >= n {
                        return Err(GamlssError::DimensionMismatch {
                            reason: format!(
                                "BinomialLocationScaleWiggleFamily outer-score subsample row index {i} out of range for {n} observations"
                            ),
                        }
                        .into());
                    }
                    let wi = self.weights[i];
                    if wi == 0.0 {
                        return Ok(acc);
                    }
                    let SigmaJet1 { sigma, .. } = exp_sigma_jet1_scalar(eta_ls[i]);
                    let q0 = binomial_location_scale_q0(eta_t[i], sigma);
                    let q = q0 + etaw[i];
                    // NOTE(review): for the probit link `mu` is passed as the
                    // constant 0.5; presumably the per-row likelihood helper
                    // works from `q` directly in that case. Confirm against
                    // `binomial_location_scale_log_likelihood`.
                    let mu = if matches!(link_kind, InverseLink::Standard(StandardLink::Probit)) {
                        0.5
                    } else {
                        let jet = inverse_link_jet_for_inverse_link(link_kind, q).map_err(|e| {
                            format!("location-scale inverse-link evaluation failed: {e}")
                        })?;
                        jet.mu
                    };
                    let term =
                        binomial_location_scale_log_likelihood(self.y[i], wi, q, link_kind, mu)?;
                    Ok(acc + row.weight * term)
                },
            )
            .try_reduce(|| 0.0_f64, |a, b| Ok(a + b))
    }

    /// Always returns `true` for this family.
    ///
    /// NOTE(review): presumably this tells the outer hyperparameter optimizer
    /// to use the joint (all-blocks) path for this family; confirm against the
    /// `CustomFamily` trait documentation.
    fn requires_joint_outer_hyper_path(&self) -> bool {
        true
    }

    /// Block-local exact-Newton directional derivative of the Hessian.
    ///
    /// Embeds `d_beta` into the flattened joint coefficient vector (zeros for
    /// the other two blocks), evaluates the joint directional derivative, and
    /// returns the principal block belonging to `block_idx`.
    ///
    /// Returns `Ok(None)` when `block_idx` is not one of the three blocks.
    ///
    /// # Errors
    ///
    /// Fails when the number of blocks is not 3, when `d_beta` does not have
    /// the block's coefficient count, when the joint derivative is
    /// unavailable, or when any helper fails.
    fn exact_newton_hessian_directional_derivative(
        &self,
        block_states: &[ParameterBlockState],
        block_idx: usize,
        d_beta: &Array1<f64>,
    ) -> Result<Option<Array2<f64>>, String> {
        let n_blocks = block_states.len();
        if n_blocks != 3 {
            return Err(GamlssError::DimensionMismatch {
                reason: format!(
                    "BinomialLocationScaleWiggleFamily expects 3 blocks, got {}",
                    n_blocks
                ),
            }
            .into());
        }
        let (x_t, x_ls) = self.dense_block_designs()?;
        let (pt, pls) = (x_t.ncols(), x_ls.ncols());
        // Only q0 is needed here (to size the wiggle basis), so the core is
        // evaluated without the wiggle predictor.
        let core0 = binomial_location_scale_core(
            &self.y,
            &self.weights,
            &block_states[Self::BLOCK_T].eta,
            &block_states[Self::BLOCK_LOG_SIGMA].eta,
            None,
            &self.link_kind,
        )?;
        let pw = self.wiggle_design(core0.q0.view())?.ncols();
        let wiggle_start = pt + pls;
        let total = wiggle_start + pw;

        // Position of the requested block inside the flattened coefficients.
        let (range_start, range_end) = match block_idx {
            Self::BLOCK_T => (0usize, pt),
            Self::BLOCK_LOG_SIGMA => (pt, wiggle_start),
            Self::BLOCK_WIGGLE => (wiggle_start, total),
            _ => return Ok(None),
        };
        let expected_len = range_end - range_start;
        if d_beta.len() != expected_len {
            return Err(GamlssError::DimensionMismatch {
                reason: format!(
                    "block {block_idx} d_beta length mismatch: got {}, expected {}",
                    d_beta.len(),
                    expected_len
                ),
            }
            .into());
        }

        // The block-local derivative is read off the joint one.
        //
        // With beta = (beta_t, beta_ls, betaw) and H(beta) the full
        // negative-loglik Hessian in flattened block coordinates, a direction
        // that moves a single block,
        //
        //   u = [u_t, 0,   0]   or
        //   u = [0,   u_ls,0]   or
        //   u = [0,   0,   uw],
        //
        // gives the blockwise directional Hessian required by the trait as the
        // matching principal block of D H[u]:
        //
        //   D H_block[u_block] = (D H_joint[u])_{block,block}.
        //
        // This avoids a second, partially duplicated derivation for the
        // block-local case and keeps this callback aligned with the joint
        // formulas.
        let mut d_beta_flat = Array1::<f64>::zeros(total);
        d_beta_flat
            .slice_mut(s![range_start..range_end])
            .assign(d_beta);
        let d_joint = self
            .exact_newton_joint_hessian_directional_derivative(block_states, &d_beta_flat)?
            .ok_or_else(|| "missing exact wiggle joint dH".to_string())?;
        let principal_block = d_joint
            .slice(s![range_start..range_end, range_start..range_end])
            .to_owned();
        Ok(Some(principal_block))
    }

    /// Dense exact joint Hessian of the 3-block binomial location-scale
    /// wiggle family.
    ///
    /// Returns `Ok(None)` when the joint dense block designs are unavailable.
    fn exact_newton_joint_hessian(
        &self,
        block_states: &[ParameterBlockState],
    ) -> Result<Option<Array2<f64>>, String> {
        // Model:
        //   q0 = -eta_t / sigma(eta_ls),
        //   q  = q0 + betaw^T B(q0),
        //   mu = Phi(q),
        //   F  = -sum_i ell_i(mu_i).
        //
        // `wiggle_hessian_row_pieces` computes, once, the shared rowwise
        // weights (coeff_tt, coeff_tl, coeff_ll, coeff_tw_b, coeff_tw_d,
        // coeff_lw_b, coeff_lw_d, coeffww) and the realized B/B' basis arrays.
        // They are assembled into the dense p×p matrix below; the matrix-free
        // workspace path reuses the very same row pieces to apply H to a
        // vector without forming that matrix.
        match self.exact_joint_dense_block_designs(None)? {
            None => Ok(None),
            Some((threshold_x, log_sigma_x)) => {
                let row_pieces = self.wiggle_hessian_row_pieces(block_states)?;
                let dense = row_pieces.assemble_dense(&threshold_x, &log_sigma_x)?;
                Ok(Some(dense))
            }
        }
    }

    /// Always returns `true`: this impl provides `exact_newton_joint_hessian`,
    /// which assembles a dense joint Hessian whenever the joint dense block
    /// designs are available.
    fn has_explicit_joint_hessian(&self) -> bool {
        true
    }

    fn exact_newton_joint_hessian_directional_derivative(
        &self,
        block_states: &[ParameterBlockState],
        d_beta_flat: &Array1<f64>,
    ) -> Result<Option<Array2<f64>>, String> {
        // Exact directional derivative dH[u] for the same 3-block model.
        //
        // Direction:
        //   u = (u_t, u_l, uw),
        //   d_eta_t = X_t u_t, d_eta_l = X_l u_l.
        //
        // Canonical objective identity for scalar-q composition:
        //   dH_ab[u] =
        //      m3 * dq * q_a q_b
        //    + m2 * (dq_a q_b + q_a dq_b + dq q_ab)
        //    + m1 * dq_ab
        // where (m1,m2,m3) are derivatives of F wrt q.
        //
        // Log-likelihood derivative relation used in code:
        //   s = d ell/dq, c = d² ell/dq², t = d³ ell/dq³
        //   m1 = -s, m2 = -c, m3 = -t.
        //
        // Required analytic chain terms:
        //
        // 1) Wiggle scalars:
        //   m  = 1 + betaw^T B'(q0)
        //   g2 = betaw^T B''(q0)
        //   g3 = betaw^T B'''(q0)
        //
        // 2) Directional wiggle scalars:
        //   dm  = (B'·uw)  + g2*dq0
        //   dg2 = (B''·uw) + g3*dq0
        //
        // 3) Directional q pieces:
        //   dq   = m*dq0 + B·uw
        //   dq_t = dm*q0_t + m*dq0_t
        //   dq_l = dm*q0_l + m*dq0_l
        //
        // 4) Directional second q pieces:
        //   dq_tt = dg2*q0_t*q0_t + g2*(2*q0_t*dq0_t)
        //   dq_tl = dg2*q0_t*q0_l + g2*(dq0_t*q0_l + q0_t*dq0_l)
        //           + dm*q0_tl + m*dq0_tl
        //   dq_ll = dg2*q0_l*q0_l + g2*(2*q0_l*dq0_l)
        //           + dm*q0_ll + m*dq0_ll
        //
        // 5) Mixed w-block directional terms:
        //   qw   = B,         dqw   = B' dq0
        //   q_tw  = q0_t B',   dq_tw  = dq0_t B' + dq0 q0_t B''
        //   q_lw  = q0_l B',   dq_lw  = dq0_l B' + dq0 q0_l B''
        //   qww  = 0,         dqww  = 0
        //
        // Implementation below follows these formulas exactly block-by-block.
        if block_states.len() != 3 {
            return Err(GamlssError::DimensionMismatch {
                reason: format!(
                    "BinomialLocationScaleWiggleFamily expects 3 blocks, got {}",
                    block_states.len()
                ),
            }
            .into());
        }
        let n = self.y.len();
        let eta_t = &block_states[Self::BLOCK_T].eta;
        let eta_ls = &block_states[Self::BLOCK_LOG_SIGMA].eta;
        let etaw = &block_states[Self::BLOCK_WIGGLE].eta;
        if eta_t.len() != n || eta_ls.len() != n || etaw.len() != n || self.weights.len() != n {
            return Err(GamlssError::DimensionMismatch {
                reason: "BinomialLocationScaleWiggleFamily input size mismatch".to_string(),
            }
            .into());
        }

        let Some((x_t, x_ls)) = self.exact_joint_dense_block_designs(None)? else {
            return Ok(None);
        };
        let pt = x_t.ncols();
        let pls = x_ls.ncols();
        let betaw0 = block_states[Self::BLOCK_WIGGLE].beta.clone();
        let core0 = binomial_location_scale_core(
            &self.y,
            &self.weights,
            eta_t,
            eta_ls,
            Some(etaw),
            &self.link_kind,
        )?;
        let b0 = self.wiggle_design(core0.q0.view())?;
        let pw = b0.ncols();
        let beta_layout = GamlssBetaLayout::withwiggle(pt, pls, pw);
        let total = beta_layout.total();
        let (u_t, u_ls, uw) = beta_layout.split_three(d_beta_flat, "wiggle joint d_beta")?;
        let d_eta_t = fast_av(&x_t, &u_t);
        let d_eta_ls = fast_av(&x_ls, &u_ls);

        let d0 =
            self.wiggle_basiswith_options(core0.q0.view(), BasisOptions::first_derivative())?;
        let dd0 =
            self.wiggle_basiswith_options(core0.q0.view(), BasisOptions::second_derivative())?;
        let d3q = self.wiggle_d3q_dq03(core0.q0.view(), betaw0.view())?;
        if d0.ncols() != betaw0.len() || dd0.ncols() != betaw0.len() {
            return Err(GamlssError::DimensionMismatch {
                reason: format!(
                    "wiggle derivative/beta mismatch in exact joint dH: B'={} B''={} betaw={}",
                    d0.ncols(),
                    dd0.ncols(),
                    betaw0.len()
                ),
            }
            .into());
        }
        let m = d0.dot(&betaw0) + 1.0;
        let g2 = dd0.dot(&betaw0);
        let g3 = d3q;
        let (sigma, ..) = exp_sigma_derivs_up_to_third(eta_ls.view());

        let BinomialWiggleDhRowCoeffs {
            coeff_tt,
            coeff_tl,
            coeff_ll,
            coeff_tw_b,
            coeff_tw_d,
            coeff_tw_dd,
            coeff_lw_b,
            coeff_lw_d,
            coeff_lw_dd,
            coeffww_bb,
            coeffww_db,
        } = self.binomial_wiggle_dh_row_coeffs(
            n,
            &BinomialWiggleDhRowInputs {
                core0: &core0,
                eta_t,
                etaw,
                sigma: &sigma,
                m: &m,
                g2: &g2,
                g3: &g3,
                b0: &b0,
                d0: &d0,
                dd0: &dd0,
                uw: &uw,
                d_eta_t: &d_eta_t,
                d_eta_ls: &d_eta_ls,
            },
        );
        let d_h_tt = xt_diag_x_dense(&x_t, &coeff_tt)?;
        let d_h_tl = xt_diag_y_dense(&x_t, &coeff_tl, &x_ls)?;
        let d_h_ll = xt_diag_x_dense(&x_ls, &coeff_ll)?;
        let d_h_tw = xt_diag_y_dense(&x_t, &coeff_tw_b, &b0)?
            + &xt_diag_y_dense(&x_t, &coeff_tw_d, &d0)?
            + &xt_diag_y_dense(&x_t, &coeff_tw_dd, &dd0)?;
        let d_h_lw = xt_diag_y_dense(&x_ls, &coeff_lw_b, &b0)?
            + &xt_diag_y_dense(&x_ls, &coeff_lw_d, &d0)?
            + &xt_diag_y_dense(&x_ls, &coeff_lw_dd, &dd0)?;
        let mut d_hww = xt_diag_x_dense(&b0, &coeffww_bb)?;
        d_hww += &xt_diag_y_dense(&d0, &coeffww_db, &b0)?;
        d_hww += &xt_diag_y_dense(&b0, &coeffww_db, &d0)?;

        let mut d_h = Array2::<f64>::zeros((total, total));
        d_h.slice_mut(s![0..pt, 0..pt]).assign(&d_h_tt);
        d_h.slice_mut(s![0..pt, pt..pt + pls]).assign(&d_h_tl);
        d_h.slice_mut(s![pt..pt + pls, pt..pt + pls])
            .assign(&d_h_ll);
        d_h.slice_mut(s![0..pt, pt + pls..total]).assign(&d_h_tw);
        d_h.slice_mut(s![pt..pt + pls, pt + pls..total])
            .assign(&d_h_lw);
        d_h.slice_mut(s![pt + pls..total, pt + pls..total])
            .assign(&d_hww);
        mirror_upper_to_lower(&mut d_h);
        Ok(Some(d_h))
    }

    fn exact_newton_joint_hessiansecond_directional_derivative(
        &self,
        block_states: &[ParameterBlockState],
        d_beta_u_flat: &Array1<f64>,
        d_betav_flat: &Array1<f64>,
    ) -> Result<Option<Array2<f64>>, String> {
        if block_states.len() != 3 {
            return Err(GamlssError::DimensionMismatch {
                reason: format!(
                    "BinomialLocationScaleWiggleFamily expects 3 blocks, got {}",
                    block_states.len()
                ),
            }
            .into());
        }
        let n = self.y.len();
        let eta_t = &block_states[Self::BLOCK_T].eta;
        let eta_ls = &block_states[Self::BLOCK_LOG_SIGMA].eta;
        let etaw = &block_states[Self::BLOCK_WIGGLE].eta;
        if eta_t.len() != n || eta_ls.len() != n || etaw.len() != n || self.weights.len() != n {
            return Err(GamlssError::DimensionMismatch {
                reason: "BinomialLocationScaleWiggleFamily input size mismatch".to_string(),
            }
            .into());
        }

        let Some((x_t, x_ls)) = self.exact_joint_dense_block_designs(None)? else {
            return Ok(None);
        };
        let pt = x_t.ncols();
        let pls = x_ls.ncols();
        let betaw0 = block_states[Self::BLOCK_WIGGLE].beta.clone();
        let core0 = binomial_location_scale_core(
            &self.y,
            &self.weights,
            eta_t,
            eta_ls,
            Some(etaw),
            &self.link_kind,
        )?;
        let b0 = self.wiggle_design(core0.q0.view())?;
        let d0 =
            self.wiggle_basiswith_options(core0.q0.view(), BasisOptions::first_derivative())?;
        let dd0 =
            self.wiggle_basiswith_options(core0.q0.view(), BasisOptions::second_derivative())?;
        let d3_basis = self.wiggle_d3basis_constrained(core0.q0.view())?;
        let d3q = self.wiggle_d3q_dq03(core0.q0.view(), betaw0.view())?;
        let d4q = self.wiggle_d4q_dq04(core0.q0.view(), betaw0.view())?;
        let pw = b0.ncols();
        let beta_layout = GamlssBetaLayout::withwiggle(pt, pls, pw);
        let total = beta_layout.total();
        if d0.ncols() != betaw0.len()
            || dd0.ncols() != betaw0.len()
            || d3_basis.ncols() != betaw0.len()
        {
            return Err(GamlssError::DimensionMismatch { reason: format!(
                "wiggle derivative/beta mismatch in exact joint d2H: B'={} B''={} B'''={} betaw={}",
                d0.ncols(),
                dd0.ncols(),
                d3_basis.ncols(),
                betaw0.len()
            ) }.into());
        }

        let (u_t, u_ls, uw) = beta_layout.split_three(d_beta_u_flat, "wiggle joint d_beta_u")?;
        let (v_t, v_ls, vw) = beta_layout.split_three(d_betav_flat, "wiggle joint d_betav")?;
        let d_eta_t_u = fast_av(&x_t, &u_t);
        let d_eta_ls_u = fast_av(&x_ls, &u_ls);
        let d_eta_tv = fast_av(&x_t, &v_t);
        let d_eta_lsv = fast_av(&x_ls, &v_ls);

        let m = d0.dot(&betaw0) + 1.0;
        let g2 = dd0.dot(&betaw0);
        let g3 = d3q;
        let g4 = d4q;
        let (sigma, ds, d2s, d3s, d4s) = exp_sigma_derivs_up_to_fourth_array(eta_ls.view());

        let mut d2_h: Array2<f64> = (0..n)
            .into_par_iter()
            .map(|i| -> Result<Array2<f64>, String> {
                let mut row_h = Array2::<f64>::zeros((total, total));
                // Per-row scalar objective derivatives for F_i(q).
                let q_i = core0.q0[i] + etaw[i];
                let (m1, m2, m3) = binomial_neglog_q_derivatives_dispatch(
                    self.y[i],
                    self.weights[i],
                    q_i,
                    core0.mu[i],
                    core0.dmu_dq[i],
                    core0.d2mu_dq2[i],
                    core0.d3mu_dq3[i],
                    &self.link_kind,
                );
                let m4 = binomial_neglog_q_fourth_derivative_dispatch(
                    self.y[i],
                    self.weights[i],
                    q_i,
                    core0.mu[i],
                    core0.dmu_dq[i],
                    core0.d2mu_dq2[i],
                    core0.d3mu_dq3[i],
                    &self.link_kind,
                )?;

                // Non-wiggle q0(eta_t, eta_ls) derivatives and sigma-ratio helpers.
                let q0 = nonwiggle_q_derivs(eta_t[i], sigma[i]);
                let s_safe = sigma[i];
                let s2 = s_safe * s_safe;
                let s3 = s2 * s_safe;
                let s4 = s3 * s_safe;
                let s5 = s4 * s_safe;
                let q0_tl_ls_ls =
                    d3s[i] / s2 - 6.0 * ds[i] * d2s[i] / s3 + 6.0 * ds[i] * ds[i] * ds[i] / s4;
                let q0_tl_ls_ls_ls =
                    d4s[i] / s2 - 8.0 * ds[i] * d3s[i] / s3 - 6.0 * d2s[i] * d2s[i] / s3
                        + 36.0 * ds[i] * ds[i] * d2s[i] / s4
                        - 24.0 * ds[i] * ds[i] * ds[i] * ds[i] / s5;
                let q0_ll_ls_ls = eta_t[i] * q0_tl_ls_ls_ls;

                let u_t_i = d_eta_t_u[i];
                let u_ls_i = d_eta_ls_u[i];
                let v_t_i = d_eta_tv[i];
                let v_ls_i = d_eta_lsv[i];

                // Directional z=q0 primitives for u and v.
                let dq0_u = q0.q_t * u_t_i + q0.q_ls * u_ls_i;
                let dq0v = q0.q_t * v_t_i + q0.q_ls * v_ls_i;
                let d2q0_uv =
                    q0.q_tl * (u_t_i * v_ls_i + v_t_i * u_ls_i) + q0.q_ll * u_ls_i * v_ls_i;

                let dq0_t_u = q0.q_tl * u_ls_i;
                let dq0_tv = q0.q_tl * v_ls_i;
                let dq0_ls_u = q0.q_tl * u_t_i + q0.q_ll * u_ls_i;
                let dq0_lsv = q0.q_tl * v_t_i + q0.q_ll * v_ls_i;
                let dq0_tl_u = q0.q_tl_ls * u_ls_i;
                let dq0_tlv = q0.q_tl_ls * v_ls_i;
                let dq0_ll_u = q0.q_tl_ls * u_t_i + q0.q_ll_ls * u_ls_i;
                let dq0_llv = q0.q_tl_ls * v_t_i + q0.q_ll_ls * v_ls_i;

                let d2q0_t_uv = q0.q_tl_ls * u_ls_i * v_ls_i;
                let d2q0_ls_uv =
                    q0.q_tl_ls * (u_ls_i * v_t_i + v_ls_i * u_t_i) + q0.q_ll_ls * u_ls_i * v_ls_i;
                let d2q0_tl_uv = q0_tl_ls_ls * u_ls_i * v_ls_i;
                let d2q0_ll_uv =
                    q0_tl_ls_ls * (u_t_i * v_ls_i + v_t_i * u_ls_i) + q0_ll_ls_ls * u_ls_i * v_ls_i;

                let br = b0.row(i);
                let dr = d0.row(i);
                let ddr = dd0.row(i);
                let d3r = d3_basis.row(i);
                let b_u = br.dot(&uw);
                let bv = br.dot(&vw);
                let b1_u = dr.dot(&uw);
                let b1v = dr.dot(&vw);
                let b2_u = ddr.dot(&uw);
                let b2v = ddr.dot(&vw);
                let b3_u = d3r.dot(&uw);
                let b3v = d3r.dot(&vw);

                // Wiggle scalar chain terms:
                //   m = 1 + g1,     g2 = betaw^T B''(q0),
                //   dm[u]   = B'·uw + g2*dq0[u],
                //   d2m[u,v]= g3*dq0[u]dq0[v] + g2*d2q0[u,v] + (B''·vw)dq0[u] + (B''·uw)dq0[v],
                //   dg2[u]  = B''·uw + g3*dq0[u],
                //   d2g2[u,v]=g4*dq0[u]dq0[v] + g3*d2q0[u,v] + (B'''·vw)dq0[u] + (B'''·uw)dq0[v].
                let dm_u = b1_u + g2[i] * dq0_u;
                let dmv = b1v + g2[i] * dq0v;
                let d2m_uv = g3[i] * dq0_u * dq0v + g2[i] * d2q0_uv + b2v * dq0_u + b2_u * dq0v;
                let dg2_u = b2_u + g3[i] * dq0_u;
                let dg2v = b2v + g3[i] * dq0v;
                let d2g2_uv = g4[i] * dq0_u * dq0v + g3[i] * d2q0_uv + b3v * dq0_u + b3_u * dq0v;

                // First/second directional terms for total q.
                let dq_u = m[i] * dq0_u + b_u;
                let dqv = m[i] * dq0v + bv;
                // Simplify exact formula for q = q0 + betaw^T B(q0):
                //   D²q[u,v] = m*d²q0 + g2*dq0[u]dq0[v] + (B'·uw)dq0[v] + (B'·vw)dq0[u].
                let d2q_uv = m[i] * d2q0_uv + g2[i] * dq0_u * dq0v + b1_u * dq0v + b1v * dq0_u;

                // q partials by block and their first/second directional derivatives.
                let q_t = m[i] * q0.q_t;
                let q_ls = m[i] * q0.q_ls;
                let q_tt = g2[i] * q0.q_t * q0.q_t;
                let q_tl = g2[i] * q0.q_t * q0.q_ls + m[i] * q0.q_tl;
                let q_ll = g2[i] * q0.q_ls * q0.q_ls + m[i] * q0.q_ll;

                let dq_t_u = dm_u * q0.q_t + m[i] * dq0_t_u;
                let dq_tv = dmv * q0.q_t + m[i] * dq0_tv;
                let dq_ls_u = dm_u * q0.q_ls + m[i] * dq0_ls_u;
                let dq_lsv = dmv * q0.q_ls + m[i] * dq0_lsv;

                let d2q_t_uv = d2m_uv * q0.q_t + dm_u * dq0_tv + dmv * dq0_t_u + m[i] * d2q0_t_uv;
                let d2q_ls_uv =
                    d2m_uv * q0.q_ls + dm_u * dq0_lsv + dmv * dq0_ls_u + m[i] * d2q0_ls_uv;

                let dq_tt_u = dg2_u * q0.q_t * q0.q_t + g2[i] * (2.0 * q0.q_t * dq0_t_u);
                let dq_ttv = dg2v * q0.q_t * q0.q_t + g2[i] * (2.0 * q0.q_t * dq0_tv);
                let d2q_tt_uv = d2g2_uv * q0.q_t * q0.q_t
                    + dg2_u * (2.0 * q0.q_t * dq0_tv)
                    + dg2v * (2.0 * q0.q_t * dq0_t_u)
                    + g2[i] * (2.0 * dq0_t_u * dq0_tv + 2.0 * q0.q_t * d2q0_t_uv);

                let dq_tl_u = dg2_u * q0.q_t * q0.q_ls
                    + g2[i] * (dq0_t_u * q0.q_ls + q0.q_t * dq0_ls_u)
                    + dm_u * q0.q_tl
                    + m[i] * dq0_tl_u;
                let dq_tlv = dg2v * q0.q_t * q0.q_ls
                    + g2[i] * (dq0_tv * q0.q_ls + q0.q_t * dq0_lsv)
                    + dmv * q0.q_tl
                    + m[i] * dq0_tlv;
                let d2q_tl_uv = d2g2_uv * q0.q_t * q0.q_ls
                    + dg2_u * (dq0_tv * q0.q_ls + q0.q_t * dq0_lsv)
                    + dg2v * (dq0_t_u * q0.q_ls + q0.q_t * dq0_ls_u)
                    + g2[i]
                        * (d2q0_t_uv * q0.q_ls
                            + dq0_t_u * dq0_lsv
                            + dq0_tv * dq0_ls_u
                            + q0.q_t * d2q0_ls_uv)
                    + d2m_uv * q0.q_tl
                    + dm_u * dq0_tlv
                    + dmv * dq0_tl_u
                    + m[i] * d2q0_tl_uv;

                let dq_ll_u = dg2_u * q0.q_ls * q0.q_ls
                    + g2[i] * (2.0 * q0.q_ls * dq0_ls_u)
                    + dm_u * q0.q_ll
                    + m[i] * dq0_ll_u;
                let dq_llv = dg2v * q0.q_ls * q0.q_ls
                    + g2[i] * (2.0 * q0.q_ls * dq0_lsv)
                    + dmv * q0.q_ll
                    + m[i] * dq0_llv;
                let d2q_ll_uv = d2g2_uv * q0.q_ls * q0.q_ls
                    + dg2_u * (2.0 * q0.q_ls * dq0_lsv)
                    + dg2v * (2.0 * q0.q_ls * dq0_ls_u)
                    + g2[i] * (2.0 * dq0_ls_u * dq0_lsv + 2.0 * q0.q_ls * d2q0_ls_uv)
                    + d2m_uv * q0.q_ll
                    + dm_u * dq0_llv
                    + dmv * dq0_ll_u
                    + m[i] * d2q0_ll_uv;

                // Exact second directional coefficients for the scalar block weights.
                let coeff_tt = second_directionalhessian_coeff_fromobjective_q_terms(
                    m1, m2, m3, m4, dq_u, dqv, d2q_uv, q_t, q_t, q_tt, dq_t_u, dq_tv, dq_t_u,
                    dq_tv, d2q_t_uv, d2q_t_uv, dq_tt_u, dq_ttv, d2q_tt_uv,
                );
                let coeff_tl = second_directionalhessian_coeff_fromobjective_q_terms(
                    m1, m2, m3, m4, dq_u, dqv, d2q_uv, q_t, q_ls, q_tl, dq_t_u, dq_tv, dq_ls_u,
                    dq_lsv, d2q_t_uv, d2q_ls_uv, dq_tl_u, dq_tlv, d2q_tl_uv,
                );
                let coeff_ll = second_directionalhessian_coeff_fromobjective_q_terms(
                    m1, m2, m3, m4, dq_u, dqv, d2q_uv, q_ls, q_ls, q_ll, dq_ls_u, dq_lsv, dq_ls_u,
                    dq_lsv, d2q_ls_uv, d2q_ls_uv, dq_ll_u, dq_llv, d2q_ll_uv,
                );

                let xtr = x_t.row(i);
                let xlsr = x_ls.row(i);
                for a_idx in 0..pt {
                    for b_idx in a_idx..pt {
                        row_h[[a_idx, b_idx]] += coeff_tt * xtr[a_idx] * xtr[b_idx];
                    }
                }
                for a_idx in 0..pt {
                    for b_idx in 0..pls {
                        row_h[[a_idx, pt + b_idx]] += coeff_tl * xtr[a_idx] * xlsr[b_idx];
                    }
                }
                for a_idx in 0..pls {
                    for b_idx in a_idx..pls {
                        row_h[[pt + a_idx, pt + b_idx]] += coeff_ll * xlsr[a_idx] * xlsr[b_idx];
                    }
                }

                for j in 0..pw {
                    let qw = br[j];
                    let dqw_u = dr[j] * dq0_u;
                    let dqwv = dr[j] * dq0v;
                    let d2qw_uv = ddr[j] * dq0_u * dq0v + dr[j] * d2q0_uv;
                    let q_tw = dr[j] * q0.q_t;
                    let q_lw = dr[j] * q0.q_ls;
                    let dq_tw_u = ddr[j] * dq0_u * q0.q_t + dr[j] * dq0_t_u;
                    let dq_twv = ddr[j] * dq0v * q0.q_t + dr[j] * dq0_tv;
                    let d2q_tw_uv = d3r[j] * dq0_u * dq0v * q0.q_t
                        + ddr[j] * (d2q0_uv * q0.q_t + dq0_u * dq0_tv + dq0v * dq0_t_u)
                        + dr[j] * d2q0_t_uv;
                    let dq_lw_u = ddr[j] * dq0_u * q0.q_ls + dr[j] * dq0_ls_u;
                    let dq_lwv = ddr[j] * dq0v * q0.q_ls + dr[j] * dq0_lsv;
                    let d2q_lw_uv = d3r[j] * dq0_u * dq0v * q0.q_ls
                        + ddr[j] * (d2q0_uv * q0.q_ls + dq0_u * dq0_lsv + dq0v * dq0_ls_u)
                        + dr[j] * d2q0_ls_uv;

                    let coeff_tw = second_directionalhessian_coeff_fromobjective_q_terms(
                        m1, m2, m3, m4, dq_u, dqv, d2q_uv, q_t, qw, q_tw, dq_t_u, dq_tv, dqw_u,
                        dqwv, d2q_t_uv, d2qw_uv, dq_tw_u, dq_twv, d2q_tw_uv,
                    );
                    let coeff_lw = second_directionalhessian_coeff_fromobjective_q_terms(
                        m1, m2, m3, m4, dq_u, dqv, d2q_uv, q_ls, qw, q_lw, dq_ls_u, dq_lsv, dqw_u,
                        dqwv, d2q_ls_uv, d2qw_uv, dq_lw_u, dq_lwv, d2q_lw_uv,
                    );

                    for a_idx in 0..pt {
                        row_h[[a_idx, pt + pls + j]] += coeff_tw * xtr[a_idx];
                    }
                    for a_idx in 0..pls {
                        row_h[[pt + a_idx, pt + pls + j]] += coeff_lw * xlsr[a_idx];
                    }
                }

                for j in 0..pw {
                    let qwj = br[j];
                    let dqwj_u = dr[j] * dq0_u;
                    let dqwjv = dr[j] * dq0v;
                    let d2qwj_uv = ddr[j] * dq0_u * dq0v + dr[j] * d2q0_uv;
                    for k in j..pw {
                        let qwk = br[k];
                        let dqwk_u = dr[k] * dq0_u;
                        let dqwkv = dr[k] * dq0v;
                        let d2qwk_uv = ddr[k] * dq0_u * dq0v + dr[k] * d2q0_uv;
                        let coeffww = second_directionalhessian_coeff_fromobjective_q_terms(
                            m1, m2, m3, m4, dq_u, dqv, d2q_uv, qwj, qwk, 0.0, dqwj_u, dqwjv,
                            dqwk_u, dqwkv, d2qwj_uv, d2qwk_uv, 0.0, 0.0, 0.0,
                        );
                        row_h[[pt + pls + j, pt + pls + k]] += coeffww;
                    }
                }

                Ok(row_h)
            })
            .try_reduce(
                || Array2::<f64>::zeros((total, total)),
                |mut acc, row_h| {
                    acc += &row_h;
                    Ok(acc)
                },
            )?;

        mirror_upper_to_lower(&mut d2_h);
        Ok(Some(d2_h))
    }

    /// Joint exact-Newton Hessian evaluated against the designs carried by `specs`.
    ///
    /// Asks `shadow_with_exact_joint_designs` for a shadow family built from
    /// `specs`. If none is produced the result is `Ok(None)`; otherwise the
    /// call is forwarded to the shadow's `exact_newton_joint_hessian`.
    ///
    /// # Errors
    /// Propagates any error from building the shadow or from the forwarded call.
    fn exact_newton_joint_hessian_with_specs(
        &self,
        block_states: &[ParameterBlockState],
        specs: &[ParameterBlockSpec],
    ) -> Result<Option<Array2<f64>>, String> {
        match self.shadow_with_exact_joint_designs(specs)? {
            Some(shadow) => shadow.exact_newton_joint_hessian(block_states),
            None => Ok(None),
        }
    }

    /// Directional derivative of the joint exact-Newton Hessian along
    /// `d_beta_flat`, evaluated against the designs carried by `specs`.
    ///
    /// Yields `Ok(None)` when `shadow_with_exact_joint_designs` produces no
    /// shadow family; otherwise forwards to the shadow's
    /// `exact_newton_joint_hessian_directional_derivative`.
    ///
    /// # Errors
    /// Propagates any error from building the shadow or from the forwarded call.
    fn exact_newton_joint_hessian_directional_derivative_with_specs(
        &self,
        block_states: &[ParameterBlockState],
        specs: &[ParameterBlockSpec],
        d_beta_flat: &Array1<f64>,
    ) -> Result<Option<Array2<f64>>, String> {
        match self.shadow_with_exact_joint_designs(specs)? {
            Some(shadow) => {
                shadow.exact_newton_joint_hessian_directional_derivative(block_states, d_beta_flat)
            }
            None => Ok(None),
        }
    }

    /// Second directional derivative of the joint exact-Newton Hessian along
    /// the pair (`d_beta_u_flat`, `d_betav_flat`), evaluated against the
    /// designs carried by `specs`.
    ///
    /// Yields `Ok(None)` when `shadow_with_exact_joint_designs` produces no
    /// shadow family; otherwise forwards to the shadow's
    /// `exact_newton_joint_hessiansecond_directional_derivative`.
    ///
    /// # Errors
    /// Propagates any error from building the shadow or from the forwarded call.
    fn exact_newton_joint_hessian_second_directional_derivative_with_specs(
        &self,
        block_states: &[ParameterBlockState],
        specs: &[ParameterBlockSpec],
        d_beta_u_flat: &Array1<f64>,
        d_betav_flat: &Array1<f64>,
    ) -> Result<Option<Array2<f64>>, String> {
        match self.shadow_with_exact_joint_designs(specs)? {
            Some(shadow) => shadow.exact_newton_joint_hessiansecond_directional_derivative(
                block_states,
                d_beta_u_flat,
                d_betav_flat,
            ),
            None => Ok(None),
        }
    }

    /// First-order joint psi terms for the hyperparameter at `psi_index`.
    ///
    /// Pure delegation: every argument is passed unchanged to
    /// `exact_newton_joint_psi_terms_for_specs`, and its result (including
    /// `Ok(None)` and any error) is returned as-is.
    fn exact_newton_joint_psi_terms(
        &self,
        block_states: &[ParameterBlockState],
        specs: &[ParameterBlockSpec],
        derivative_blocks: &[Vec<crate::custom_family::CustomFamilyBlockPsiDerivative>],
        psi_index: usize,
    ) -> Result<Option<crate::custom_family::ExactNewtonJointPsiTerms>, String> {
        // These three joint psi hooks are the wiggle family's exact
        // likelihood-side contribution to the unified full [rho, psi] outer
        // Hessian:
        //
        //   exact_newton_joint_psi_terms(...)
        //       -> D_a, D_{beta a}, D_{beta beta a}
        //   exact_newton_joint_psisecond_order_terms(...)
        //       -> D_ab, D_{beta ab}, D_{beta beta ab}
        //   exact_newton_joint_psihessian_directional_derivative(...)
        //       -> T_a[u]
        //
        // Generic exact-joint code in custom_family.rs adds all realized
        // penalty motion S_a / S_ab and combines these likelihood-only objects
        // with the joint mode solves beta_i, beta_ij and the total Hessian
        // drifts dot H_i, ddot H_ij. Keeping this contract explicit is what
        // makes the wiggle family's full [rho, psi] Hessian real rather than a
        // gradient-only or block-local surrogate.
        self.exact_newton_joint_psi_terms_for_specs(
            block_states,
            specs,
            derivative_blocks,
            psi_index,
        )
    }

    /// Second-order joint psi terms for the hyperparameter pair
    /// (`psi_i`, `psi_j`).
    ///
    /// Pure delegation: every argument is passed unchanged to
    /// `exact_newton_joint_psisecond_order_terms_for_specs`, and its result
    /// (including `Ok(None)` and any error) is returned as-is.
    fn exact_newton_joint_psisecond_order_terms(
        &self,
        block_states: &[ParameterBlockState],
        specs: &[ParameterBlockSpec],
        derivative_blocks: &[Vec<crate::custom_family::CustomFamilyBlockPsiDerivative>],
        psi_i: usize,
        psi_j: usize,
    ) -> Result<Option<crate::custom_family::ExactNewtonJointPsiSecondOrderTerms>, String> {
        self.exact_newton_joint_psisecond_order_terms_for_specs(
            block_states,
            specs,
            derivative_blocks,
            psi_i,
            psi_j,
        )
    }

    /// Directional derivative, along the coefficient direction `d_beta_flat`,
    /// of the psi-Hessian term for the hyperparameter at `psi_index`.
    ///
    /// Pure delegation: every argument is passed unchanged to
    /// `exact_newton_joint_psihessian_directional_derivative_for_specs`, and
    /// its result (including `Ok(None)` and any error) is returned as-is.
    fn exact_newton_joint_psihessian_directional_derivative(
        &self,
        block_states: &[ParameterBlockState],
        specs: &[ParameterBlockSpec],
        derivative_blocks: &[Vec<crate::custom_family::CustomFamilyBlockPsiDerivative>],
        psi_index: usize,
        d_beta_flat: &Array1<f64>,
    ) -> Result<Option<Array2<f64>>, String> {
        self.exact_newton_joint_psihessian_directional_derivative_for_specs(
            block_states,
            specs,
            derivative_blocks,
            psi_index,
            d_beta_flat,
        )
    }

    /// Build the shared joint psi workspace for this family.
    ///
    /// Returns `Ok(None)` when `exact_joint_supported()` is false. Otherwise a
    /// `BinomialLocationScaleWiggleExactNewtonJointPsiWorkspace` is constructed
    /// from a clone of the family, owned copies of `block_states` and
    /// `derivative_blocks`, and the borrowed `specs`, then handed back behind
    /// an `Arc<dyn ExactNewtonJointPsiWorkspace>`.
    ///
    /// # Errors
    /// Propagates any error from the workspace constructor.
    fn exact_newton_joint_psi_workspace(
        &self,
        block_states: &[ParameterBlockState],
        specs: &[ParameterBlockSpec],
        derivative_blocks: &[Vec<crate::custom_family::CustomFamilyBlockPsiDerivative>],
    ) -> Result<Option<Arc<dyn ExactNewtonJointPsiWorkspace>>, String> {
        if !self.exact_joint_supported() {
            return Ok(None);
        }
        let psi_workspace = BinomialLocationScaleWiggleExactNewtonJointPsiWorkspace::new(
            self.clone(),
            block_states.to_vec(),
            specs,
            derivative_blocks.to_vec(),
        )?;
        let shared: Arc<dyn ExactNewtonJointPsiWorkspace> = Arc::new(psi_workspace);
        Ok(Some(shared))
    }

    /// Resolve the design matrix and offset to use for `spec` at the current
    /// block states.
    ///
    /// Any block whose name is not `"wiggle"` simply gets clones of its stored
    /// design and offset. For the wiggle block the design is rebuilt from the
    /// current predictors: for each row, `sigma` is obtained from the
    /// log-sigma predictor via `exp_sigma_from_eta_scalar`, `q0` is
    /// `binomial_location_scale_q0(eta_t, sigma)`, and the design is
    /// `wiggle_design` evaluated at those `q0` values. The wiggle offset is an
    /// all-zero vector with one entry per design row.
    ///
    /// # Errors
    /// * `UnsupportedConfiguration` when fewer than two block states are given.
    /// * `DimensionMismatch` when either predictor's length differs from
    ///   `self.y`, or when the rebuilt design's column count differs from
    ///   `spec.design`.
    /// * Any error from `wiggle_design`.
    fn block_geometry(
        &self,
        block_states: &[ParameterBlockState],
        spec: &crate::custom_family::ParameterBlockSpec,
    ) -> Result<(DesignMatrix, Array1<f64>), String> {
        // Static blocks: hand back the stored geometry untouched.
        if spec.name != "wiggle" {
            return Ok((spec.design.clone(), spec.offset.clone()));
        }
        if block_states.len() < 2 {
            return Err(GamlssError::UnsupportedConfiguration {
                reason: "wiggle geometry requires threshold and log-sigma blocks".to_string(),
            }
            .into());
        }

        let n_obs = self.y.len();
        let threshold_eta = &block_states[Self::BLOCK_T].eta;
        let log_sigma_eta = &block_states[Self::BLOCK_LOG_SIGMA].eta;
        if !(threshold_eta.len() == n_obs && log_sigma_eta.len() == n_obs) {
            return Err(GamlssError::DimensionMismatch {
                reason: "wiggle geometry input size mismatch".to_string(),
            }
            .into());
        }

        // Row-wise q0 from the threshold / log-sigma predictors. Both inputs
        // were just checked to have the same length, so the zip covers every
        // row exactly once, in index order.
        let q0: Array1<f64> = threshold_eta
            .iter()
            .zip(log_sigma_eta.iter())
            .map(|(&eta_t_i, &eta_ls_i)| {
                let sigma_i = exp_sigma_from_eta_scalar(eta_ls_i);
                binomial_location_scale_q0(eta_t_i, sigma_i)
            })
            .collect();

        let wiggle_basis = self.wiggle_design(q0.view())?;
        let got_cols = wiggle_basis.ncols();
        let expected_cols = spec.design.ncols();
        if got_cols != expected_cols {
            return Err(GamlssError::DimensionMismatch {
                reason: format!(
                    "dynamic wiggle design col mismatch: got {}, expected {}",
                    got_cols, expected_cols
                ),
            }
            .into());
        }

        // Capture the row count before the basis is moved into the design.
        let n_rows = wiggle_basis.nrows();
        let design = DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(wiggle_basis));
        Ok((design, Array1::zeros(n_rows)))
    }

    /// Always `true` for this family.
    ///
    /// `block_geometry` rebuilds the wiggle design from the current threshold
    /// and log-sigma predictors, so the geometry is not fixed across
    /// iterations. NOTE(review): presumably callers use this flag to decide
    /// whether `block_geometry` must be re-queried as block states change —
    /// confirm against the trait's contract.
    fn block_geometry_is_dynamic(&self) -> bool {
        true
    }

    /// Build the joint-Hessian workspace for the current block states.
    ///
    /// Returns `Ok(None)` when `exact_joint_dense_block_designs` yields no
    /// (threshold, log-sigma) design pair for `specs`. Otherwise a
    /// `BinomialLocationScaleWiggleHessianWorkspace` is constructed from a
    /// clone of the family, an owned copy of `block_states`, and owned copies
    /// of the two designs, and returned behind an
    /// `Arc<dyn ExactNewtonJointHessianWorkspace>`.
    ///
    /// # Errors
    /// Propagates any error from resolving the designs or from the workspace
    /// constructor.
    fn exact_newton_joint_hessian_workspace(
        &self,
        block_states: &[ParameterBlockState],
        specs: &[ParameterBlockSpec],
    ) -> Result<Option<Arc<dyn ExactNewtonJointHessianWorkspace>>, String> {
        match self.exact_joint_dense_block_designs(Some(specs))? {
            None => Ok(None),
            Some((x_t, x_ls)) => {
                let shared: Arc<dyn ExactNewtonJointHessianWorkspace> =
                    Arc::new(BinomialLocationScaleWiggleHessianWorkspace::new(
                        self.clone(),
                        block_states.to_vec(),
                        x_t.into_owned(),
                        x_ls.into_owned(),
                    )?);
                Ok(Some(shared))
            }
        }
    }

    /// Joint-Hessian workspace that honours an optional outer row subsample.
    ///
    /// With `options.outer_score_subsample` set to `None`, the workspace is
    /// built exactly as in `exact_newton_joint_hessian_workspace`. With
    /// `Some`, the freshly built workspace additionally has
    /// `apply_outer_subsample` called with the subsample's rows. That step
    /// replaces the precomputed per-row coefficient arrays in `pieces`
    /// (`coeff_tt`, `coeff_tl`, `coeff_ll`, `coeff_tw_b`, `coeff_tw_d`,
    /// `coeff_lw_b`, `coeff_lw_d`, `coeffww`) by a Horvitz–Thompson mask:
    /// every sampled row's coefficient is scaled by `WeightedOuterRow.weight`
    /// (the inverse-inclusion factor 1/π_i; uniform and stratified sampling
    /// are both supported) and every non-sampled row is zeroed. Because each
    /// downstream assembly (`hessian_dense`, `hessian_matvec`,
    /// `hessian_diagonal`) consumes those arrays row-linearly through
    /// `Xᵀ diag(W) Y`, the resulting joint Hessian is an unbiased estimator of
    /// the full-data joint Hessian.
    ///
    /// Inner PIRLS never installs the option, so the inner solve keeps using
    /// the exact full-data Hessian.
    ///
    /// Returns `Ok(None)` when `exact_joint_dense_block_designs` yields no
    /// design pair for `specs`.
    ///
    /// # Errors
    /// Propagates any error from resolving the designs or from the workspace
    /// constructor.
    fn exact_newton_joint_hessian_workspace_with_options(
        &self,
        block_states: &[ParameterBlockState],
        specs: &[ParameterBlockSpec],
        options: &BlockwiseFitOptions,
    ) -> Result<Option<Arc<dyn ExactNewtonJointHessianWorkspace>>, String> {
        match self.exact_joint_dense_block_designs(Some(specs))? {
            None => Ok(None),
            Some((x_t, x_ls)) => {
                let mut workspace = BinomialLocationScaleWiggleHessianWorkspace::new(
                    self.clone(),
                    block_states.to_vec(),
                    x_t.into_owned(),
                    x_ls.into_owned(),
                )?;
                // Only the outer loop installs a subsample; mask the per-row
                // coefficients before the workspace is shared.
                if let Some(subsample) = &options.outer_score_subsample {
                    workspace.apply_outer_subsample(subsample.rows.as_ref());
                }
                let shared: Arc<dyn ExactNewtonJointHessianWorkspace> = Arc::new(workspace);
                Ok(Some(shared))
            }
        }
    }

    /// Outer-derivative policy: declare HT-subsample capability.
    ///
    /// Always returns `true`.
    ///
    /// BinomialLocationScaleWiggleFamily overrides
    /// `log_likelihood_only_with_options` and
    /// `exact_newton_joint_hessian_workspace_with_options` to consume
    /// `options.outer_score_subsample` with per-row Horvitz–Thompson weights
    /// (each sampled row's contribution is multiplied by
    /// `WeightedOuterRow.weight = 1/π_i`; non-sampled rows are zeroed),
    /// yielding unbiased estimators of the full-data log-likelihood and
    /// joint Hessian.
    ///
    /// The ψ-workspace path is not yet subsample-aware: it builds the exact
    /// full-data ψ Hessian blocks, which are trivially unbiased. The
    /// outer-score components are therefore a sum of HT-unbiased and
    /// exact-unbiased pieces, and the total remains an unbiased estimator of
    /// the full-data outer score.
    ///
    /// Inner-PIRLS and final-covariance paths never install the option, so
    /// they continue to consume the exact full-data quantities.
    fn outer_derivative_subsample_capable(&self) -> bool {
        true
    }

    /// Report whether the matrix-free coefficient-space Hessian path can be
    /// used for `specs`.
    ///
    /// True only when `exact_joint_supported()` holds and
    /// `exact_joint_dense_block_designs` successfully returns a design pair.
    /// An error from the design lookup is treated as "not available" rather
    /// than being surfaced.
    fn inner_coefficient_hessian_hvp_available(&self, specs: &[ParameterBlockSpec]) -> bool {
        if !self.exact_joint_supported() {
            return false;
        }
        // Same gating as the workspace impl: the matrix-free path needs both
        // the threshold and log-σ block designs (the wiggle block is folded
        // into the per-row pieces inside
        // `BinomialLocationScaleWiggleHessianWorkspace`). This advertises
        // β-space representation support only.
        let designs = self.exact_joint_dense_block_designs(Some(specs));
        matches!(designs, Ok(Some(_)))
    }
}

impl BinomialLocationScaleWiggleFamily {
    /// Assemble the matrix-free `RowCoeffOperator` representing the BLS
    /// wiggle joint directional derivative `D_β H_L[u]`.
    ///
    /// The operator acts on the joint coefficient vector `(β_t, β_ls, β_w)`
    /// and is described by five channels, in this order: `X_t`, `X_ls`,
    /// `B` (wiggle basis), `B'` (first basis derivative) and `B''` (second
    /// basis derivative), all evaluated at the current `q0`. The per-row pair
    /// coefficients come from `binomial_wiggle_dh_row_coeffs`.
    ///
    /// `d_beta_flat` is the direction `u`, laid out as threshold, log-sigma
    /// and wiggle coefficients back to back.
    ///
    /// On success the result is always `Ok(Some(..))`.
    ///
    /// # Errors
    /// * `DimensionMismatch` when `block_states` does not hold exactly three
    ///   blocks, when any predictor or the weight vector differs in length
    ///   from `self.y`, or when the basis derivative column counts differ
    ///   from the wiggle coefficient count.
    /// * `InvalidInput` when `d_beta_flat` does not have the joint length.
    /// * Any error from the likelihood core, the basis evaluations, or the
    ///   layout split.
    fn bls_wiggle_directional_operator(
        &self,
        block_states: &[ParameterBlockState],
        x_t_arc: Arc<Array2<f64>>,
        x_ls_arc: Arc<Array2<f64>>,
        d_beta_flat: &Array1<f64>,
    ) -> Result<Option<Arc<dyn crate::solver::estimate::reml::unified::HyperOperator>>, String>
    {
        let block_count = block_states.len();
        if block_count != 3 {
            return Err(GamlssError::DimensionMismatch {
                reason: format!(
                    "BinomialLocationScaleWiggleFamily expects 3 blocks, got {}",
                    block_count
                ),
            }
            .into());
        }

        let n_rows = self.y.len();
        let wiggle_state = &block_states[Self::BLOCK_WIGGLE];
        let threshold_eta = &block_states[Self::BLOCK_T].eta;
        let log_sigma_eta = &block_states[Self::BLOCK_LOG_SIGMA].eta;
        let wiggle_eta = &wiggle_state.eta;
        let row_lengths = [
            threshold_eta.len(),
            log_sigma_eta.len(),
            wiggle_eta.len(),
            self.weights.len(),
        ];
        if !row_lengths.iter().all(|&len| len == n_rows) {
            return Err(GamlssError::DimensionMismatch {
                reason: "BinomialLocationScaleWiggleFamily input size mismatch".to_string(),
            }
            .into());
        }

        let threshold_cols = x_t_arc.ncols();
        let log_sigma_cols = x_ls_arc.ncols();
        let wiggle_beta = wiggle_state.beta.clone();
        let core = binomial_location_scale_core(
            &self.y,
            &self.weights,
            threshold_eta,
            log_sigma_eta,
            Some(wiggle_eta),
            &self.link_kind,
        )?;

        // Wiggle basis at the current q0 fixes the wiggle block width, and
        // with it the joint coefficient layout.
        let basis_values = self.wiggle_design(core.q0.view())?;
        let wiggle_cols = basis_values.ncols();
        let layout = GamlssBetaLayout::withwiggle(threshold_cols, log_sigma_cols, wiggle_cols);
        let joint_len = layout.total();
        if d_beta_flat.len() != joint_len {
            return Err(GamlssError::InvalidInput {
                reason: format!(
                    "BLS wiggle dH operator: d_beta length {} != {}",
                    d_beta_flat.len(),
                    joint_len
                ),
            }
            .into());
        }

        // Split the direction by block and push the two linear blocks
        // through their designs to get the predictor perturbations.
        let (dir_t, dir_ls, dir_w) =
            layout.split_three(d_beta_flat, "wiggle joint dH operator d_beta")?;
        let d_eta_t = fast_av(x_t_arc.as_ref(), &dir_t);
        let d_eta_ls = fast_av(x_ls_arc.as_ref(), &dir_ls);

        let basis_first =
            self.wiggle_basiswith_options(core.q0.view(), BasisOptions::first_derivative())?;
        let basis_second =
            self.wiggle_basiswith_options(core.q0.view(), BasisOptions::second_derivative())?;
        let g3 = self.wiggle_d3q_dq03(core.q0.view(), wiggle_beta.view())?;
        let beta_len = wiggle_beta.len();
        if basis_first.ncols() != beta_len || basis_second.ncols() != beta_len {
            return Err(GamlssError::DimensionMismatch {
                reason: format!(
                    "wiggle derivative/beta mismatch in dH operator: B'={} B''={} betaw={}",
                    basis_first.ncols(),
                    basis_second.ncols(),
                    beta_len
                ),
            }
            .into());
        }

        // m = 1 + B'(q0)·β_w and g2 = B''(q0)·β_w; g3 (above) is the third
        // q0-derivative supplied by `wiggle_d3q_dq03`.
        let m = basis_first.dot(&wiggle_beta) + 1.0;
        let g2 = basis_second.dot(&wiggle_beta);
        let (sigma, ..) = exp_sigma_derivs_up_to_third(log_sigma_eta.view());

        let rows = self.binomial_wiggle_dh_row_coeffs(
            n_rows,
            &BinomialWiggleDhRowInputs {
                core0: &core,
                eta_t: threshold_eta,
                etaw: wiggle_eta,
                sigma: &sigma,
                m: &m,
                g2: &g2,
                g3: &g3,
                b0: &basis_values,
                d0: &basis_first,
                dd0: &basis_second,
                uw: &dir_w,
                d_eta_t: &d_eta_t,
                d_eta_ls: &d_eta_ls,
            },
        );

        // The basis matrices are no longer borrowed; move them behind `Arc`s
        // so the operator can own them as channels 2, 3 and 4.
        let channel_b: Arc<Array2<f64>> = Arc::new(basis_values);
        let channel_b1: Arc<Array2<f64>> = Arc::new(basis_first);
        let channel_b2: Arc<Array2<f64>> = Arc::new(basis_second);

        Ok(Some(Arc::new(RowCoeffOperator::from_directions(
            vec![threshold_cols, log_sigma_cols, wiggle_cols],
            vec![
                (0, x_t_arc),
                (1, x_ls_arc),
                (2, channel_b),
                (2, channel_b1),
                (2, channel_b2),
            ],
            vec![
                // (X_t, X_t): dense analogue `xt_diag_x_dense(&x_t, &coeff_tt)`.
                (0, 0, rows.coeff_tt),
                // (X_t, X_ls): dense analogue
                // `xt_diag_y_dense(&x_t, &coeff_tl, &x_ls)`.
                (0, 1, rows.coeff_tl),
                // (X_ls, X_ls): dense analogue `xt_diag_x_dense(&x_ls, &coeff_ll)`.
                (1, 1, rows.coeff_ll),
                // (X_t, B / B' / B''): the three sub-blocks of d_h_tw, i.e.
                // `xt_diag_y_dense(x_t, coeff_tw_b, b0)
                //  + xt_diag_y_dense(x_t, coeff_tw_d, d0)
                //  + xt_diag_y_dense(x_t, coeff_tw_dd, dd0)`.
                (0, 2, rows.coeff_tw_b),
                (0, 3, rows.coeff_tw_d),
                (0, 4, rows.coeff_tw_dd),
                // (X_ls, B / B' / B''): the analogous d_h_lw triple.
                (1, 2, rows.coeff_lw_b),
                (1, 3, rows.coeff_lw_d),
                (1, 4, rows.coeff_lw_dd),
                // (B, B): dense analogue `xt_diag_x_dense(&b0, &coeffww_bb)`.
                (2, 2, rows.coeffww_bb),
                // (B, B'): symmetric pair d0^T diag(c) b0 + b0^T diag(c) d0,
                // dense analogue `xt_diag_y_dense(&d0, &coeffww_db, &b0)
                //  + xt_diag_y_dense(&b0, &coeffww_db, &d0)`.
                (2, 3, rows.coeffww_db),
            ],
            n_rows,
        ))))
    }

    /// Build a matrix-free `RowCoeffOperator` for the BLS Wiggle joint
    /// second directional derivative `D²_β H_L[u, v]`. Channels: X_t,
    /// X_ls, B, B', B'', B'''.
    ///
    /// The dense path computes a per-row scalar `coeff_*(i, j[, k])` via
    /// `second_directionalhessian_coeff_fromobjective_q_terms` and outer-
    /// products it into the (t,t) / (t,ls) / (ls,ls) / (t,w) / (ls,w) /
    /// (w,w) blocks. Each `coeff_tw(i, j)` is *linear* in the basis
    /// derivatives at column j (`br[j], dr[j], ddr[j], d3r[j]` — they
    /// only ever appear once in the q-Hessian directional polynomial),
    /// so each per-(i,j) contribution decomposes into 4 channel-pair
    /// row coefficients (X_t, B/B'/B''/B'''). The wiggle-wiggle term
    /// `coeff_ww(i, j, k)` is *bilinear* in (br[j], dr[j], ddr[j]) ⊗
    /// (br[k], dr[k], ddr[k]), giving 4 symmetric pair coefficients on
    /// (B, B), (B, B'), (B, B''), (B', B'). No (B'', B'') term — the
    /// formula is at most degree 2 in any single basis derivative.
    fn bls_wiggle_second_directional_operator(
        &self,
        block_states: &[ParameterBlockState],
        x_t_arc: Arc<Array2<f64>>,
        x_ls_arc: Arc<Array2<f64>>,
        d_beta_u: &Array1<f64>,
        d_beta_v: &Array1<f64>,
    ) -> Result<Option<Arc<dyn crate::solver::estimate::reml::unified::HyperOperator>>, String>
    {
        if block_states.len() != 3 {
            return Err(GamlssError::DimensionMismatch {
                reason: format!(
                    "BinomialLocationScaleWiggleFamily expects 3 blocks, got {}",
                    block_states.len()
                ),
            }
            .into());
        }
        let n = self.y.len();
        let eta_t = &block_states[Self::BLOCK_T].eta;
        let eta_ls = &block_states[Self::BLOCK_LOG_SIGMA].eta;
        let etaw = &block_states[Self::BLOCK_WIGGLE].eta;
        if eta_t.len() != n || eta_ls.len() != n || etaw.len() != n || self.weights.len() != n {
            return Err(GamlssError::DimensionMismatch {
                reason: "BinomialLocationScaleWiggleFamily input size mismatch".to_string(),
            }
            .into());
        }
        let pt = x_t_arc.ncols();
        let pls = x_ls_arc.ncols();
        let betaw0 = block_states[Self::BLOCK_WIGGLE].beta.clone();
        let core0 = binomial_location_scale_core(
            &self.y,
            &self.weights,
            eta_t,
            eta_ls,
            Some(etaw),
            &self.link_kind,
        )?;
        let b0 = self.wiggle_design(core0.q0.view())?;
        let d0 =
            self.wiggle_basiswith_options(core0.q0.view(), BasisOptions::first_derivative())?;
        let dd0 =
            self.wiggle_basiswith_options(core0.q0.view(), BasisOptions::second_derivative())?;
        let d3_basis = self.wiggle_d3basis_constrained(core0.q0.view())?;
        let d3q = self.wiggle_d3q_dq03(core0.q0.view(), betaw0.view())?;
        let d4q = self.wiggle_d4q_dq04(core0.q0.view(), betaw0.view())?;
        let pw = b0.ncols();
        let beta_layout = GamlssBetaLayout::withwiggle(pt, pls, pw);
        let total = beta_layout.total();
        if d_beta_u.len() != total || d_beta_v.len() != total {
            return Err(GamlssError::InvalidInput {
                reason: format!(
                    "BLS wiggle d2H operator: d_beta_{{u,v}} length {}/{} != {}",
                    d_beta_u.len(),
                    d_beta_v.len(),
                    total
                ),
            }
            .into());
        }
        if d0.ncols() != betaw0.len()
            || dd0.ncols() != betaw0.len()
            || d3_basis.ncols() != betaw0.len()
        {
            return Err(GamlssError::DimensionMismatch { reason: format!(
                "wiggle derivative/beta mismatch in d2H operator: B'={} B''={} B'''={} betaw={}",
                d0.ncols(),
                dd0.ncols(),
                d3_basis.ncols(),
                betaw0.len()
            ) }.into());
        }

        let (u_t, u_ls, uw) = beta_layout.split_three(d_beta_u, "wiggle d2H op u")?;
        let (v_t, v_ls, vw) = beta_layout.split_three(d_beta_v, "wiggle d2H op v")?;
        let d_eta_t_u = fast_av(x_t_arc.as_ref(), &u_t);
        let d_eta_ls_u = fast_av(x_ls_arc.as_ref(), &u_ls);
        let d_eta_t_v = fast_av(x_t_arc.as_ref(), &v_t);
        let d_eta_ls_v = fast_av(x_ls_arc.as_ref(), &v_ls);

        let m = d0.dot(&betaw0) + 1.0;
        let g2 = dd0.dot(&betaw0);
        let g3 = d3q;
        let g4 = d4q;
        let (sigma, ds, d2s, d3s, d4s) = exp_sigma_derivs_up_to_fourth_array(eta_ls.view());

        // Per-row scalar pair coefficients.
        let mut coeff_tt = Array1::<f64>::zeros(n);
        let mut coeff_tl = Array1::<f64>::zeros(n);
        let mut coeff_ll = Array1::<f64>::zeros(n);
        // Per-row coefficients for the t↔wiggle decomposition into
        // (X_t, B), (X_t, B'), (X_t, B''), (X_t, B''') pair entries.
        let mut alpha_tw_b = Array1::<f64>::zeros(n);
        let mut alpha_tw_d = Array1::<f64>::zeros(n);
        let mut alpha_tw_dd = Array1::<f64>::zeros(n);
        let mut alpha_tw_d3 = Array1::<f64>::zeros(n);
        let mut alpha_lw_b = Array1::<f64>::zeros(n);
        let mut alpha_lw_d = Array1::<f64>::zeros(n);
        let mut alpha_lw_dd = Array1::<f64>::zeros(n);
        let mut alpha_lw_d3 = Array1::<f64>::zeros(n);
        // Wiggle-wiggle bilinear pair entries on (B,B), (B,B'), (B,B''), (B',B').
        let mut c_ww_bb = Array1::<f64>::zeros(n);
        let mut c_ww_bd = Array1::<f64>::zeros(n);
        let mut c_ww_bdd = Array1::<f64>::zeros(n);
        let mut c_ww_dd_pair = Array1::<f64>::zeros(n);

        for i in 0..n {
            let q_i = core0.q0[i] + etaw[i];
            let (m1, m2, m3) = binomial_neglog_q_derivatives_dispatch(
                self.y[i],
                self.weights[i],
                q_i,
                core0.mu[i],
                core0.dmu_dq[i],
                core0.d2mu_dq2[i],
                core0.d3mu_dq3[i],
                &self.link_kind,
            );
            let m4 = binomial_neglog_q_fourth_derivative_dispatch(
                self.y[i],
                self.weights[i],
                q_i,
                core0.mu[i],
                core0.dmu_dq[i],
                core0.d2mu_dq2[i],
                core0.d3mu_dq3[i],
                &self.link_kind,
            )?;

            let q0_d = nonwiggle_q_derivs(eta_t[i], sigma[i]);
            let s_safe = sigma[i];
            let s2 = s_safe * s_safe;
            let s3 = s2 * s_safe;
            let s4 = s3 * s_safe;
            let s5 = s4 * s_safe;
            let q0_tl_ls_ls =
                d3s[i] / s2 - 6.0 * ds[i] * d2s[i] / s3 + 6.0 * ds[i] * ds[i] * ds[i] / s4;
            let q0_tl_ls_ls_ls =
                d4s[i] / s2 - 8.0 * ds[i] * d3s[i] / s3 - 6.0 * d2s[i] * d2s[i] / s3
                    + 36.0 * ds[i] * ds[i] * d2s[i] / s4
                    - 24.0 * ds[i] * ds[i] * ds[i] * ds[i] / s5;
            let q0_ll_ls_ls = eta_t[i] * q0_tl_ls_ls_ls;

            let u_t_i = d_eta_t_u[i];
            let u_ls_i = d_eta_ls_u[i];
            let v_t_i = d_eta_t_v[i];
            let v_ls_i = d_eta_ls_v[i];

            let dq0_u = q0_d.q_t * u_t_i + q0_d.q_ls * u_ls_i;
            let dq0v = q0_d.q_t * v_t_i + q0_d.q_ls * v_ls_i;
            let d2q0_uv =
                q0_d.q_tl * (u_t_i * v_ls_i + v_t_i * u_ls_i) + q0_d.q_ll * u_ls_i * v_ls_i;

            let dq0_t_u = q0_d.q_tl * u_ls_i;
            let dq0_tv = q0_d.q_tl * v_ls_i;
            let dq0_ls_u = q0_d.q_tl * u_t_i + q0_d.q_ll * u_ls_i;
            let dq0_lsv = q0_d.q_tl * v_t_i + q0_d.q_ll * v_ls_i;
            let dq0_tl_u = q0_d.q_tl_ls * u_ls_i;
            let dq0_tlv = q0_d.q_tl_ls * v_ls_i;
            let dq0_ll_u = q0_d.q_tl_ls * u_t_i + q0_d.q_ll_ls * u_ls_i;
            let dq0_llv = q0_d.q_tl_ls * v_t_i + q0_d.q_ll_ls * v_ls_i;

            let d2q0_t_uv = q0_d.q_tl_ls * u_ls_i * v_ls_i;
            let d2q0_ls_uv =
                q0_d.q_tl_ls * (u_ls_i * v_t_i + v_ls_i * u_t_i) + q0_d.q_ll_ls * u_ls_i * v_ls_i;
            let d2q0_tl_uv = q0_tl_ls_ls * u_ls_i * v_ls_i;
            let d2q0_ll_uv =
                q0_tl_ls_ls * (u_t_i * v_ls_i + v_t_i * u_ls_i) + q0_ll_ls_ls * u_ls_i * v_ls_i;

            let br = b0.row(i);
            let dr = d0.row(i);
            let ddr = dd0.row(i);
            let d3r = d3_basis.row(i);
            let b_u = br.dot(&uw);
            let bv = br.dot(&vw);
            let b1_u = dr.dot(&uw);
            let b1v = dr.dot(&vw);
            let b2_u = ddr.dot(&uw);
            let b2v = ddr.dot(&vw);
            let b3_u = d3r.dot(&uw);
            let b3v = d3r.dot(&vw);

            let dm_u = b1_u + g2[i] * dq0_u;
            let dmv = b1v + g2[i] * dq0v;
            let d2m_uv = g3[i] * dq0_u * dq0v + g2[i] * d2q0_uv + b2v * dq0_u + b2_u * dq0v;
            let dg2_u = b2_u + g3[i] * dq0_u;
            let dg2v = b2v + g3[i] * dq0v;
            let d2g2_uv = g4[i] * dq0_u * dq0v + g3[i] * d2q0_uv + b3v * dq0_u + b3_u * dq0v;

            let dq_u = m[i] * dq0_u + b_u;
            let dqv = m[i] * dq0v + bv;
            let d2q_uv = m[i] * d2q0_uv + g2[i] * dq0_u * dq0v + b1_u * dq0v + b1v * dq0_u;

            let q_t = m[i] * q0_d.q_t;
            let q_ls = m[i] * q0_d.q_ls;
            let q_tt = g2[i] * q0_d.q_t * q0_d.q_t;
            let q_tl = g2[i] * q0_d.q_t * q0_d.q_ls + m[i] * q0_d.q_tl;
            let q_ll = g2[i] * q0_d.q_ls * q0_d.q_ls + m[i] * q0_d.q_ll;

            let dq_t_u = dm_u * q0_d.q_t + m[i] * dq0_t_u;
            let dq_tv = dmv * q0_d.q_t + m[i] * dq0_tv;
            let dq_ls_u = dm_u * q0_d.q_ls + m[i] * dq0_ls_u;
            let dq_lsv = dmv * q0_d.q_ls + m[i] * dq0_lsv;

            let d2q_t_uv = d2m_uv * q0_d.q_t + dm_u * dq0_tv + dmv * dq0_t_u + m[i] * d2q0_t_uv;
            let d2q_ls_uv =
                d2m_uv * q0_d.q_ls + dm_u * dq0_lsv + dmv * dq0_ls_u + m[i] * d2q0_ls_uv;

            let dq_tt_u = dg2_u * q0_d.q_t * q0_d.q_t + g2[i] * (2.0 * q0_d.q_t * dq0_t_u);
            let dq_ttv = dg2v * q0_d.q_t * q0_d.q_t + g2[i] * (2.0 * q0_d.q_t * dq0_tv);
            let d2q_tt_uv = d2g2_uv * q0_d.q_t * q0_d.q_t
                + dg2_u * (2.0 * q0_d.q_t * dq0_tv)
                + dg2v * (2.0 * q0_d.q_t * dq0_t_u)
                + g2[i] * (2.0 * dq0_t_u * dq0_tv + 2.0 * q0_d.q_t * d2q0_t_uv);

            let dq_tl_u = dg2_u * q0_d.q_t * q0_d.q_ls
                + g2[i] * (dq0_t_u * q0_d.q_ls + q0_d.q_t * dq0_ls_u)
                + dm_u * q0_d.q_tl
                + m[i] * dq0_tl_u;
            let dq_tlv = dg2v * q0_d.q_t * q0_d.q_ls
                + g2[i] * (dq0_tv * q0_d.q_ls + q0_d.q_t * dq0_lsv)
                + dmv * q0_d.q_tl
                + m[i] * dq0_tlv;
            let d2q_tl_uv = d2g2_uv * q0_d.q_t * q0_d.q_ls
                + dg2_u * (dq0_tv * q0_d.q_ls + q0_d.q_t * dq0_lsv)
                + dg2v * (dq0_t_u * q0_d.q_ls + q0_d.q_t * dq0_ls_u)
                + g2[i]
                    * (d2q0_t_uv * q0_d.q_ls
                        + dq0_t_u * dq0_lsv
                        + dq0_tv * dq0_ls_u
                        + q0_d.q_t * d2q0_ls_uv)
                + d2m_uv * q0_d.q_tl
                + dm_u * dq0_tlv
                + dmv * dq0_tl_u
                + m[i] * d2q0_tl_uv;

            let dq_ll_u = dg2_u * q0_d.q_ls * q0_d.q_ls
                + g2[i] * (2.0 * q0_d.q_ls * dq0_ls_u)
                + dm_u * q0_d.q_ll
                + m[i] * dq0_ll_u;
            let dq_llv = dg2v * q0_d.q_ls * q0_d.q_ls
                + g2[i] * (2.0 * q0_d.q_ls * dq0_lsv)
                + dmv * q0_d.q_ll
                + m[i] * dq0_llv;
            let d2q_ll_uv = d2g2_uv * q0_d.q_ls * q0_d.q_ls
                + dg2_u * (2.0 * q0_d.q_ls * dq0_lsv)
                + dg2v * (2.0 * q0_d.q_ls * dq0_ls_u)
                + g2[i] * (2.0 * dq0_ls_u * dq0_lsv + 2.0 * q0_d.q_ls * d2q0_ls_uv)
                + d2m_uv * q0_d.q_ll
                + dm_u * dq0_llv
                + dmv * dq0_ll_u
                + m[i] * d2q0_ll_uv;

            // Scalar pair coefficients on (X_t, X_t), (X_t, X_ls), (X_ls, X_ls).
            coeff_tt[i] = second_directionalhessian_coeff_fromobjective_q_terms(
                m1, m2, m3, m4, dq_u, dqv, d2q_uv, q_t, q_t, q_tt, dq_t_u, dq_tv, dq_t_u, dq_tv,
                d2q_t_uv, d2q_t_uv, dq_tt_u, dq_ttv, d2q_tt_uv,
            );
            coeff_tl[i] = second_directionalhessian_coeff_fromobjective_q_terms(
                m1, m2, m3, m4, dq_u, dqv, d2q_uv, q_t, q_ls, q_tl, dq_t_u, dq_tv, dq_ls_u, dq_lsv,
                d2q_t_uv, d2q_ls_uv, dq_tl_u, dq_tlv, d2q_tl_uv,
            );
            coeff_ll[i] = second_directionalhessian_coeff_fromobjective_q_terms(
                m1, m2, m3, m4, dq_u, dqv, d2q_uv, q_ls, q_ls, q_ll, dq_ls_u, dq_lsv, dq_ls_u,
                dq_lsv, d2q_ls_uv, d2q_ls_uv, dq_ll_u, dq_llv, d2q_ll_uv,
            );

            // Cross block (X_a, B/B'/B''/B''') with X_a ∈ {X_t, X_ls}. Each
            // `coeff_xw(i, j)` is linear in (br[j], dr[j], ddr[j], d3r[j])
            // because each q-Hessian variable carrying `j` (q_xw, dq_xw_u,
            // dq_xwv, d2q_xw_uv, qw, dqw_u, dqwv, d2qw_uv) is linear in those
            // four. We expand `second_directionalhessian_coeff_fromobjective_q_terms`
            // by collecting like basis-derivative powers; the coefficients are
            // the four α_xw_{b,d,dd,d3} arrays.
            //
            // qw=br, dqw_u=dr·dq0u, dqwv=dr·dq0v, d2qw_uv=ddr·dq0u·dq0v + dr·d2q0_uv
            // q_xw=dr·q0_x, dq_xw_u=ddr·dq0u·q0_x + dr·dq0_x_u, dq_xwv=ddr·dq0v·q0_x + dr·dq0_xv
            // d2q_xw_uv=d3r·dq0u·dq0v·q0_x + ddr·(d2q0_uv·q0_x + dq0u·dq0_xv + dq0v·dq0_x_u) + dr·d2q0_x_uv
            //
            // d_qaqb_u = dq_x_u·qw + q_x·dqw_u  →  dq_x_u·br + q_x·dr·dq0u
            // d_qaqbv  = dq_xv·qw + q_x·dqwv    →  dq_xv·br + q_x·dr·dq0v
            // d2_qaqb_uv = d2q_x_uv·br + dq_x_u·dr·dq0v + dq_xv·dr·dq0u + q_x·d2qw_uv
            //
            // The full formula (expanded for "tw"; "lw" identical with x→ls):
            //
            //   m4·dq_u·dqv·q_x·br
            // + m3·(d2q_uv·q_x·br + dq_u·(dq_xv·br + q_x·dr·dq0v) + dqv·(dq_x_u·br + q_x·dr·dq0u) + dq_u·dqv·dr·q0_x)
            // + m2·(d2q_x_uv·br + dq_x_u·dr·dq0v + dq_xv·dr·dq0u + q_x·(ddr·dq0u·dq0v + dr·d2q0_uv)
            //       + d2q_uv·dr·q0_x + dq_u·(ddr·dq0v·q0_x + dr·dq0_xv) + dqv·(ddr·dq0u·q0_x + dr·dq0_x_u))
            // + m1·(d3r·dq0u·dq0v·q0_x + ddr·(d2q0_uv·q0_x + dq0u·dq0_xv + dq0v·dq0_x_u) + dr·d2q0_x_uv)
            //
            // Collecting like basis-derivative terms produces the closed-form
            // expressions below.

            // X_t ↔ wiggle channels.
            alpha_tw_b[i] = m4 * dq_u * dqv * q_t
                + m3 * (d2q_uv * q_t + dq_u * dq_tv + dqv * dq_t_u)
                + m2 * d2q_t_uv;
            alpha_tw_d[i] = m3 * (dq_u * q_t * dq0v + dqv * q_t * dq0_u + dq_u * dqv * q0_d.q_t)
                + m2 * (dq_t_u * dq0v
                    + dq_tv * dq0_u
                    + q_t * d2q0_uv
                    + d2q_uv * q0_d.q_t
                    + dq_u * dq0_tv
                    + dqv * dq0_t_u)
                + m1 * d2q0_t_uv;
            alpha_tw_dd[i] = m2
                * (q_t * dq0_u * dq0v + dq_u * dq0v * q0_d.q_t + dqv * dq0_u * q0_d.q_t)
                + m1 * (d2q0_uv * q0_d.q_t + dq0_u * dq0_tv + dq0v * dq0_t_u);
            alpha_tw_d3[i] = m1 * dq0_u * dq0v * q0_d.q_t;

            // X_ls ↔ wiggle channels (same formulas, swap t→ls).
            alpha_lw_b[i] = m4 * dq_u * dqv * q_ls
                + m3 * (d2q_uv * q_ls + dq_u * dq_lsv + dqv * dq_ls_u)
                + m2 * d2q_ls_uv;
            alpha_lw_d[i] = m3 * (dq_u * q_ls * dq0v + dqv * q_ls * dq0_u + dq_u * dqv * q0_d.q_ls)
                + m2 * (dq_ls_u * dq0v
                    + dq_lsv * dq0_u
                    + q_ls * d2q0_uv
                    + d2q_uv * q0_d.q_ls
                    + dq_u * dq0_lsv
                    + dqv * dq0_ls_u)
                + m1 * d2q0_ls_uv;
            alpha_lw_dd[i] = m2
                * (q_ls * dq0_u * dq0v + dq_u * dq0v * q0_d.q_ls + dqv * dq0_u * q0_d.q_ls)
                + m1 * (d2q0_uv * q0_d.q_ls + dq0_u * dq0_lsv + dq0v * dq0_ls_u);
            alpha_lw_d3[i] = m1 * dq0_u * dq0v * q0_d.q_ls;

            // Wiggle ↔ wiggle (bilinear in (br, dr, ddr) ⊗ (br, dr, ddr); no d3r).
            //
            // qa=brj, qb=brk, qab=0, dqa_u=drj·dq0u, dqav=drj·dq0v,
            // dqb_u=drk·dq0u, dqbv=drk·dq0v, d2qa_uv=ddrj·dq0u·dq0v+drj·d2q0_uv,
            // d2qb_uv=ddrk·dq0u·dq0v+drk·d2q0_uv, dqab_u=0, dqabv=0, d2qab_uv=0.
            //
            //   m4·dq_u·dqv·brj·brk
            // + m3·(d2q_uv·brj·brk + dq_u·(drj·dq0v·brk + brj·drk·dq0v)
            //                       + dqv·(drj·dq0u·brk + brj·drk·dq0u))
            // + m2·d2_qaqb_uv
            // where d2_qaqb_uv = (ddrj·dq0u·dq0v+drj·d2q0_uv)·brk
            //                  + drj·dq0u·drk·dq0v + drj·dq0v·drk·dq0u
            //                  + brj·(ddrk·dq0u·dq0v+drk·d2q0_uv).
            //
            // Pair (B, B): m4·dq_u·dqv + m3·d2q_uv  → coefficient of br[j]·br[k].
            // Pair (B, B'): m3·(dq_u·dq0v + dqv·dq0u) + m2·d2q0_uv → br·dr + dr·br.
            // Pair (B, B''): m2·dq0u·dq0v → br·ddr + ddr·br.
            // Pair (B', B'): 2·m2·dq0u·dq0v → dr·dr (the diagonal pair only
            //   accumulates once in `RowCoeffOperator`, so we double-count
            //   here to match the symmetric `dr[j]·dq0u·dr[k]·dq0v +
            //   dr[j]·dq0v·dr[k]·dq0u` cross product).
            c_ww_bb[i] = m4 * dq_u * dqv + m3 * d2q_uv;
            c_ww_bd[i] = m3 * (dq_u * dq0v + dqv * dq0_u) + m2 * d2q0_uv;
            c_ww_bdd[i] = m2 * dq0_u * dq0v;
            c_ww_dd_pair[i] = 2.0 * m2 * dq0_u * dq0v;
        }

        let basis: Arc<Array2<f64>> = Arc::new(b0);
        let basis_d1: Arc<Array2<f64>> = Arc::new(d0);
        let basis_d2: Arc<Array2<f64>> = Arc::new(dd0);
        let basis_d3: Arc<Array2<f64>> = Arc::new(d3_basis);

        Ok(Some(Arc::new(RowCoeffOperator::from_directions(
            vec![pt, pls, pw],
            vec![
                (0, x_t_arc),
                (1, x_ls_arc),
                (2, basis),
                (2, basis_d1),
                (2, basis_d2),
                (2, basis_d3),
            ],
            vec![
                // (X_t, X_t)   ← `d2_h[a, b] += coeff_tt · xtr[a] · xtr[b]`
                (0, 0, coeff_tt),
                // (X_t, X_ls)  ← `d2_h[a, pt+b] += coeff_tl · xtr[a] · xlsr[b]`
                (0, 1, coeff_tl),
                // (X_ls, X_ls) ← `d2_h[pt+a, pt+b] += coeff_ll · xlsr[a] · xlsr[b]`
                (1, 1, coeff_ll),
                // (X_t, B/B'/B''/B''') ← per-row α_tw_{b,d,dd,d3} decomposition of
                // `d2_h[a, pt+pls+j] += coeff_tw(i,j) · xtr[a]` (coeff_tw is
                // linear in br[j], dr[j], ddr[j], d3r[j])
                (0, 2, alpha_tw_b),
                (0, 3, alpha_tw_d),
                (0, 4, alpha_tw_dd),
                (0, 5, alpha_tw_d3),
                // (X_ls, B/B'/B''/B''') ← analogous α_lw_{b,d,dd,d3} decomposition
                // of `d2_h[pt+a, pt+pls+j] += coeff_lw(i,j) · xlsr[a]`
                (1, 2, alpha_lw_b),
                (1, 3, alpha_lw_d),
                (1, 4, alpha_lw_dd),
                (1, 5, alpha_lw_d3),
                // (B, B/B'/B'') ← bilinear decomposition of
                // `d2_h[pt+pls+j, pt+pls+k] += coeff_ww(i,j,k)` in
                // (br, dr, ddr) ⊗ (br, dr, ddr); no d3r entry — coeff_ww is
                // at most degree 2 in any single basis derivative.
                (2, 2, c_ww_bb),
                (2, 3, c_ww_bd),
                (2, 4, c_ww_bdd),
                // (B', B') diagonal — coefficient absorbs a factor of 2 to
                // match the symmetric `dr[j]·dq0u·dr[k]·dq0v + dr[j]·dq0v·
                // dr[k]·dq0u` cross product (the diagonal pair only
                // accumulates once in `RowCoeffOperator::mul_vec`).
                (3, 3, c_ww_dd_pair),
            ],
            n,
        ))))
    }
}

/// Matrix-free joint-Hessian operator for the 3-block binomial
/// location-scale wiggle family. See `BinomialLocationScaleWiggleHessianRowPieces`
/// for the per-row weight structure.
///
/// The flat coefficient vector handled by this workspace is laid out as
/// `[x_t columns | x_ls columns | wiggle-basis columns]`.
struct BinomialLocationScaleWiggleHessianWorkspace {
    /// Family that owns the likelihood; the directional-derivative methods of
    /// the workspace delegate to it.
    family: BinomialLocationScaleWiggleFamily,
    /// Block states forwarded to the family whenever a directional derivative
    /// is requested.
    block_states: Vec<ParameterBlockState>,
    /// Design matrix of the first block (leading `x_t.ncols()` coefficients).
    x_t: Arc<Array2<f64>>,
    /// Design matrix of the second block (next `x_ls.ncols()` coefficients).
    x_ls: Arc<Array2<f64>>,
    /// Row pieces produced by `wiggle_hessian_row_pieces`: the per-row
    /// coefficient arrays together with the `b0`/`d0` wiggle basis matrices.
    pieces: BinomialLocationScaleWiggleHessianRowPieces,
}

impl BinomialLocationScaleWiggleHessianWorkspace {
    fn new(
        family: BinomialLocationScaleWiggleFamily,
        block_states: Vec<ParameterBlockState>,
        x_t: Array2<f64>,
        x_ls: Array2<f64>,
    ) -> Result<Self, String> {
        let pieces = family.wiggle_hessian_row_pieces(&block_states)?;
        Ok(Self {
            family,
            block_states,
            x_t: Arc::new(x_t),
            x_ls: Arc::new(x_ls),
            pieces,
        })
    }

    /// Apply a Horvitz–Thompson outer-row subsample mask to the precomputed
    /// per-row coefficient arrays in place.
    ///
    /// Each sampled row's `coeff_*[i]` is multiplied by its
    /// `WeightedOuterRow.weight` (the HT inverse-inclusion factor 1/π_i —
    /// uniform or stratified sampling both supported). All non-sampled rows
    /// are zeroed. Because every downstream assembly (`hessian_dense`,
    /// `hessian_matvec`, `hessian_diagonal`) is row-linear in these arrays
    /// via `Xᵀ diag(W) Y`, the resulting joint-Hessian is an unbiased
    /// estimator of the full-data joint Hessian. The `b0`/`d0` basis matrices
    /// are independent of the per-row weights and remain unchanged.
    fn apply_outer_subsample(
        &mut self,
        rows: &[crate::families::marginal_slope_shared::WeightedOuterRow],
    ) {
        let n = self.pieces.coeff_tt.len();
        let mut mask_tt = Array1::<f64>::zeros(n);
        let mut mask_tl = Array1::<f64>::zeros(n);
        let mut mask_ll = Array1::<f64>::zeros(n);
        let mut mask_tw_b = Array1::<f64>::zeros(n);
        let mut mask_tw_d = Array1::<f64>::zeros(n);
        let mut mask_lw_b = Array1::<f64>::zeros(n);
        let mut mask_lw_d = Array1::<f64>::zeros(n);
        let mut maskww = Array1::<f64>::zeros(n);
        for r in rows {
            let i = r.index;
            let w = r.weight;
            mask_tt[i] = self.pieces.coeff_tt[i] * w;
            mask_tl[i] = self.pieces.coeff_tl[i] * w;
            mask_ll[i] = self.pieces.coeff_ll[i] * w;
            mask_tw_b[i] = self.pieces.coeff_tw_b[i] * w;
            mask_tw_d[i] = self.pieces.coeff_tw_d[i] * w;
            mask_lw_b[i] = self.pieces.coeff_lw_b[i] * w;
            mask_lw_d[i] = self.pieces.coeff_lw_d[i] * w;
            maskww[i] = self.pieces.coeffww[i] * w;
        }
        self.pieces.coeff_tt = mask_tt;
        self.pieces.coeff_tl = mask_tl;
        self.pieces.coeff_ll = mask_ll;
        self.pieces.coeff_tw_b = mask_tw_b;
        self.pieces.coeff_tw_d = mask_tw_d;
        self.pieces.coeff_lw_b = mask_lw_b;
        self.pieces.coeff_lw_d = mask_lw_d;
        self.pieces.coeffww = maskww;
    }
}

impl ExactNewtonJointHessianWorkspace for BinomialLocationScaleWiggleHessianWorkspace {
    /// Materialize the joint Hessian as a dense matrix from the cached row
    /// pieces.
    fn hessian_dense(&self) -> Result<Option<Array2<f64>>, String> {
        // Structurally the same operator as `hessian_matvec`, but built through
        // the existing `assemble_dense` row-pieces helper (eight GEMMs covering
        // h_tt, h_tl, h_ll, h_tw_b, h_tw_d, h_lw_b, h_lw_d, h_ww). This avoids
        // `total` canonical-basis HVPs in
        // `MatrixFreeSpdOperator::materialize_dense_operator`, which at biobank
        // scale (n≈320k, p_total≈82) costs ~568s per κ-iter versus ~1s for the
        // dense build.
        Ok(Some(
            self.pieces
                .assemble_dense(self.x_t.as_ref(), self.x_ls.as_ref())?,
        ))
    }

    /// The matrix-free product is always implemented for this workspace.
    fn hessian_matvec_available(&self) -> bool {
        true
    }

    /// Compute `H · v` without forming `H`.
    ///
    /// `v` is split into its `x_t`, `x_ls` and wiggle segments, pushed through
    /// the designs and the `b0`/`d0` bases, combined row-wise with the cached
    /// coefficients, and pulled back with the transposed designs.
    ///
    /// # Errors
    /// Returns a dimension-mismatch error when `v.len()` differs from the
    /// total number of coefficients.
    fn hessian_matvec(&self, v: &Array1<f64>) -> Result<Option<Array1<f64>>, String> {
        let pieces = &self.pieces;
        let pt = self.x_t.ncols();
        let pls = self.x_ls.ncols();
        let pw = pieces.b0.ncols();
        let ls_end = pt + pls;
        let total = ls_end + pw;
        if v.len() != total {
            return Err(GamlssError::DimensionMismatch {
                reason: format!(
                    "BinomialLocationScaleWiggle matvec dimension mismatch: got {}, expected {}",
                    v.len(),
                    total
                ),
            }
            .into());
        }

        // Per-block segments of the input vector.
        let seg_t = v.slice(s![0..pt]);
        let seg_ls = v.slice(s![pt..ls_end]);
        let seg_w = v.slice(s![ls_end..total]);

        // Row-space images of each segment.
        let u_t = self.x_t.dot(&seg_t);
        let u_ls = self.x_ls.dot(&seg_ls);
        let u_b = pieces.b0.dot(&seg_w);
        let u_d = pieces.d0.dot(&seg_w);

        // Row-wise weighting; one residual per output channel.
        let r_t = &pieces.coeff_tt * &u_t
            + &pieces.coeff_tl * &u_ls
            + &pieces.coeff_tw_b * &u_b
            + &pieces.coeff_tw_d * &u_d;
        let r_ls = &pieces.coeff_tl * &u_t
            + &pieces.coeff_ll * &u_ls
            + &pieces.coeff_lw_b * &u_b
            + &pieces.coeff_lw_d * &u_d;
        let r_b = &pieces.coeff_tw_b * &u_t + &pieces.coeff_lw_b * &u_ls + &pieces.coeffww * &u_b;
        let r_d = &pieces.coeff_tw_d * &u_t + &pieces.coeff_lw_d * &u_ls;

        // Pull the row residuals back to coefficient space.
        let out_t = fast_atv(self.x_t.as_ref(), &r_t);
        let out_ls = fast_atv(self.x_ls.as_ref(), &r_ls);
        let out_w = fast_atv(&pieces.b0, &r_b) + &fast_atv(&pieces.d0, &r_d);

        let mut out = Array1::<f64>::zeros(total);
        out.slice_mut(s![0..pt]).assign(&out_t);
        out.slice_mut(s![pt..ls_end]).assign(&out_ls);
        out.slice_mut(s![ls_end..total]).assign(&out_w);
        Ok(Some(out))
    }

    /// Diagonal of the joint Hessian.
    ///
    /// Each entry is `Σ_i c[i] · col[i]²`, with `c` taken from `coeff_tt`,
    /// `coeff_ll` and `coeffww` for the `x_t`, `x_ls` and `b0` columns
    /// respectively.
    fn hessian_diagonal(&self) -> Result<Option<Array1<f64>>, String> {
        let pieces = &self.pieces;
        let pt = self.x_t.ncols();
        let pls = self.x_ls.ncols();
        let pw = pieces.b0.ncols();
        let mut diag = Array1::<f64>::zeros(pt + pls + pw);
        // All three loops run over the same row count.
        let n = pieces.coeff_tt.len();

        for j in 0..pt {
            let col = self.x_t.column(j);
            diag[j] = (0..n).fold(0.0_f64, |acc, i| {
                let x = col[i];
                acc + pieces.coeff_tt[i] * x * x
            });
        }
        for j in 0..pls {
            let col = self.x_ls.column(j);
            diag[pt + j] = (0..n).fold(0.0_f64, |acc, i| {
                let x = col[i];
                acc + pieces.coeff_ll[i] * x * x
            });
        }
        for j in 0..pw {
            let col = pieces.b0.column(j);
            diag[pt + pls + j] = (0..n).fold(0.0_f64, |acc, i| {
                let x = col[i];
                acc + pieces.coeffww[i] * x * x
            });
        }
        Ok(Some(diag))
    }

    /// Dense first directional derivative of the joint Hessian; delegated to
    /// the family.
    fn directional_derivative(
        &self,
        d_beta_flat: &Array1<f64>,
    ) -> Result<Option<Array2<f64>>, String> {
        let family = &self.family;
        family.exact_newton_joint_hessian_directional_derivative(&self.block_states, d_beta_flat)
    }

    /// Operator form of the first directional derivative; delegated to the
    /// family, which receives shared handles to both designs.
    fn directional_derivative_operator(
        &self,
        d_beta_flat: &Array1<f64>,
    ) -> Result<Option<Arc<dyn crate::solver::estimate::reml::unified::HyperOperator>>, String>
    {
        let x_t = Arc::clone(&self.x_t);
        let x_ls = Arc::clone(&self.x_ls);
        self.family
            .bls_wiggle_directional_operator(&self.block_states, x_t, x_ls, d_beta_flat)
    }

    /// Dense second directional derivative of the joint Hessian along the
    /// directions `u` and `v`; delegated to the family.
    fn second_directional_derivative(
        &self,
        d_beta_u_flat: &Array1<f64>,
        d_beta_v_flat: &Array1<f64>,
    ) -> Result<Option<Array2<f64>>, String> {
        let family = &self.family;
        family.exact_newton_joint_hessiansecond_directional_derivative(
            &self.block_states,
            d_beta_u_flat,
            d_beta_v_flat,
        )
    }

    /// Operator form of the second directional derivative; delegated to the
    /// family, which receives shared handles to both designs.
    fn second_directional_derivative_operator(
        &self,
        d_beta_u: &Array1<f64>,
        d_beta_v: &Array1<f64>,
    ) -> Result<Option<Arc<dyn crate::solver::estimate::reml::unified::HyperOperator>>, String>
    {
        let x_t = Arc::clone(&self.x_t);
        let x_ls = Arc::clone(&self.x_ls);
        self.family.bls_wiggle_second_directional_operator(
            &self.block_states,
            x_t,
            x_ls,
            d_beta_u,
            d_beta_v,
        )
    }
}

impl CustomFamilyGenerative for BinomialLocationScaleWiggleFamily {
    /// Build the Bernoulli generative spec for the current block states.
    ///
    /// Row `i` of the mean is the inverse-link `mu` evaluated at
    /// `q0(eta_t[i], sigma(eta_ls[i])) + eta_w[i]`.
    ///
    /// # Errors
    /// Fails when the number of blocks is not 3, when any linear predictor
    /// length differs from `self.y.len()`, or when the inverse-link evaluation
    /// fails for some row.
    fn generativespec(
        &self,
        block_states: &[ParameterBlockState],
    ) -> Result<GenerativeSpec, String> {
        let block_count = block_states.len();
        if block_count != 3 {
            return Err(GamlssError::DimensionMismatch {
                reason: format!(
                    "BinomialLocationScaleWiggleFamily expects 3 blocks, got {}",
                    block_count
                ),
            }
            .into());
        }

        let eta_t = &block_states[Self::BLOCK_T].eta;
        let eta_ls = &block_states[Self::BLOCK_LOG_SIGMA].eta;
        let etaw = &block_states[Self::BLOCK_WIGGLE].eta;

        let n = self.y.len();
        let lengths_agree = [eta_t.len(), eta_ls.len(), etaw.len()]
            .iter()
            .all(|&len| len == n);
        if !lengths_agree {
            return Err(GamlssError::DimensionMismatch {
                reason: "BinomialLocationScaleWiggleFamily generative size mismatch".to_string(),
            }
            .into());
        }

        let mean = gamlss_rowwise_map_result(n, |i| {
            // Base predictor from threshold and scale, then shifted by the wiggle.
            let q0 = binomial_location_scale_q0(eta_t[i], exp_sigma_from_eta_scalar(eta_ls[i]));
            let jet = inverse_link_jet_for_inverse_link(&self.link_kind, q0 + etaw[i])
                .map_err(|e| format!("location-scale inverse-link evaluation failed: {e}"))?;
            Ok(jet.mu)
        })?;

        Ok(GenerativeSpec {
            mean,
            noise: NoiseModel::Bernoulli,
        })
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    // Helpers exercised only by these tests; imported here (not at module scope)
    // so they are not flagged unused in a non-test `--lib` build.
    use super::binomial_q_derivs::{
        binomial_neglog_q_derivatives_cloglog_closed_form,
        binomial_neglog_q_derivatives_logit_closed_form,
        binomial_neglog_q_derivatives_probit_closed_form,
        binomial_neglog_q_fourth_derivative_cloglog_closed_form,
        binomial_neglog_q_fourth_derivative_logit_closed_form,
        binomial_neglog_q_fourth_derivative_probit_closed_form,
    };
    use crate::basis::{
        CenterStrategy, Dense, KnotSource, MaternBasisSpec, MaternIdentifiability, MaternNu,
        create_basis,
    };
    use crate::families::wiggle::{
        initializewiggle_knots_from_seed, monotone_wiggle_internal_degree,
        split_wiggle_penalty_orders,
    };
    use crate::smooth::{ShapeConstraint, SmoothBasisSpec, SmoothTermSpec};
    use crate::test_support::{binomial_location_scale_base_fixture, no_densify_design};
    use ndarray::{Array2, Axis, array};
    use num_dual::{
        DualNum, second_derivative, second_partial_derivative, third_partial_derivative_vec,
    };

    fn intercept_block(n: usize) -> ParameterBlockInput {
        ParameterBlockInput {
            design: DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(Array2::from_elem(
                (n, 1),
                1.0,
            ))),
            offset: Array1::zeros(n),
            penalties: Vec::new(),
            nullspace_dims: vec![],
            initial_log_lambdas: None,
            initial_beta: None,
        }
    }

    /// Assemble a flat log-lambda vector following the `withwiggle` layout.
    ///
    /// The vector starts at zero. A mean or noise hint is copied into its
    /// segment only when it is present and has the length the layout expects;
    /// the wiggle segment is filled from `extra_rho0` whenever it is non-empty.
    fn compose_theta_from_hints_test(
        mean_penalty_count: usize,
        noise_penalty_count: usize,
        mean_log_lambda_hint: &Option<Array1<f64>>,
        noise_log_lambda_hint: &Option<Array1<f64>>,
        extra_rho0: &Array1<f64>,
    ) -> Array1<f64> {
        let wiggle_count = extra_rho0.len();
        let layout =
            GamlssLambdaLayout::withwiggle(mean_penalty_count, noise_penalty_count, wiggle_count);
        let mut theta = Array1::<f64>::zeros(layout.total());

        let usable_mean_hint = mean_log_lambda_hint
            .as_ref()
            .filter(|hint| hint.len() == layout.k_mean);
        if let Some(hint) = usable_mean_hint {
            theta.slice_mut(s![0..layout.mean_end()]).assign(hint);
        }

        let usable_noise_hint = noise_log_lambda_hint
            .as_ref()
            .filter(|hint| hint.len() == layout.k_noise);
        if let Some(hint) = usable_noise_hint {
            let noise_range = layout.noise_start()..layout.noise_end();
            theta.slice_mut(s![noise_range]).assign(hint);
        }

        if layout.kwiggle > 0 {
            let wiggle_range = layout.wiggle_start()..layout.wiggle_end();
            theta.slice_mut(s![wiggle_range]).assign(extra_rho0);
        }
        theta
    }

    /// The validator must accept a non-negative coefficient vector and must
    /// report a negative coefficient as an error.
    #[test]
    fn monotone_wiggle_post_update_validator_rejects_hidden_projection() {
        let context = "monotone wiggle validator test";

        let feasible = array![0.0, 1.0e-13, 2.0];
        validate_monotone_wiggle_beta_nonnegative(&feasible, context)
            .expect("feasible nonnegative wiggle beta should validate");

        let infeasible = array![0.0, -1.0e-3, 2.0];
        let err = validate_monotone_wiggle_beta_nonnegative(&infeasible, context)
            .expect_err("negative wiggle beta must be rejected instead of projected");
        let mentions_constraint = err.contains("monotone wiggle coefficients must be non-negative");
        assert!(mentions_constraint, "unexpected error: {err}");
    }

    /// At a very negative `eta` the complement form `1 - floor/sigma` cancels
    /// to exactly zero, while `logb_dlog_sigma_deta` must still return a
    /// strictly positive value; infinite inputs must map to 1.
    #[test]
    fn logb_dlog_sigma_deta_preserves_negative_tail_precision() {
        let eta = -703.4873664863218;
        let jet = logb_sigma_jet1_scalar(eta);
        let (sigma, d1) = (jet.sigma, jet.d1);

        let complement_form = 1.0 - LOGB_SIGMA_FLOOR / sigma;
        assert_eq!(
            complement_form, 0.0,
            "the algebraically equivalent complement form must cancel at this eta"
        );

        let ratio_form = logb_dlog_sigma_deta(sigma, d1);
        assert!(
            ratio_form > 0.0,
            "d_sigma_deta / sigma must preserve the remaining tail derivative"
        );

        assert_eq!(logb_dlog_sigma_deta(f64::INFINITY, f64::INFINITY), 1.0);
    }

    /// Logistic function `1 / (1 + exp(-x))`, generic over dual-number types.
    fn logistic_numdual<D: DualNum<f64> + Copy>(x: D) -> D {
        let one = D::one();
        let denominator = one + (-x).exp();
        one / denominator
    }

    /// Evaluate `knots.len() - degree - 1` B-spline basis functions at `x` by
    /// the Cox–de Boor recursion, generic over dual-number types.
    ///
    /// The degree-0 level is an indicator of the knot span containing the real
    /// part of `x`; when that real part equals the last knot, the final basis
    /// function is the active one. Each higher level blends neighbouring
    /// functions, skipping terms whose knot span has zero width.
    fn bspline_basis_scalar_numdual<D: DualNum<f64> + Copy>(
        x: D,
        knots: &Array1<f64>,
        degree: usize,
    ) -> Vec<D> {
        let n_basis = knots.len() - degree - 1;
        let x_real = x.re();
        let last_knot = knots[knots.len() - 1];
        let at_right_end = x_real == last_knot;

        // Degree 0: indicator of the active span.
        let mut basis: Vec<D> = (0..n_basis)
            .map(|j| {
                let in_span = if at_right_end {
                    j + 1 == n_basis
                } else {
                    knots[j] <= x_real && x_real < knots[j + 1]
                };
                if in_span { D::one() } else { D::zero() }
            })
            .collect();

        // Raise the degree one level at a time.
        for k in 1..=degree {
            let raised: Vec<D> = (0..n_basis)
                .map(|j| {
                    let mut acc = D::zero();
                    let left_span = knots[j + k] - knots[j];
                    if left_span > 0.0 {
                        acc += ((x - D::from(knots[j])) / D::from(left_span)) * basis[j];
                    }
                    // The right-hand neighbour only exists below the last index.
                    if j + 1 < n_basis {
                        let right_span = knots[j + k + 1] - knots[j + 1];
                        if right_span > 0.0 {
                            acc += ((D::from(knots[j + k + 1]) - x) / D::from(right_span))
                                * basis[j + 1];
                        }
                    }
                    acc
                })
                .collect();
            basis = raised;
        }
        basis
    }

    /// Monotone wiggle basis at `x`, generic over dual-number types.
    ///
    /// Uses B-splines of degree `monotone_wiggle_internal_degree(degree) + 1`.
    /// Output entry `j - 1` is the tail sum of basis functions `j..` at `x`
    /// minus the same tail sum at the knot `knots[bs_degree]`; the first basis
    /// function never enters the output.
    fn monotone_wiggle_basis_scalar_numdual<D: DualNum<f64> + Copy>(
        x: D,
        knots: &Array1<f64>,
        degree: usize,
    ) -> Array1<D> {
        let bs_degree =
            monotone_wiggle_internal_degree(degree).expect("monotone wiggle degree") + 1;
        let left = knots[bs_degree];
        let full = bspline_basis_scalar_numdual(x, knots, bs_degree);
        let left_full = bspline_basis_scalar_numdual(D::from(left), knots, bs_degree);

        let mut out = Array1::<D>::from_elem(full.len().saturating_sub(1), D::zero());
        let mut tail_sum = D::zero();
        let mut tail_sum_at_left = D::zero();
        // Walk from the last basis function down to index 1, accumulating the
        // two tail sums as we go.
        let paired = full.iter().zip(left_full.iter()).enumerate().skip(1).rev();
        for (j, (&value, &anchor)) in paired {
            tail_sum += value;
            tail_sum_at_left += anchor;
            out[j - 1] = tail_sum - tail_sum_at_left;
        }
        out
    }

    /// Weighted Bernoulli negative log-likelihood as a function of a scalar
    /// threshold coefficient `beta_t`, generic over dual-number types.
    ///
    /// One shared predictor is used for all observations:
    /// `q = q0 + Σ_j basis_j(q0) · betaw[j]` with `q0 = -beta_t / exp(beta_ls)`,
    /// mapped through the logistic function.
    fn wiggle_negloglik_threshold_numdual<D: DualNum<f64> + Copy>(
        beta_t: D,
        beta_ls: f64,
        betaw: &Array1<f64>,
        y: &Array1<f64>,
        weights: &Array1<f64>,
        knots: &Array1<f64>,
        degree: usize,
    ) -> D {
        let sigma = D::from(beta_ls).exp();
        let q0 = -beta_t / sigma;
        let basis = monotone_wiggle_basis_scalar_numdual(q0, knots, degree);

        let mut etaw = D::zero();
        for (j, &coef) in betaw.iter().enumerate() {
            etaw += basis[j] * D::from(coef);
        }

        let mu = logistic_numdual(q0 + etaw);
        // `mu` is the same for every observation, so both log terms are too.
        let log_mu = mu.ln();
        let log_one_minus_mu = (D::one() - mu).ln();

        let mut out = D::zero();
        for (i, &yi) in y.iter().enumerate() {
            out -= D::from(weights[i])
                * (D::from(yi) * log_mu + D::from(1.0 - yi) * log_one_minus_mu);
        }
        out
    }

    // Source-of-truth Gaussian logb negloglik. Analytic helpers MUST autodiff-match this.
    //
    // Per row: eta_mu = x_mu0 * beta_mu,
    //          eta_ls = (x_ls0 + psi * x_ls_psi + 0.5 * psi^2 * x_ls_psi_psi) * beta_ls,
    //          sigma  = LOGB_SIGMA_FLOOR + exp(eta_ls),
    // and the row contributes weight * (0.5 * ((y - eta_mu) / sigma)^2 + ln sigma).
    fn gaussian_negloglik_log_sigma_psi_numdual<D: DualNum<f64> + Copy>(
        beta_mu: D,
        beta_ls: D,
        psi: D,
        y: &Array1<f64>,
        weights: &Array1<f64>,
        x_mu0: &Array1<f64>,
        x_ls0: &Array1<f64>,
        x_ls_psi: &Array1<f64>,
        x_ls_psi_psi: &Array1<f64>,
    ) -> D {
        let half = D::from(0.5);
        let mut total = D::zero();
        for (i, &yi) in y.iter().enumerate() {
            let eta_mu = D::from(x_mu0[i]) * beta_mu;

            // Second-order expansion of the scale design entry in psi.
            let linear_term = psi * D::from(x_ls_psi[i]);
            let quadratic_term = half * psi * psi * D::from(x_ls_psi_psi[i]);
            let x_ls = D::from(x_ls0[i]) + linear_term + quadratic_term;
            let eta_ls = x_ls * beta_ls;

            // Mirror the production logb noise link σ = LOGB_SIGMA_FLOOR + exp(η_ls)
            // (see `GaussianLocationScaleFamily::loglik`); a bare-exp link would give
            // a different σ at the same η and break the psi-derivative identities
            // that this reference negloglik certifies.
            let sigma = D::from(LOGB_SIGMA_FLOOR) + eta_ls.exp();
            let resid = D::from(yi) - eta_mu;
            let row_nll = half * (resid / sigma).powi(2) + sigma.ln();
            total += D::from(weights[i]) * row_nll;
        }
        total
    }

    /// Reference negloglik viewed as a function of `psi` alone: `beta_mu` and
    /// `beta_ls` are lifted to constants of the dual type.
    fn gaussian_negloglik_log_sigma_psi_only_numdual<D: DualNum<f64> + Copy>(
        psi: D,
        beta_mu: f64,
        beta_ls: f64,
        y: &Array1<f64>,
        weights: &Array1<f64>,
        x_mu0: &Array1<f64>,
        x_ls0: &Array1<f64>,
        x_ls_psi: &Array1<f64>,
        x_ls_psi_psi: &Array1<f64>,
    ) -> D {
        let beta_mu = D::from(beta_mu);
        let beta_ls = D::from(beta_ls);
        gaussian_negloglik_log_sigma_psi_numdual(
            beta_mu,
            beta_ls,
            psi,
            y,
            weights,
            x_mu0,
            x_ls0,
            x_ls_psi,
            x_ls_psi_psi,
        )
    }

    /// Reference negloglik viewed as a function of `(beta_mu, psi)`: `beta_ls`
    /// is lifted to a constant of the dual type.
    fn gaussian_negloglik_log_sigma_mu_psi_numdual<D: DualNum<f64> + Copy>(
        beta_mu: D,
        psi: D,
        beta_ls: f64,
        y: &Array1<f64>,
        weights: &Array1<f64>,
        x_mu0: &Array1<f64>,
        x_ls0: &Array1<f64>,
        x_ls_psi: &Array1<f64>,
        x_ls_psi_psi: &Array1<f64>,
    ) -> D {
        let beta_ls = D::from(beta_ls);
        gaussian_negloglik_log_sigma_psi_numdual(
            beta_mu,
            beta_ls,
            psi,
            y,
            weights,
            x_mu0,
            x_ls0,
            x_ls_psi,
            x_ls_psi_psi,
        )
    }

    /// Reference negloglik viewed as a function of `(beta_ls, psi)`: `beta_mu`
    /// is lifted to a constant of the dual type.
    fn gaussian_negloglik_log_sigma_ls_psi_numdual<D: DualNum<f64> + Copy>(
        beta_ls: D,
        psi: D,
        beta_mu: f64,
        y: &Array1<f64>,
        weights: &Array1<f64>,
        x_mu0: &Array1<f64>,
        x_ls0: &Array1<f64>,
        x_ls_psi: &Array1<f64>,
        x_ls_psi_psi: &Array1<f64>,
    ) -> D {
        let beta_mu = D::from(beta_mu);
        gaussian_negloglik_log_sigma_psi_numdual(
            beta_mu,
            beta_ls,
            psi,
            y,
            weights,
            x_mu0,
            x_ls0,
            x_ls_psi,
            x_ls_psi_psi,
        )
    }

    /// Reference negloglik with its three scalar arguments packed in a slice:
    /// `v = [beta_mu, beta_ls, psi]`. Panics when `v` has fewer than three
    /// entries.
    fn gaussian_negloglik_log_sigma_beta_vec_numdual<D: DualNum<f64> + Copy>(
        v: &[D],
        y: &Array1<f64>,
        weights: &Array1<f64>,
        x_mu0: &Array1<f64>,
        x_ls0: &Array1<f64>,
        x_ls_psi: &Array1<f64>,
        x_ls_psi_psi: &Array1<f64>,
    ) -> D {
        let (beta_mu, beta_ls, psi) = (v[0], v[1], v[2]);
        gaussian_negloglik_log_sigma_psi_numdual(
            beta_mu,
            beta_ls,
            psi,
            y,
            weights,
            x_mu0,
            x_ls0,
            x_ls_psi,
            x_ls_psi_psi,
        )
    }

    /// Wraps a dense design in a `ParameterBlockSpec` for tests.
    ///
    /// The resulting spec has a zero offset with one entry per design row, no
    /// penalties, no nullspace dimensions, an empty `initial_log_lambdas`, no
    /// initial beta, `gauge_priority` 100, and no Jacobian callback or stacked
    /// design/offset.
    fn gaussian_psi_test_spec(name: &str, design: Array2<f64>) -> ParameterBlockSpec {
        // Read the row count before `design` is moved into the design matrix.
        let row_count = design.nrows();
        let dense_design = DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(design));
        ParameterBlockSpec {
            name: name.to_owned(),
            design: dense_design,
            offset: Array1::zeros(row_count),
            penalties: Vec::new(),
            nullspace_dims: Vec::new(),
            initial_log_lambdas: Array1::zeros(0),
            initial_beta: None,
            gauge_priority: 100,
            jacobian_callback: None,
            stacked_design: None,
            stacked_offset: None,
        }
    }

    /// Single-row check that the first-order psi weights carry the factor
    /// κ = 1 - LOGB_SIGMA_FLOOR / σ on the log-sigma score and on the psi
    /// objective row.
    #[test]
    fn gaussian_joint_psi_firstweights_score_ls_carries_logb_chain_rule_factor() {
        // One observation is enough to pin the per-row scalar.
        let y = array![1.1];
        let etamu = array![0.3];
        let eta_ls = array![-0.2];
        let weights = array![2.5];
        let rows =
            gaussian_jointrow_scalars(&y, &etamu, &eta_ls, &weights).expect("gaussian row scalars");

        // Per the assertion message below: mu_psi = 0 and eta_psi = 1.
        let mu_psi = array![0.0];
        let eta_psi = array![1.0];
        let first_order = gaussian_joint_psi_firstweights(&rows, &mu_psi, &eta_psi);

        // κ is rebuilt here from the sigma link, independently of the helper.
        let sigma = crate::families::sigma_link::logb_sigma_from_eta_scalar(eta_ls[0]);
        let kappa = 1.0 - crate::families::sigma_link::LOGB_SIGMA_FLOOR / sigma;
        let expected = kappa * (weights[0] - rows.n[0]);

        let score_gap = (first_order.score_ls[0] - expected).abs();
        assert!(
            score_gap <= 1e-12,
            "Under the logb link σ = b + exp(η_ls), d/dη_ls of weight*(ln σ + 0.5(y-μ)^2/σ^2) carries the chain-rule factor κ = 1 - b/σ, so the row score must equal κ*(weight - n_i). The helper coded {} but the κ-corrected expectation is {}.",
            first_order.score_ls[0],
            expected
        );

        let objective_gap = (first_order.objective_psirow[0] - expected).abs();
        assert!(
            objective_gap <= 1e-12,
            "With mu_psi=0 and eta_psi=1, the exact psi objective derivative must equal κ*(weight - n_i) (κ = 1 - b/σ from the logb chain rule). The helper coded {} but the κ-corrected expectation is {}.",
            first_order.objective_psirow[0],
            expected
        );
    }

    /// With the first two arguments set to 1.0 (y and weight, going by the
    /// sibling fractional test's call), the cloglog closed-form derivatives
    /// must be exactly zero far in the right tail (q = 1000 for orders 1-3,
    /// q = 300 for order 4).
    #[test]
    fn cloglog_binomial_right_tail_derivatives_stay_finite() {
        let (first, second, third) =
            binomial_neglog_q_derivatives_cloglog_closed_form(1.0, 1.0, 1000.0);
        let fourth = binomial_neglog_q_fourth_derivative_cloglog_closed_form(1.0, 1.0, 300.0);

        // Exact equality on purpose: a NaN or inf from overflow would fail.
        for derivative in [first, second, third, fourth] {
            assert_eq!(derivative, 0.0);
        }
    }

    /// For a fractional response (y = 0.25, weight = 2) at q = 300, every
    /// cloglog closed-form derivative (orders 1-4) must be finite and equal
    /// `weight * (1 - y) * exp(q)` exactly.
    #[test]
    fn cloglog_binomial_fractional_right_tail_keeps_y0_branch() {
        let y = 0.25;
        let weight = 2.0;
        // The `_f64` suffix is required: `q.exp()` is called before anything
        // else fixes the type of `q`, and rustc rejects method calls on an
        // unconstrained `{float}` binding (E0689).
        let q = 300.0_f64;
        let expected = weight * (1.0 - y) * q.exp();
        let (m1, m2, m3) = binomial_neglog_q_derivatives_cloglog_closed_form(y, weight, q);
        let m4 = binomial_neglog_q_fourth_derivative_cloglog_closed_form(y, weight, q);

        assert!(m1.is_finite());
        assert!(m2.is_finite());
        assert!(m3.is_finite());
        assert!(m4.is_finite());
        assert_eq!(m1, expected);
        assert_eq!(m2, expected);
        assert_eq!(m3, expected);
        assert_eq!(m4, expected);
    }

    /// Regression for issue #948 (2b): in the saturated tail the logit
    /// curvature and fourth derivative must be built from the EXACT Bernoulli
    /// variance s = p(1-p), never from a value floored near
    /// MIN_PROB·(1−MIN_PROB) ≈ 1e-10.
    #[test]
    fn logit_binomial_tail_derivatives_are_exact_not_clipped() {
        // At q = 50 the true variance is s = e^{-50}/(1+e^{-50})² ≈ e^{-50}
        // ≈ 1.93e-22.
        //
        // The `_f64` suffix is required: `(-q).exp()` is called before
        // anything else fixes the type of `q`, and rustc rejects method calls
        // on an unconstrained `{float}` value (E0689).
        let q = 50.0_f64;
        let t = (-q).exp();
        let denom = 1.0 + t;
        let s_exact = t / (denom * denom);

        let (m1, m2, m3) = binomial_neglog_q_derivatives_logit_closed_form(1.0, 1.0, q);
        let m4 = binomial_neglog_q_fourth_derivative_logit_closed_form(1.0, 1.0, q);

        // The clipped surrogate would have reported ~1e-10; the exact value is
        // ~1.9e-22, twelve orders of magnitude smaller.
        assert!(
            s_exact < 1e-21,
            "sanity: exact tail variance should be ~1e-22, got {s_exact}"
        );
        // m1 = w(p - y); at q=50, p rounds to 1.0 exactly, so m1 = 0.
        assert!(m1.abs() <= 1e-15, "m1 should be ~0 at p≈1, got {m1}");
        assert!(
            (m2 - s_exact).abs() <= 1e-30,
            "logit curvature must equal exact s=p(1-p) in the tail, got {m2}, want {s_exact}"
        );
        // A floored curvature would sit near 1e-10 (see the message below),
        // far above this 1e-15 bound: assert we are nowhere near it.
        assert!(
            m2 < 1e-15,
            "logit curvature must NOT be floored at MIN_PROB·(1−MIN_PROB)≈1e-10, got {m2}"
        );
        assert!(m3.is_finite());
        assert!(
            (m4 - s_exact * (1.0 - 6.0 * s_exact)).abs() <= 1e-30,
            "logit fourth derivative must equal exact ws(1-6s) in the tail, got {m4}"
        );
    }

    /// Probit closed-form derivatives evaluated with arguments (0.0, 1.0, 40.0):
    /// the score must lie strictly between 39 and 41, the curvature strictly
    /// between 0.9 and 1.1, and the third and fourth derivatives must be
    /// finite.
    #[test]
    fn probit_binomial_incompatible_tail_keeps_mills_score() {
        let q = 40.0;
        let (m1, m2, m3) = binomial_neglog_q_derivatives_probit_closed_form(0.0, 1.0, q);
        let m4 = binomial_neglog_q_fourth_derivative_probit_closed_form(0.0, 1.0, q);

        // Strict two-sided bounds; written as direct comparisons so that a NaN
        // fails the check.
        let score_in_band = 39.0 < m1 && m1 < 41.0;
        assert!(
            score_in_band,
            "right-tail probit score should be Mills-ratio sized, got {m1}"
        );

        let curvature_in_band = 0.9 < m2 && m2 < 1.1;
        assert!(
            curvature_in_band,
            "right-tail probit curvature should stay near one, got {m2}"
        );

        assert!(
            m3.is_finite(),
            "third derivative must stay finite, got {m3}"
        );
        assert!(
            m4.is_finite(),
            "fourth derivative must stay finite, got {m4}"
        );
    }

    /// Evaluates `log_likelihood_only` for two-row binomial location-scale
    /// families (logit, then cloglog) at extreme threshold predictors with the
    /// log-sigma predictor fixed at zero, and compares against closed-form
    /// tail values.
    #[test]
    fn binomial_location_scale_loglik_uses_tail_stable_standard_links() {
        use crate::families::custom_family::{CustomFamily, ParameterBlockState};

        let n = 2usize;
        // Single all-ones column, reused for every design slot below.
        let ones_design = DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(
            Array2::from_elem((n, 1), 1.0),
        ));
        // Log-sigma block with eta = 0 on both rows.
        let zero_log_sigma = ParameterBlockState {
            beta: array![0.0],
            eta: array![0.0, 0.0],
        };

        // --- logit ---------------------------------------------------------
        let logit_family = BinomialLocationScaleFamily {
            y: array![0.0, 1.0],
            weights: Array1::ones(n),
            link_kind: InverseLink::Standard(StandardLink::Logit),
            threshold_design: Some(ones_design.clone()),
            log_sigma_design: Some(ones_design.clone()),
            policy: crate::resource::ResourcePolicy::default_library(),
        };
        let logit_threshold = ParameterBlockState {
            beta: array![0.0],
            eta: array![-1000.0, 1000.0],
        };
        let logit_states = vec![logit_threshold, zero_log_sigma.clone()];
        let logit_ll = logit_family
            .log_likelihood_only(&logit_states)
            .expect("logit tail likelihood");
        let logit_gap = (logit_ll + 2000.0).abs();
        assert!(
            logit_gap <= 1e-10,
            "logit tail likelihood must use softplus natural-parameter algebra, got {logit_ll}"
        );

        // --- cloglog -------------------------------------------------------
        let cloglog_family = BinomialLocationScaleFamily {
            y: array![0.0, 1.0],
            weights: Array1::ones(n),
            link_kind: InverseLink::Standard(StandardLink::CLogLog),
            threshold_design: Some(ones_design.clone()),
            log_sigma_design: Some(ones_design),
            policy: crate::resource::ResourcePolicy::default_library(),
        };
        let cloglog_threshold = ParameterBlockState {
            beta: array![0.0],
            eta: array![-20.0, 1000.0],
        };
        let cloglog_states = vec![cloglog_threshold, zero_log_sigma];
        let cloglog_ll = cloglog_family
            .log_likelihood_only(&cloglog_states)
            .expect("cloglog tail likelihood");
        // Method calls bind tighter than unary minus, so this is
        // -(e^20) - 1000; the parentheses only make that explicit.
        let expected = -(20.0_f64.exp()) - 1000.0;
        let rel = (cloglog_ll - expected).abs() / expected.abs();
        assert!(
            rel <= 1e-14,
            "cloglog tail likelihood must use exp(q) survival algebra, got {cloglog_ll}, expected {expected}"
        );
    }

    /// Single-row check that the second-order psi weights, called with five
    /// zero direction arrays followed by one unit array, give an objective row
    /// equal to κ·(weight − n_i) with κ = 1 - LOGB_SIGMA_FLOOR / σ.
    #[test]
    fn gaussian_joint_psisecondweights_eta_ab_term_carries_logb_chain_rule_factor() {
        let y = array![1.1];
        let etamu = array![0.3];
        let eta_ls = array![-0.2];
        let weights = array![2.5];
        let rows =
            gaussian_jointrow_scalars(&y, &etamu, &eta_ls, &weights).expect("gaussian row scalars");

        // Only the last direction is active; the assertion message below
        // identifies it as eta_psi_psi.
        let inactive = array![0.0];
        let active = array![1.0];
        let second_order = gaussian_joint_psisecondweights(
            &rows, &inactive, &inactive, &inactive, &inactive, &inactive, &active,
        );

        // κ is rebuilt here from the sigma link, independently of the helper.
        let sigma = crate::families::sigma_link::logb_sigma_from_eta_scalar(eta_ls[0]);
        let kappa = 1.0 - crate::families::sigma_link::LOGB_SIGMA_FLOOR / sigma;
        let expected = kappa * (weights[0] - rows.n[0]);

        let gap = (second_order.objective_psi_psirow[0] - expected).abs();
        assert!(
            gap <= 1e-12,
            "With only eta_psi_psi=1 active, the Gaussian second psi objective contribution from the linear η_ls term carries the logb chain-rule factor κ = 1 - b/σ, so it must equal κ*(weight - n_i). The helper coded {} but the κ-corrected expectation is {}.",
            second_order.objective_psi_psirow[0],
            expected
        );
    }

    /// Pins `GaussianLocationScaleFamily::coefficient_hessian_cost` to the
    /// shared joint-coupled helper: the result must equal
    /// `joint_coupled_coefficient_hessian_cost(n, specs)`, which in turn must
    /// equal n · (Σ p_b)² and exceed the block-diagonal default.
    ///
    /// The original author's note says the other GAMLSS families use the same
    /// formula with n taken from `self.y.len()`; that is not checked here.
    #[test]
    fn gaussian_location_scale_coefficient_cost_delegates_to_joint_coupled_helper() {
        let n = 100usize;
        let p_mu = 7usize;
        let p_log_sigma = 4usize;
        let family = GaussianLocationScaleFamily {
            y: Array1::zeros(n),
            weights: Array1::from_elem(n, 1.0),
            mu_design: None,
            log_sigma_design: None,
            policy: crate::resource::ResourcePolicy::default_library(),
            cached_row_scalars: std::sync::RwLock::new(None),
        };

        // Both blocks are unpenalized all-zero dense designs that differ only
        // in name and column count.
        let zero_block = |name: &str, width: usize| ParameterBlockSpec {
            name: name.to_string(),
            design: DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(Array2::zeros(
                (n, width),
            ))),
            offset: Array1::zeros(n),
            penalties: Vec::new(),
            nullspace_dims: Vec::new(),
            initial_log_lambdas: Array1::zeros(0),
            initial_beta: None,
            gauge_priority: 100,
            jacobian_callback: None,
            stacked_design: None,
            stacked_offset: None,
        };
        let specs = vec![
            zero_block("mu", p_mu),
            zero_block("log_sigma", p_log_sigma),
        ];

        let rows = n as u64;
        let p_total = (p_mu + p_log_sigma) as u64;
        let expected = crate::custom_family::joint_coupled_coefficient_hessian_cost(rows, &specs);
        assert_eq!(family.coefficient_hessian_cost(&specs), expected);
        assert_eq!(expected, rows * p_total * p_total);

        let block_diagonal = crate::custom_family::default_coefficient_hessian_cost(&specs);
        assert!(
            expected > block_diagonal,
            "joint-coupled cost must exceed block-diagonal default by the cross-block fill"
        );
    }

    /// For n = 50_001 rows and two 20-column blocks, checks that:
    /// 1. `custom_family_outer_derivatives` reports an analytic gradient and
    ///    `DeclaredHessianForm::Either`;
    /// 2. `prefer_outer_hessian_operator(n, 40, 2)` returns true;
    /// 3. the outer-strategy planner picks `Solver::Arc` with an analytic
    ///    Hessian source for that capability.
    #[test]
    fn large_n_gaussian_location_scale_keeps_exact_outer_hessian_plan() {
        let n = 50_001usize;
        let p_mu = 20usize;
        let p_log_sigma = 20usize;
        let family = GaussianLocationScaleFamily {
            y: Array1::zeros(n),
            weights: Array1::from_elem(n, 1.0),
            mu_design: None,
            log_sigma_design: None,
            policy: crate::resource::ResourcePolicy::default_library(),
            cached_row_scalars: std::sync::RwLock::new(None),
        };

        // Both blocks are unpenalized all-zero dense designs that differ only
        // in name and column count.
        let zero_block = |name: &str, width: usize| ParameterBlockSpec {
            name: name.to_string(),
            design: DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(Array2::zeros(
                (n, width),
            ))),
            offset: Array1::zeros(n),
            penalties: Vec::new(),
            nullspace_dims: Vec::new(),
            initial_log_lambdas: Array1::zeros(0),
            initial_beta: None,
            gauge_priority: 100,
            jacobian_callback: None,
            stacked_design: None,
            stacked_offset: None,
        };
        let specs = vec![
            zero_block("mu", p_mu),
            zero_block("log_sigma", p_log_sigma),
        ];

        // (1) Declared derivative capabilities.
        let options = BlockwiseFitOptions::default();
        let (gradient, hessian) =
            crate::custom_family::custom_family_outer_derivatives(&family, &specs, &options);
        assert_eq!(
            gradient,
            crate::solver::outer_strategy::Derivative::Analytic
        );
        assert_eq!(
            hessian,
            crate::solver::outer_strategy::DeclaredHessianForm::Either,
            "large-n GAMLSS location-scale fits must advertise exact second-order curvature instead of triggering the historical BFGS downgrade"
        );

        // (2) Representation choice for the outer Hessian.
        let p_total = p_mu + p_log_sigma;
        let prefers_operator =
            crate::solver::estimate::reml::unified::prefer_outer_hessian_operator(n, p_total, 2);
        assert!(
            prefers_operator,
            "the large-n work model should select the scalable explicit Hessian-operator representation"
        );

        // (3) Planner outcome for the declared capability.
        let capability = crate::solver::outer_strategy::OuterCapability {
            gradient,
            hessian,
            n_params: 2,
            psi_dim: 0,
            fixed_point_available: false,
            barrier_config: None,
            prefer_gradient_only: false,
            disable_fixed_point: true,
        };
        let plan = crate::solver::outer_strategy::plan(&capability);
        assert_eq!(plan.solver, crate::solver::outer_strategy::Solver::Arc);
        assert_eq!(
            plan.hessian_source,
            crate::solver::outer_strategy::HessianSource::Analytic
        );
    }

    /// Helper: build a small Gaussian location-scale family + state + specs
    /// for matrix-free joint-Hessian validation.
    ///
    /// The fixture has n = 7 rows, a 3-column mean design (sine pattern) and a
    /// 2-column log-sigma design (cosine pattern), unit weights, and fixed
    /// coefficient vectors. Returns `(family, states, specs)` where
    /// `states[0]`/`specs[0]` are the mean block and `states[1]`/`specs[1]` the
    /// log-sigma block; each state's `eta` is the design times its `beta`.
    fn gls_workspace_fixture() -> (
        GaussianLocationScaleFamily,
        Vec<ParameterBlockState>,
        Vec<ParameterBlockSpec>,
    ) {
        let n = 7usize;
        let p_mu = 3usize;
        let p_ls = 2usize;
        let xmu = Array2::from_shape_fn((n, p_mu), |(i, j)| {
            ((i as f64) * 0.13 + (j as f64) * 0.31).sin()
        });
        let xls = Array2::from_shape_fn((n, p_ls), |(i, j)| {
            ((i as f64) * 0.21 + (j as f64) * 0.47).cos()
        });
        let beta_mu = array![0.10, -0.20, 0.30];
        let beta_ls = array![0.40, -0.10];
        // Linear predictors are formed first; the raw designs are moved below.
        let eta_mu = xmu.dot(&beta_mu);
        let eta_ls = xls.dot(&beta_ls);
        let y = Array1::from_shape_fn(n, |i| 0.5 + 0.1 * (i as f64).cos());
        let weights = Array1::from_elem(n, 1.0);
        // `xmu` / `xls` are not used again, so they are moved into the design
        // matrices instead of being cloned.
        let mu_design = DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(xmu));
        let log_sigma_design = DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(xls));
        // The family and the specs each need their own design handle, so one
        // clone per design is still required here.
        let family = GaussianLocationScaleFamily {
            y,
            weights,
            mu_design: Some(mu_design.clone()),
            log_sigma_design: Some(log_sigma_design.clone()),
            policy: crate::resource::ResourcePolicy::default_library(),
            cached_row_scalars: std::sync::RwLock::new(None),
        };
        let states = vec![
            ParameterBlockState {
                beta: beta_mu,
                eta: eta_mu,
            },
            ParameterBlockState {
                beta: beta_ls,
                eta: eta_ls,
            },
        ];
        let specs = vec![
            ParameterBlockSpec {
                name: "mu".to_string(),
                design: mu_design,
                offset: Array1::zeros(n),
                penalties: Vec::new(),
                nullspace_dims: Vec::new(),
                initial_log_lambdas: Array1::zeros(0),
                initial_beta: None,
                gauge_priority: 100,
                jacobian_callback: None,
                stacked_design: None,
                stacked_offset: None,
            },
            ParameterBlockSpec {
                name: "log_sigma".to_string(),
                design: log_sigma_design,
                offset: Array1::zeros(n),
                penalties: Vec::new(),
                nullspace_dims: Vec::new(),
                initial_log_lambdas: Array1::zeros(0),
                initial_beta: None,
                gauge_priority: 100,
                jacobian_callback: None,
                stacked_design: None,
                stacked_offset: None,
            },
        ];
        (family, states, specs)
    }

    /// Helper: build a small Binomial location-scale family + state + specs
    /// for matrix-free joint-Hessian validation. The family uses the probit
    /// link (described by the original author as the production link).
    ///
    /// The fixture has n = 8 rows, a 3-column threshold design (sine pattern),
    /// a 2-column log-sigma design (half-amplitude cosine pattern), unit
    /// weights, and a response alternating 1, 0, 1, 0, …. Returns
    /// `(family, states, specs)` where index 0 is the threshold block and
    /// index 1 the log-sigma block; each state's `eta` is the design times its
    /// `beta`.
    fn bls_workspace_fixture() -> (
        BinomialLocationScaleFamily,
        Vec<ParameterBlockState>,
        Vec<ParameterBlockSpec>,
    ) {
        let n = 8usize;
        let pt = 3usize;
        let pls = 2usize;
        let xt = Array2::from_shape_fn((n, pt), |(i, j)| {
            ((i as f64) * 0.17 + (j as f64) * 0.29).sin()
        });
        let xls = Array2::from_shape_fn((n, pls), |(i, j)| {
            ((i as f64) * 0.23 + (j as f64) * 0.41).cos() * 0.5
        });
        let beta_t = array![0.20, -0.10, 0.05];
        let beta_ls = array![0.30, -0.15];
        // Linear predictors are formed first; the raw designs are moved below.
        let eta_t = xt.dot(&beta_t);
        let eta_ls = xls.dot(&beta_ls);
        let y = Array1::from_iter((0..n).map(|i| if i % 2 == 0 { 1.0 } else { 0.0 }));
        let weights = Array1::from_elem(n, 1.0);
        // `xt` / `xls` are not used again, so they are moved into the design
        // matrices instead of being cloned.
        let threshold_design = DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(xt));
        let log_sigma_design = DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(xls));
        // The family and the specs each need their own design handle, so one
        // clone per design is still required here.
        let family = BinomialLocationScaleFamily {
            y,
            weights,
            link_kind: InverseLink::Standard(StandardLink::Probit),
            threshold_design: Some(threshold_design.clone()),
            log_sigma_design: Some(log_sigma_design.clone()),
            policy: crate::resource::ResourcePolicy::default_library(),
        };
        let states = vec![
            ParameterBlockState {
                beta: beta_t,
                eta: eta_t,
            },
            ParameterBlockState {
                beta: beta_ls,
                eta: eta_ls,
            },
        ];
        let specs = vec![
            ParameterBlockSpec {
                name: "threshold".to_string(),
                design: threshold_design,
                offset: Array1::zeros(n),
                penalties: Vec::new(),
                nullspace_dims: Vec::new(),
                initial_log_lambdas: Array1::zeros(0),
                initial_beta: None,
                gauge_priority: 100,
                jacobian_callback: None,
                stacked_design: None,
                stacked_offset: None,
            },
            ParameterBlockSpec {
                name: "log_sigma".to_string(),
                design: log_sigma_design,
                offset: Array1::zeros(n),
                penalties: Vec::new(),
                nullspace_dims: Vec::new(),
                initial_log_lambdas: Array1::zeros(0),
                initial_beta: None,
                gauge_priority: 100,
                jacobian_callback: None,
                stacked_design: None,
                stacked_offset: None,
            },
        ];
        (family, states, specs)
    }

    /// The Gaussian location-scale workspace's `hessian_diagonal` and
    /// `hessian_matvec` must reproduce the dense joint Hessian from
    /// `exact_newton_joint_hessian`: entry-wise on the diagonal, and
    /// component-wise on `H·v` for two basis vectors and two mixed
    /// directions.
    #[test]
    fn gaussian_location_scale_workspace_matvec_matches_dense() {
        // Original author's note: this mirrors the CTN matrix-free reference
        // test and guards the cross-block coefficient (`coeff_ml` in
        // GaussianLocationScaleHessianWorkspace) against regressions.
        let (family, states, specs) = gls_workspace_fixture();
        let p = states[0].beta.len() + states[1].beta.len();

        let dense = family
            .exact_newton_joint_hessian(&states)
            .expect("dense joint Hessian build")
            .expect("dense joint Hessian present");
        assert_eq!(dense.nrows(), p);
        assert_eq!(dense.ncols(), p);

        let workspace = family
            .exact_newton_joint_hessian_workspace(&states, &specs)
            .expect("workspace build")
            .expect("workspace present");

        // --- diagonal ------------------------------------------------------
        let diag_op = workspace
            .hessian_diagonal()
            .expect("diagonal call")
            .expect("diagonal present");
        assert_eq!(diag_op.len(), p);
        for i in 0..p {
            let want = dense[[i, i]];
            let got = diag_op[i];
            // Mixed relative/absolute tolerance.
            let tol = 1e-10 * want.abs().max(1.0) + 1e-10;
            assert!(
                (want - got).abs() <= tol,
                "GLS diagonal mismatch at {i}: dense={want:.6e}, workspace={got:.6e}"
            );
        }

        // --- matrix-vector products ------------------------------------------
        let directions = [
            array![1.0, 0.0, 0.0, 0.0, 0.0],
            array![0.0, 0.0, 0.0, 1.0, 0.0],
            array![0.30, -0.70, 0.50, -0.20, 0.15],
            array![-0.42, 0.11, 0.93, 0.05, -0.31],
        ];
        for (k, v) in directions.iter().enumerate() {
            assert_eq!(v.len(), p);
            let dense_hv = dense.dot(v);
            let workspace_hv = workspace
                .hessian_matvec(v)
                .expect("matvec call")
                .expect("matvec present");
            assert_eq!(workspace_hv.len(), p);
            for i in 0..p {
                let tol = 1e-10 * dense_hv[i].abs().max(1.0) + 1e-10;
                let gap = (dense_hv[i] - workspace_hv[i]).abs();
                assert!(
                    gap <= tol,
                    "GLS matvec[{k}, {i}] mismatch: dense={:.6e}, workspace={:.6e}",
                    dense_hv[i],
                    workspace_hv[i]
                );
            }
        }
    }

    /// Shared assertion for the four "hessian_dense matches canonical-basis
    /// HVP path" tests across the LocationScale {Gaussian, Binomial} × {non-
    /// wiggle, wiggle} grid. Each test only needs to build the workspace and
    /// pass it here with the expected total coefficient dim and a short
    /// family label (used in the diff message).
    ///
    /// The check rebuilds the Hessian one column at a time by applying
    /// `hessian_matvec` to each canonical basis vector, symmetrizes the
    /// result, and requires the largest entry-wise relative difference
    /// against `hessian_dense` (denominator floored at 1.0) to stay below
    /// 1e-12.
    ///
    /// # Panics
    ///
    /// Panics if either workspace call fails or returns `None`, if the dense
    /// matrix is not `total × total`, if any HVP column does not have length
    /// `total`, if either matrix holds a non-finite entry, or if the relative
    /// difference bound is exceeded.
    fn assert_dense_matches_canonical_basis_hvp(
        workspace: &dyn crate::custom_family::ExactNewtonJointHessianWorkspace,
        total: usize,
        label: &str,
    ) {
        let dense = workspace
            .hessian_dense()
            .expect("hessian_dense call")
            .expect("hessian_dense present");
        assert_eq!(dense.nrows(), total);
        assert_eq!(dense.ncols(), total);
        // `f64::max` ignores NaN operands, so the max-fold further down would
        // silently skip a NaN difference. Reject non-finite entries up front.
        assert!(
            dense.iter().all(|d| d.is_finite()),
            "{label} hessian_dense contains a non-finite entry"
        );

        // Reconstruct H column-by-column via canonical-basis HVPs (the path
        // the dense build replaces).
        let mut assembled = Array2::<f64>::zeros((total, total));
        for j in 0..total {
            let mut e = Array1::<f64>::zeros(total);
            e[j] = 1.0;
            let col = workspace
                .hessian_matvec(&e)
                .expect("matvec call")
                .expect("matvec present");
            // `assign` broadcasts, so a length-1 result would otherwise be
            // accepted and smeared across the whole column.
            assert_eq!(
                col.len(),
                total,
                "{label} hessian_matvec returned a column of the wrong length"
            );
            assembled.column_mut(j).assign(&col);
        }
        assert!(
            assembled.iter().all(|a| a.is_finite()),
            "{label} canonical-basis HVP assembly contains a non-finite entry"
        );
        let assembled_sym = 0.5 * (&assembled + &assembled.t());

        let max_rel = dense
            .iter()
            .zip(assembled_sym.iter())
            .map(|(d, a)| ((d - a) / d.abs().max(a.abs()).max(1.0)).abs())
            .fold(0.0_f64, f64::max);
        assert!(
            max_rel < 1e-12,
            "{label} hessian_dense vs canonical HVP max relative diff: {max_rel:.3e}"
        );
    }

    /// Equivalence guard for the Gaussian location-scale `hessian_dense` hook:
    /// the dense build is compared, via
    /// `assert_dense_matches_canonical_basis_hvp`, against the Hessian
    /// reassembled column-by-column from `hessian_matvec`.
    ///
    /// Original author's note: the dispatch site
    /// `exact_newton_joint_hessian_source_from_workspace` prefers
    /// `hessian_dense` over the canonical-basis HVP fallback at biobank scale,
    /// so a regression in the GEMM fill (e.g. swapped block coordinates, sign
    /// error in `coeff_ml`) should fail here before it can corrupt
    /// outer-Hessian assembly.
    #[test]
    fn gaussian_location_scale_hessian_dense_matches_canonical_basis_hvp_path() {
        assert!(file!().ends_with(".rs"));
        let (family, states, specs) = gls_workspace_fixture();
        // Total coefficient count across the mean and log-sigma blocks.
        let p_mu = states[0].beta.len();
        let p_ls = states[1].beta.len();
        let total = p_mu + p_ls;

        let workspace = family
            .exact_newton_joint_hessian_workspace(&states, &specs)
            .expect("workspace build")
            .expect("workspace present");

        assert_dense_matches_canonical_basis_hvp(workspace.as_ref(), total, "GLS");
    }

    /// Equivalence guard for the binomial location-scale `hessian_dense`
    /// hook: the dense build is compared, via
    /// `assert_dense_matches_canonical_basis_hvp`, against the Hessian
    /// reassembled column-by-column from `hessian_matvec`. Structured like
    /// the Gaussian non-wiggle test.
    #[test]
    fn binomial_location_scale_hessian_dense_matches_canonical_basis_hvp_path() {
        assert!(file!().ends_with(".rs"));
        let (family, states, specs) = bls_workspace_fixture();
        // Total coefficient count across the threshold and log-sigma blocks.
        let p_threshold = states[0].beta.len();
        let p_ls = states[1].beta.len();
        let total = p_threshold + p_ls;

        let workspace = family
            .exact_newton_joint_hessian_workspace(&states, &specs)
            .expect("workspace build")
            .expect("workspace present");

        assert_dense_matches_canonical_basis_hvp(workspace.as_ref(), total, "BLS");
    }

    /// Equivalence guard for the Gaussian location-scale-wiggle
    /// `hessian_dense` hook: the dense build over all three coefficient
    /// blocks is compared, via `assert_dense_matches_canonical_basis_hvp`,
    /// against the canonical-basis HVP path.
    ///
    /// Original author's note: the wiggle GEMMs pinned here are h_mm, h_ml,
    /// h_ll, h_mw_b, h_mw_d, h_lw, h_ww, and the GLS wiggle has only a single
    /// ls↔wiggle GEMM because the σ-chain doesn't enter the wiggle term.
    /// NOTE(review): the earlier comment said "six" GEMMs while listing these
    /// seven names — confirm the count against the workspace implementation.
    #[test]
    fn gaussian_location_scale_wiggle_hessian_dense_matches_canonical_basis_hvp_path() {
        assert!(file!().ends_with(".rs"));
        let (family, states, specs, _xmu, _xls, _xw) = gls_wiggle_workspace_fixture();
        // Sum of the first three blocks' coefficient counts.
        let block_dims = [
            states[0].beta.len(),
            states[1].beta.len(),
            states[2].beta.len(),
        ];
        let total = block_dims[0] + block_dims[1] + block_dims[2];

        let workspace = family
            .exact_newton_joint_hessian_workspace(&states, &specs)
            .expect("workspace build")
            .expect("workspace present");

        assert_dense_matches_canonical_basis_hvp(workspace.as_ref(), total, "GLSW");
    }

    /// Equivalence guard for the binomial location-scale-wiggle
    /// `hessian_dense` hook: the dense build over all three coefficient
    /// blocks is compared, via `assert_dense_matches_canonical_basis_hvp`,
    /// against the canonical-basis HVP path.
    ///
    /// Original author's note: this pins all eight wiggle GEMMs (h_tt, h_tl,
    /// h_ll, h_tw_b, h_tw_d, h_lw_b, h_lw_d, h_ww).
    #[test]
    fn binomial_location_scale_wiggle_hessian_dense_matches_canonical_basis_hvp_path() {
        assert!(file!().ends_with(".rs"));
        let (family, states, specs, _xt, _xls, _xw) = bls_wiggle_workspace_fixture();
        // Sum of the first three blocks' coefficient counts.
        let block_dims = [
            states[0].beta.len(),
            states[1].beta.len(),
            states[2].beta.len(),
        ];
        let total = block_dims[0] + block_dims[1] + block_dims[2];

        let workspace = family
            .exact_newton_joint_hessian_workspace(&states, &specs)
            .expect("workspace build")
            .expect("workspace present");

        assert_dense_matches_canonical_basis_hvp(workspace.as_ref(), total, "BLSW");
    }

    /// The workspace's `directional_derivative_operator(d_beta)` must act on
    /// probe vectors like the dense matrix returned by
    /// `exact_newton_joint_hessian_directional_derivative(states, d_beta)`,
    /// component-wise within a mixed 1e-9 relative/absolute tolerance.
    #[test]
    fn gaussian_location_scale_workspace_dh_operator_matches_dense() {
        let (family, states, specs) = gls_workspace_fixture();
        let p = states[0].beta.len() + states[1].beta.len();
        // Direction along which the Hessian is differentiated.
        let d_beta = array![0.07, -0.04, 0.21, 0.08, -0.13];
        assert_eq!(d_beta.len(), p);

        let dense_dh = family
            .exact_newton_joint_hessian_directional_derivative(&states, &d_beta)
            .expect("dense dH build")
            .expect("dense dH present");

        let workspace = family
            .exact_newton_joint_hessian_workspace(&states, &specs)
            .expect("workspace build")
            .expect("workspace present");
        let dh_op = workspace
            .directional_derivative_operator(&d_beta)
            .expect("dH operator call")
            .expect("dH operator present");

        // Two basis vectors plus one mixed direction.
        let probes = [
            array![1.0, 0.0, 0.0, 0.0, 0.0],
            array![0.0, 1.0, 0.0, 0.0, 0.0],
            array![0.30, -0.70, 0.50, -0.20, 0.15],
        ];
        for (k, w) in probes.iter().enumerate() {
            assert_eq!(w.len(), p);
            let dense_out = dense_dh.dot(w);
            let op_out = dh_op.mul_vec(w);
            assert_eq!(op_out.len(), p);
            for i in 0..p {
                let tol = 1e-9 * dense_out[i].abs().max(1.0) + 1e-9;
                let gap = (dense_out[i] - op_out[i]).abs();
                assert!(
                    gap <= tol,
                    "GLS dH op matvec[{k}, {i}] mismatch: dense={:.6e}, op={:.6e}",
                    dense_out[i],
                    op_out[i]
                );
            }
        }
    }

    /// The binomial location-scale workspace's `hessian_diagonal` and
    /// `hessian_matvec` must reproduce the dense joint Hessian from
    /// `exact_newton_joint_hessian`: entry-wise on the diagonal, and
    /// component-wise on `H·v` for four fixed directions.
    #[test]
    fn binomial_location_scale_workspace_matvec_matches_dense() {
        // Original author's note: probit + logb-sigma is the
        // production-pipeline link combination, so the cross-block
        // coefficient `coeff_tl` must agree with the dense assembly to tight
        // tolerance. The directions below are fixed, not randomly drawn.
        let (family, states, specs) = bls_workspace_fixture();
        let p = states[0].beta.len() + states[1].beta.len();

        let dense = family
            .exact_newton_joint_hessian(&states)
            .expect("dense joint Hessian build")
            .expect("dense joint Hessian present");
        assert_eq!(dense.nrows(), p);

        let workspace = family
            .exact_newton_joint_hessian_workspace(&states, &specs)
            .expect("workspace build")
            .expect("workspace present");

        // --- diagonal ------------------------------------------------------
        let diag_op = workspace
            .hessian_diagonal()
            .expect("diagonal call")
            .expect("diagonal present");
        assert_eq!(diag_op.len(), p);
        for i in 0..p {
            let want = dense[[i, i]];
            let got = diag_op[i];
            // Mixed relative/absolute tolerance.
            let tol = 1e-10 * want.abs().max(1.0) + 1e-10;
            assert!(
                (want - got).abs() <= tol,
                "BLS diagonal mismatch at {i}: dense={want:.6e}, workspace={got:.6e}"
            );
        }

        // --- matrix-vector products ------------------------------------------
        let directions = [
            array![1.0, 0.0, 0.0, 0.0, 0.0],
            array![0.0, 0.0, 0.0, 1.0, 0.0],
            array![0.30, -0.70, 0.50, -0.20, 0.15],
            array![-0.42, 0.11, 0.93, 0.05, -0.31],
        ];
        for (k, v) in directions.iter().enumerate() {
            assert_eq!(v.len(), p);
            let dense_hv = dense.dot(v);
            let workspace_hv = workspace
                .hessian_matvec(v)
                .expect("matvec call")
                .expect("matvec present");
            // Indexed on purpose: a short result must panic, not be skipped.
            for i in 0..p {
                let tol = 1e-10 * dense_hv[i].abs().max(1.0) + 1e-10;
                let gap = (dense_hv[i] - workspace_hv[i]).abs();
                assert!(
                    gap <= tol,
                    "BLS matvec[{k}, {i}] mismatch: dense={:.6e}, workspace={:.6e}",
                    dense_hv[i],
                    workspace_hv[i]
                );
            }
        }
    }

    #[test]
    fn binomial_location_scale_operator_workspace_never_densifies_specs() {
        // The family carries no designs of its own and both block specs wrap
        // their design with `no_densify_design` (presumably a design that
        // rejects dense materialization — confirm against the helper). Every
        // workspace quantity is compared with a reference computed directly
        // from the plain matrices `xt` / `xls`.
        let (n, pt, pls) = (8usize, 3usize, 2usize);
        let xt = Array2::from_shape_fn((n, pt), |(row, col)| {
            ((row as f64) * 0.17 + (col as f64) * 0.29).sin()
        });
        let xls = Array2::from_shape_fn((n, pls), |(row, col)| {
            ((row as f64) * 0.23 + (col as f64) * 0.41).cos() * 0.5
        });
        let beta_t = array![0.20, -0.10, 0.05];
        let beta_ls = array![0.30, -0.15];
        let eta_t = xt.dot(&beta_t);
        let eta_ls = xls.dot(&beta_ls);

        let family = BinomialLocationScaleFamily {
            // Alternating 1/0 responses, starting with 1 at index 0.
            y: Array1::from_shape_fn(n, |obs| if obs % 2 == 0 { 1.0 } else { 0.0 }),
            weights: Array1::from_elem(n, 1.0),
            link_kind: InverseLink::Standard(StandardLink::Probit),
            threshold_design: None,
            log_sigma_design: None,
            policy: crate::resource::ResourcePolicy::default_library(),
        };
        let states = vec![
            ParameterBlockState {
                beta: beta_t,
                eta: eta_t,
            },
            ParameterBlockState {
                beta: beta_ls,
                eta: eta_ls,
            },
        ];

        // Both blocks share the same unpenalized spec layout; only the name
        // and the design differ.
        let unpenalized_spec = |name: &str, design| ParameterBlockSpec {
            name: name.to_string(),
            design,
            offset: Array1::zeros(n),
            penalties: Vec::new(),
            nullspace_dims: Vec::new(),
            initial_log_lambdas: Array1::zeros(0),
            initial_beta: None,
            gauge_priority: 100,
            jacobian_callback: None,
            stacked_design: None,
            stacked_offset: None,
        };
        let specs = vec![
            unpenalized_spec("threshold", no_densify_design(xt.clone())),
            unpenalized_spec("log_sigma", no_densify_design(xls.clone())),
        ];
        assert!(family.inner_coefficient_hessian_hvp_available(&specs));

        // Mixed relative/absolute tolerances: `tight` for H, Hv, the diagonal
        // and the gradient; `loose` for the derivative operators.
        let tight = |reference: f64| 1e-10 * reference.abs().max(1.0) + 1e-10;
        let loose = |reference: f64| 1e-9 * reference.abs().max(1.0) + 1e-9;

        // --- Dense Hessian materialized from the operator workspace ---
        let dense_h = family
            .exact_newton_joint_hessian_from_designs(&states, &xt, &xls)
            .expect("dense reference Hessian")
            .expect("dense Hessian present");
        let workspace = family
            .exact_newton_joint_hessian_workspace(&states, &specs)
            .expect("operator workspace build")
            .expect("operator workspace present");
        let got_h = workspace
            .hessian_dense()
            .expect("operator-backed dense Hessian")
            .expect("operator-backed dense Hessian present");
        assert_eq!(got_h.dim(), dense_h.dim());
        for ((i, j), &want) in dense_h.indexed_iter() {
            let got = got_h[[i, j]];
            assert!(
                (want - got).abs() <= tight(want),
                "lazy BLS dense Hessian mismatch at ({i}, {j}): dense={want:.6e}, op={got:.6e}"
            );
        }

        // --- Hessian-vector product ---
        let v = array![0.30, -0.70, 0.50, -0.20, 0.15];
        let p = v.len();
        let got_hv = workspace
            .hessian_matvec(&v)
            .expect("operator matvec")
            .expect("operator matvec present");
        let want_hv = dense_h.dot(&v);
        for i in 0..p {
            let (want_i, got_i) = (want_hv[i], got_hv[i]);
            assert!(
                (want_i - got_i).abs() <= tight(want_i),
                "lazy BLS Hv mismatch at {i}: dense={:.6e}, op={:.6e}",
                want_i,
                got_i
            );
        }

        // --- Hessian diagonal ---
        let got_diag = workspace
            .hessian_diagonal()
            .expect("operator diagonal")
            .expect("operator diagonal present");
        for i in 0..p {
            let want = dense_h[[i, i]];
            let got_i = got_diag[i];
            assert!(
                (want - got_i).abs() <= tight(want),
                "lazy BLS diagonal mismatch at {i}: dense={:.6e}, op={:.6e}",
                want,
                got_i
            );
        }

        // --- Gradient and log-likelihood ---
        let dense_xt = DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(xt.clone()));
        let dense_xls = DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(xls.clone()));
        let want_grad = family
            .exact_newton_joint_gradient_from_designs(&states, &dense_xt, &dense_xls)
            .expect("dense reference gradient");
        let got_grad = family
            .exact_newton_joint_gradient_evaluation(&states, &specs)
            .expect("operator gradient")
            .expect("operator gradient present");
        let loglik_gap = (want_grad.log_likelihood - got_grad.log_likelihood).abs();
        assert!(
            loglik_gap <= 1e-12,
            "operator gradient log-likelihood mismatch"
        );
        for i in 0..p {
            let (want, got) = (want_grad.gradient[i], got_grad.gradient[i]);
            assert!(
                (want - got).abs() <= tight(want),
                "lazy BLS gradient mismatch at {i}: dense={:.6e}, op={:.6e}",
                want,
                got
            );
        }

        // --- First directional derivative of the Hessian, applied to v ---
        let d_beta = array![0.07, -0.04, 0.21, 0.08, -0.13];
        let dense_dh = family
            .exact_newton_joint_hessian_directional_derivative_from_designs(
                &states, &xt, &xls, &d_beta,
            )
            .expect("dense dH")
            .expect("dense dH present");
        let got_dh_v = workspace
            .directional_derivative_operator(&d_beta)
            .expect("operator dH")
            .expect("operator dH present")
            .mul_vec(&v);
        let want_dh_v = dense_dh.dot(&v);
        for i in 0..p {
            let (want_i, got_i) = (want_dh_v[i], got_dh_v[i]);
            assert!(
                (want_i - got_i).abs() <= loose(want_i),
                "lazy BLS dH*v mismatch at {i}: dense={:.6e}, op={:.6e}",
                want_i,
                got_i
            );
        }

        // --- Second directional derivative of the Hessian, applied to v ---
        let d_beta_v = array![-0.11, 0.13, -0.05, -0.22, 0.09];
        let dense_d2h = family
            .exact_newton_joint_hessiansecond_directional_derivative_from_designs(
                &states, &xt, &xls, &d_beta, &d_beta_v,
            )
            .expect("dense d2H")
            .expect("dense d2H present");
        let got_d2h_v = workspace
            .second_directional_derivative_operator(&d_beta, &d_beta_v)
            .expect("operator d2H")
            .expect("operator d2H present")
            .mul_vec(&v);
        let want_d2h_v = dense_d2h.dot(&v);
        for i in 0..p {
            let (want_i, got_i) = (want_d2h_v[i], got_d2h_v[i]);
            assert!(
                (want_i - got_i).abs() <= loose(want_i),
                "lazy BLS d2H*v mismatch at {i}: dense={:.6e}, op={:.6e}",
                want_i,
                got_i
            );
        }
    }

    #[test]
    fn binomial_location_scale_workspace_dh_operator_matches_dense() {
        // The workspace's first directional-derivative operator, applied to a
        // handful of probe vectors, must reproduce the dense dH matrix times
        // the same vectors.
        let (family, states, specs) = bls_workspace_fixture();
        let p = states[0].beta.len() + states[1].beta.len();
        let direction = array![0.07, -0.04, 0.21, 0.08, -0.13];
        assert_eq!(direction.len(), p);

        let dense_dh = family
            .exact_newton_joint_hessian_directional_derivative(&states, &direction)
            .expect("dense dH build")
            .expect("dense dH present");

        let workspace = family
            .exact_newton_joint_hessian_workspace(&states, &specs)
            .expect("workspace build")
            .expect("workspace present");
        let dh_op = workspace
            .directional_derivative_operator(&direction)
            .expect("dH operator call")
            .expect("dH operator present");

        // Mixed relative/absolute tolerance for each component.
        let tolerance = |reference: f64| 1e-9 * reference.abs().max(1.0) + 1e-9;

        // Two unit vectors in the first block plus one mixed direction.
        let probes = [
            array![1.0, 0.0, 0.0, 0.0, 0.0],
            array![0.0, 1.0, 0.0, 0.0, 0.0],
            array![0.30, -0.70, 0.50, -0.20, 0.15],
        ];
        for (k, w) in probes.iter().enumerate() {
            assert_eq!(w.len(), p);
            let want = dense_dh.dot(w);
            let got = dh_op.mul_vec(w);
            for i in 0..p {
                let (want_i, got_i) = (want[i], got[i]);
                assert!(
                    (want_i - got_i).abs() <= tolerance(want_i),
                    "BLS dH op matvec[{k}, {i}] mismatch: dense={:.6e}, op={:.6e}",
                    want_i,
                    got_i
                );
            }
        }
    }

    #[test]
    fn binomial_location_scale_workspace_d2h_operator_matches_dense() {
        // The workspace's second directional-derivative operator, applied to
        // a handful of probe vectors, must reproduce the dense d2H matrix
        // times the same vectors.
        let (family, states, specs) = bls_workspace_fixture();
        let p = states[0].beta.len() + states[1].beta.len();
        let first_dir = array![0.07, -0.04, 0.21, 0.08, -0.13];
        let second_dir = array![-0.11, 0.13, -0.05, -0.22, 0.09];
        assert_eq!(first_dir.len(), p);
        assert_eq!(second_dir.len(), p);

        let dense_d2h = family
            .exact_newton_joint_hessiansecond_directional_derivative(
                &states,
                &first_dir,
                &second_dir,
            )
            .expect("dense d2H build")
            .expect("dense d2H present");

        let workspace = family
            .exact_newton_joint_hessian_workspace(&states, &specs)
            .expect("workspace build")
            .expect("workspace present");
        let d2h_op = workspace
            .second_directional_derivative_operator(&first_dir, &second_dir)
            .expect("d2H operator call")
            .expect("d2H operator present");

        // Mixed relative/absolute tolerance for each component.
        let tolerance = |reference: f64| 1e-9 * reference.abs().max(1.0) + 1e-9;

        // Two unit vectors in the first block plus one mixed direction.
        let probes = [
            array![1.0, 0.0, 0.0, 0.0, 0.0],
            array![0.0, 1.0, 0.0, 0.0, 0.0],
            array![0.30, -0.70, 0.50, -0.20, 0.15],
        ];
        for (k, w) in probes.iter().enumerate() {
            let want = dense_d2h.dot(w);
            let got = d2h_op.mul_vec(w);
            for i in 0..p {
                let (want_i, got_i) = (want[i], got[i]);
                assert!(
                    (want_i - got_i).abs() <= tolerance(want_i),
                    "BLS d2H op matvec[{k}, {i}] mismatch: dense={:.6e}, op={:.6e}",
                    want_i,
                    got_i
                );
            }
        }
    }

    #[test]
    fn binomial_location_scale_projected_trace_cache_matches_dense() {
        // Checks the projected-factor trace of the dH and d2H operators, with
        // and without a `ProjectedFactorCache`, against a reference computed
        // from the operator's own dense form. A second phase then verifies
        // that mutating the factor after a cached call does not yield a stale
        // result.
        let (family, states, specs) = bls_workspace_fixture();
        let p = states[0].beta.len() + states[1].beta.len();
        let d_beta_u = array![0.07, -0.04, 0.21, 0.08, -0.13];
        let d_beta_v = array![-0.11, 0.13, -0.05, -0.22, 0.09];
        // Deterministic p x 3 projection factor.
        let factor = Array2::from_shape_fn((p, 3), |(i, j)| {
            ((i as f64 + 1.0) * 0.19 + (j as f64 + 0.5) * 0.37).sin()
        });

        let workspace = family
            .exact_newton_joint_hessian_workspace(&states, &specs)
            .expect("workspace build")
            .expect("workspace present");
        let dh_op = workspace
            .directional_derivative_operator(&d_beta_u)
            .expect("dH operator call")
            .expect("dH operator present");
        let d2h_op = workspace
            .second_directional_derivative_operator(&d_beta_u, &d_beta_v)
            .expect("d2H operator call")
            .expect("d2H operator present");
        // One cache instance is shared by every cached call in this test, so
        // the order of the calls below matters.
        let cache = crate::solver::estimate::reml::unified::ProjectedFactorCache::default();

        for (name, op) in [("dH", dh_op.clone()), ("d2H", d2h_op.clone())] {
            // Reference: sum over all entries of factor ∘ (dense · factor),
            // i.e. the trace of factorᵀ · dense · factor.
            let dense = op.to_dense();
            let dense_projected = dense.dot(&factor);
            let want: f64 = factor
                .iter()
                .zip(dense_projected.iter())
                .map(|(&f, &bf)| f * bf)
                .sum();
            // The cached path is invoked twice with the same factor and cache;
            // presumably the second call is served from the cache — confirm
            // against `trace_projected_factor_cached`.
            let uncached = op.trace_projected_factor(&factor);
            let cached_first = op.trace_projected_factor_cached(&factor, &cache);
            let cached_second = op.trace_projected_factor_cached(&factor, &cache);

            for (label, got) in [
                ("uncached", uncached),
                ("cached_first", cached_first),
                ("cached_second", cached_second),
            ] {
                let tol = 1e-9 * want.abs().max(1.0) + 1e-9;
                assert!(
                    (want - got).abs() <= tol,
                    "{name} projected trace {label} mismatch: dense={want:.6e}, got={got:.6e}"
                );
            }
        }

        // Stale-content check: call the cached path on a copy of the factor,
        // change one entry of that same array in place, and call again. The
        // second result must match a reference built from the modified factor.
        let mut reused_factor = factor.clone();
        let cached_probe = dh_op.trace_projected_factor_cached(&reused_factor, &cache);
        assert!(cached_probe.is_finite());
        reused_factor[[0, 0]] += 0.25;
        let dense = dh_op.to_dense();
        let dense_projected = dense.dot(&reused_factor);
        let want: f64 = reused_factor
            .iter()
            .zip(dense_projected.iter())
            .map(|(&f, &bf)| f * bf)
            .sum();
        let got = dh_op.trace_projected_factor_cached(&reused_factor, &cache);
        let tol = 1e-9 * want.abs().max(1.0) + 1e-9;
        assert!(
            (want - got).abs() <= tol,
            "cached projected trace reused stale factor contents: dense={want:.6e}, got={got:.6e}"
        );
    }

    #[test]
    #[should_panic(expected = "two-block cached projected trace factor row mismatch")]
    fn binomial_location_scale_projected_trace_rejects_wrong_factor_rows() {
        // Passing a factor with one row more than the joint coefficient count
        // to the cached projected trace must panic with the expected message.
        let (family, states, specs) = bls_workspace_fixture();
        let p = states[0].beta.len() + states[1].beta.len();
        let direction = array![0.07, -0.04, 0.21, 0.08, -0.13];

        let workspace = family
            .exact_newton_joint_hessian_workspace(&states, &specs)
            .expect("workspace build")
            .expect("workspace present");
        let dh_op = workspace
            .directional_derivative_operator(&direction)
            .expect("dH operator call")
            .expect("dH operator present");

        let oversized_factor = Array2::<f64>::zeros((p + 1, 2));
        let cache = crate::solver::estimate::reml::unified::ProjectedFactorCache::default();
        // The return value is irrelevant; the call itself is expected to panic.
        let _ = dh_op.trace_projected_factor_cached(&oversized_factor, &cache);
    }

    #[test]
    fn binomial_location_scale_workspace_dh_operator_finite_difference() {
        // Centered finite-difference check of the first directional derivative:
        //     [H(β + ε u) v − H(β − ε u) v] / (2ε) ≈ DH[u] v
        // H is the dense joint Hessian; DH[u] is the workspace operator built
        // at the unperturbed states.
        let (family, states, specs) = bls_workspace_fixture();
        let pt = states[0].beta.len();
        let p = pt + states[1].beta.len();
        let u = array![0.07, -0.04, 0.21, 0.08, -0.13];
        let v = array![0.30, -0.70, 0.50, -0.20, 0.15];
        let eps = 1e-6;

        // States at β + sign·ε·u, with η recomputed from the fixture's dense
        // designs so that η stays consistent with the shifted β.
        let shifted_states = |sign: f64| -> Vec<ParameterBlockState> {
            let mut out = states.clone();
            for (j, coef) in out[0].beta.iter_mut().enumerate() {
                *coef += sign * eps * u[j];
            }
            // The second block's direction entries start at offset `pt`.
            for (j, coef) in out[1].beta.iter_mut().enumerate() {
                *coef += sign * eps * u[pt + j];
            }
            let xt_dense = specs[0].design.as_dense_ref().expect("dense xt");
            let xls_dense = specs[1].design.as_dense_ref().expect("dense xls");
            out[0].eta = xt_dense.dot(&out[0].beta);
            out[1].eta = xls_dense.dot(&out[1].beta);
            out
        };
        let states_plus = shifted_states(1.0);
        let states_minus = shifted_states(-1.0);

        let h_plus = family
            .exact_newton_joint_hessian(&states_plus)
            .expect("dense H+")
            .expect("dense H+ present");
        let h_minus = family
            .exact_newton_joint_hessian(&states_minus)
            .expect("dense H-")
            .expect("dense H- present");
        let fd = (h_plus.dot(&v) - h_minus.dot(&v)) / (2.0 * eps);

        let workspace = family
            .exact_newton_joint_hessian_workspace(&states, &specs)
            .expect("workspace build")
            .expect("workspace present");
        let analytic = workspace
            .directional_derivative_operator(&u)
            .expect("dH op call")
            .expect("dH op present")
            .mul_vec(&v);

        for i in 0..p {
            let (fd_i, analytic_i) = (fd[i], analytic[i]);
            let tol = 1e-5 * fd_i.abs().max(1.0) + 1e-5;
            assert!(
                (fd_i - analytic_i).abs() <= tol,
                "BLS dH FD mismatch at {i}: fd={:.6e}, analytic={:.6e}",
                fd_i,
                analytic_i
            );
        }
    }

    #[test]
    fn binomial_location_scale_workspace_d2h_operator_finite_difference() {
        // Centered finite-difference check of the second directional derivative:
        //     [DH(β + ε u_fd)[u] w − DH(β − ε u_fd)[u] w] / (2ε) ≈ D²H[u_fd, u] w
        // `u_fd` is the direction β is perturbed along, `u` is the direction
        // of the dense first derivative, and `w` is the vector `probe`.
        let (family, states, specs) = bls_workspace_fixture();
        let pt = states[0].beta.len();
        let p = pt + states[1].beta.len();
        let u = array![0.07, -0.04, 0.21, 0.08, -0.13];
        let u_fd = array![0.30, -0.70, 0.50, -0.20, 0.15];
        let probe = array![-0.21, 0.11, 0.05, 0.32, -0.04];
        let eps = 1e-6;

        // States at β + sign·ε·u_fd, with η recomputed from the fixture's
        // dense designs.
        let shifted_states = |sign: f64| -> Vec<ParameterBlockState> {
            let mut out = states.clone();
            for (j, coef) in out[0].beta.iter_mut().enumerate() {
                *coef += sign * eps * u_fd[j];
            }
            // The second block's direction entries start at offset `pt`.
            for (j, coef) in out[1].beta.iter_mut().enumerate() {
                *coef += sign * eps * u_fd[pt + j];
            }
            let xt_dense = specs[0].design.as_dense_ref().expect("dense xt");
            let xls_dense = specs[1].design.as_dense_ref().expect("dense xls");
            out[0].eta = xt_dense.dot(&out[0].beta);
            out[1].eta = xls_dense.dot(&out[1].beta);
            out
        };
        let states_plus = shifted_states(1.0);
        let states_minus = shifted_states(-1.0);

        let dh_plus = family
            .exact_newton_joint_hessian_directional_derivative(&states_plus, &u)
            .expect("dense dH+")
            .expect("dense dH+ present");
        let dh_minus = family
            .exact_newton_joint_hessian_directional_derivative(&states_minus, &u)
            .expect("dense dH-")
            .expect("dense dH- present");
        let fd = (dh_plus.dot(&probe) - dh_minus.dot(&probe)) / (2.0 * eps);

        let workspace = family
            .exact_newton_joint_hessian_workspace(&states, &specs)
            .expect("workspace build")
            .expect("workspace present");
        let analytic = workspace
            .second_directional_derivative_operator(&u_fd, &u)
            .expect("d2H op call")
            .expect("d2H op present")
            .mul_vec(&probe);

        for i in 0..p {
            let (fd_i, analytic_i) = (fd[i], analytic[i]);
            let tol = 5e-5 * fd_i.abs().max(1.0) + 5e-5;
            assert!(
                (fd_i - analytic_i).abs() <= tol,
                "BLS d2H FD mismatch at {i}: fd={:.6e}, analytic={:.6e}",
                fd_i,
                analytic_i
            );
        }
    }

    #[test]
    fn gaussian_location_scale_workspace_d2h_operator_matches_dense() {
        // Gaussian location-scale counterpart of the d2H operator check: the
        // workspace's second directional-derivative operator must reproduce
        // the dense d2H matrix on each probe vector.
        let (family, states, specs) = gls_workspace_fixture();
        let p = states[0].beta.len() + states[1].beta.len();
        let first_dir = array![0.07, -0.04, 0.21, 0.08, -0.13];
        let second_dir = array![-0.11, 0.13, -0.05, -0.22, 0.09];

        let dense_d2h = family
            .exact_newton_joint_hessiansecond_directional_derivative(
                &states,
                &first_dir,
                &second_dir,
            )
            .expect("dense d2H build")
            .expect("dense d2H present");

        let workspace = family
            .exact_newton_joint_hessian_workspace(&states, &specs)
            .expect("workspace build")
            .expect("workspace present");
        let d2h_op = workspace
            .second_directional_derivative_operator(&first_dir, &second_dir)
            .expect("d2H op call")
            .expect("d2H op present");

        // Mixed relative/absolute tolerance for each component.
        let tolerance = |reference: f64| 1e-9 * reference.abs().max(1.0) + 1e-9;

        // Two unit vectors in the first block plus one mixed direction.
        let probes = [
            array![1.0, 0.0, 0.0, 0.0, 0.0],
            array![0.0, 1.0, 0.0, 0.0, 0.0],
            array![0.30, -0.70, 0.50, -0.20, 0.15],
        ];
        for (k, w) in probes.iter().enumerate() {
            let want = dense_d2h.dot(w);
            let got = d2h_op.mul_vec(w);
            for i in 0..p {
                let (want_i, got_i) = (want[i], got[i]);
                assert!(
                    (want_i - got_i).abs() <= tolerance(want_i),
                    "GLS d2H op matvec[{k}, {i}] mismatch: dense={:.6e}, op={:.6e}",
                    want_i,
                    got_i
                );
            }
        }
    }

    #[test]
    fn binomial_location_scale_wiggle_workspace_matvec_matches_dense() {
        // Probit + linkwiggle is the production-pipeline supervised link.
        // This is the load-bearing cross-block test: it pins the b/d wiggle
        // coefficients (`coeff_tw_b/d`, `coeff_lw_b/d`, `coeffww`) and the
        // t↔ℓ block against the dense assembly used by
        // `exact_newton_joint_hessian` for the wiggle variant.
        let (family, states, specs, _xt, _xls, wiggle_design_current) =
            bls_wiggle_workspace_fixture();
        // Block widths: threshold, log-sigma, wiggle.
        let (pt, pls) = (3usize, 2usize);
        let pw = wiggle_design_current.ncols();
        let p = pt + pls + pw;

        let dense = family
            .exact_newton_joint_hessian(&states)
            .expect("dense joint Hessian build")
            .expect("dense joint Hessian present");
        assert_eq!(dense.nrows(), p);

        let workspace = family
            .exact_newton_joint_hessian_workspace(&states, &specs)
            .expect("workspace build")
            .expect("workspace present");

        // Unit vector of length p with a single 1.0 at index `hot`.
        let unit = |hot: usize| Array1::from_shape_fn(p, |i| if i == hot { 1.0 } else { 0.0 });
        let tolerance = |reference: f64| 1e-9 * reference.abs().max(1.0) + 1e-9;

        let directions = vec![
            // First coordinate of each block:
            unit(0),
            unit(pt),
            unit(pt + pls),
            // Mixed direction across all three blocks:
            Array1::from_shape_fn(p, |i| 0.1 * ((i + 1) as f64).cos()),
        ];
        for (k, v) in directions.iter().enumerate() {
            let want = dense.dot(v);
            let got = workspace
                .hessian_matvec(v)
                .expect("matvec call")
                .expect("matvec present");
            for i in 0..p {
                let (want_i, got_i) = (want[i], got[i]);
                assert!(
                    (want_i - got_i).abs() <= tolerance(want_i),
                    "BLSW matvec[{k}, {i}] mismatch: dense={:.6e}, workspace={:.6e}",
                    want_i,
                    got_i
                );
            }
        }
    }

    /// Helper: build a BLS Wiggle family + states + specs fixture
    /// (mirrors the inline structure of
    /// `binomial_location_scale_wiggle_workspace_matvec_matches_dense`).
    fn bls_wiggle_workspace_fixture() -> (
        BinomialLocationScaleWiggleFamily,
        Vec<ParameterBlockState>,
        Vec<ParameterBlockSpec>,
        Array2<f64>,
        Array2<f64>,
        Array2<f64>,
    ) {
        let n = 10usize;
        let pt = 3usize;
        let pls = 2usize;
        let xt = Array2::from_shape_fn((n, pt), |(i, j)| {
            ((i as f64) * 0.17 + (j as f64) * 0.29).sin() * 0.4
        });
        let xls = Array2::from_shape_fn((n, pls), |(i, j)| {
            ((i as f64) * 0.23 + (j as f64) * 0.41).cos() * 0.3
        });
        let beta_t = array![0.20, -0.10, 0.05];
        let beta_ls = array![0.30, -0.15];
        let eta_t = xt.dot(&beta_t);
        let eta_ls = xls.dot(&beta_ls);
        let q_seed = Array1::linspace(-1.0, 1.0, n);
        let (wiggle_block, knots) = BinomialLocationScaleWiggleFamily::buildwiggle_block_input(
            q_seed.view(),
            2,
            3,
            2,
            false,
        )
        .expect("wiggle block");
        let y = Array1::from_iter((0..n).map(|i| if i % 2 == 0 { 1.0 } else { 0.0 }));
        let weights = Array1::from_elem(n, 1.0);
        let threshold_design =
            DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(xt.clone()));
        let log_sigma_design =
            DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(xls.clone()));
        let family = BinomialLocationScaleWiggleFamily {
            y,
            weights,
            link_kind: InverseLink::Standard(StandardLink::Probit),
            threshold_design: Some(threshold_design.clone()),
            log_sigma_design: Some(log_sigma_design.clone()),
            wiggle_knots: knots,
            wiggle_degree: 2,
            policy: crate::resource::ResourcePolicy::default_library(),
        };
        let q0 = Array1::from_iter(
            eta_t
                .iter()
                .zip(eta_ls.iter())
                .map(|(&eta_t_i, &eta_ls_i)| {
                    binomial_location_scale_q0(eta_t_i, exp_sigma_from_eta_scalar(eta_ls_i))
                }),
        );
        let wiggle_design_current = family
            .wiggle_design(q0.view())
            .expect("current wiggle basis");
        let pw = wiggle_design_current.ncols();
        let beta_w = Array1::from_shape_fn(pw, |j| 0.05 * ((j + 1) as f64).cos());
        let eta_w = wiggle_design_current.dot(&beta_w);
        let states = vec![
            ParameterBlockState {
                beta: beta_t,
                eta: eta_t,
            },
            ParameterBlockState {
                beta: beta_ls,
                eta: eta_ls,
            },
            ParameterBlockState {
                beta: beta_w,
                eta: eta_w,
            },
        ];
        let specs = vec![
            ParameterBlockSpec {
                name: "threshold".to_string(),
                design: threshold_design,
                offset: Array1::zeros(n),
                penalties: Vec::new(),
                nullspace_dims: Vec::new(),
                initial_log_lambdas: Array1::zeros(0),
                initial_beta: None,
                gauge_priority: 100,
                jacobian_callback: None,
                stacked_design: None,
                stacked_offset: None,
            },
            ParameterBlockSpec {
                name: "log_sigma".to_string(),
                design: log_sigma_design,
                offset: Array1::zeros(n),
                penalties: Vec::new(),
                nullspace_dims: Vec::new(),
                initial_log_lambdas: Array1::zeros(0),
                initial_beta: None,
                gauge_priority: 100,
                jacobian_callback: None,
                stacked_design: None,
                stacked_offset: None,
            },
            ParameterBlockSpec {
                name: "wiggle".to_string(),
                design: wiggle_block.design,
                offset: Array1::zeros(n),
                penalties: Vec::new(),
                nullspace_dims: Vec::new(),
                initial_log_lambdas: Array1::zeros(0),
                initial_beta: None,
                gauge_priority: 100,
                jacobian_callback: None,
                stacked_design: None,
                stacked_offset: None,
            },
        ];
        (family, states, specs, xt, xls, wiggle_design_current)
    }

    #[test]
    fn binomial_location_scale_wiggle_workspace_dh_operator_matches_dense() {
        // For the three-block wiggle family, the workspace's first
        // directional-derivative operator must reproduce the dense dH matrix
        // on each probe vector.
        let (family, states, specs, _xt, _xls, _xw) = bls_wiggle_workspace_fixture();
        let pt = states[0].beta.len();
        let pls = states[1].beta.len();
        let p = pt + pls + states[2].beta.len();
        let direction = Array1::from_shape_fn(p, |i| 0.05 * ((i + 1) as f64).cos());

        let dense_dh = family
            .exact_newton_joint_hessian_directional_derivative(&states, &direction)
            .expect("dense dH build")
            .expect("dense dH present");
        assert_eq!(dense_dh.nrows(), p);

        let workspace = family
            .exact_newton_joint_hessian_workspace(&states, &specs)
            .expect("workspace build")
            .expect("workspace present");
        let dh_op = workspace
            .directional_derivative_operator(&direction)
            .expect("dH op call")
            .expect("dH op present");

        // Unit vector of length p with a single 1.0 at index `hot`.
        let unit = |hot: usize| Array1::from_shape_fn(p, |i| if i == hot { 1.0 } else { 0.0 });
        let tolerance = |reference: f64| 1e-9 * reference.abs().max(1.0) + 1e-9;

        // First coordinate of each block, then one mixed direction.
        let probes = [
            unit(0),
            unit(pt),
            unit(pt + pls),
            Array1::from_shape_fn(p, |i| 0.07 * ((i + 2) as f64).sin()),
        ];
        for (k, w) in probes.iter().enumerate() {
            let want = dense_dh.dot(w);
            let got = dh_op.mul_vec(w);
            for i in 0..p {
                let (want_i, got_i) = (want[i], got[i]);
                assert!(
                    (want_i - got_i).abs() <= tolerance(want_i),
                    "BLSW dH op matvec[{k}, {i}] mismatch: dense={:.6e}, op={:.6e}",
                    want_i,
                    got_i
                );
            }
        }
    }

    #[test]
    fn binomial_location_scale_wiggle_workspace_dh_operator_finite_difference() {
        // Centered finite-difference check for the wiggle family:
        //     [H(β + ε u) v − H(β − ε u) v] / (2ε) ≈ DH[u] v
        // H is the dense joint Hessian; DH[u] is the workspace operator built
        // at the unperturbed states.
        let (family, states, specs, xt, xls, _xw) = bls_wiggle_workspace_fixture();
        let pt = states[0].beta.len();
        let pls = states[1].beta.len();
        let p = pt + pls + states[2].beta.len();
        let u = Array1::from_shape_fn(p, |i| 0.05 * ((i + 1) as f64).cos());
        let v = Array1::from_shape_fn(p, |i| 0.07 * ((i + 2) as f64).sin());
        let eps = 1e-5;

        // States at β + sign·ε·u. The two linear predictors are recomputed
        // from `xt` / `xls`; the wiggle predictor is recomputed from the
        // wiggle basis evaluated at the shifted q0.
        let shifted_states = |sign: f64| -> Vec<ParameterBlockState> {
            let mut out = states.clone();
            // Offset of each block's slice inside the joint direction `u`.
            let offsets = [0, pt, pt + pls];
            for (block, &offset) in out.iter_mut().zip(offsets.iter()) {
                for (j, coef) in block.beta.iter_mut().enumerate() {
                    *coef += sign * eps * u[offset + j];
                }
            }
            out[0].eta = xt.dot(&out[0].beta);
            out[1].eta = xls.dot(&out[1].beta);
            let q0 = Array1::from_iter(out[0].eta.iter().zip(out[1].eta.iter()).map(
                |(&eta_t, &eta_ls)| {
                    binomial_location_scale_q0(eta_t, exp_sigma_from_eta_scalar(eta_ls))
                },
            ));
            let shifted_basis = family
                .wiggle_design(q0.view())
                .expect("perturbed wiggle basis");
            out[2].eta = shifted_basis.dot(&out[2].beta);
            out
        };
        let states_plus = shifted_states(1.0);
        let states_minus = shifted_states(-1.0);

        let h_plus = family
            .exact_newton_joint_hessian(&states_plus)
            .expect("dense H+")
            .expect("dense H+ present");
        let h_minus = family
            .exact_newton_joint_hessian(&states_minus)
            .expect("dense H-")
            .expect("dense H- present");
        let fd = (h_plus.dot(&v) - h_minus.dot(&v)) / (2.0 * eps);

        let workspace = family
            .exact_newton_joint_hessian_workspace(&states, &specs)
            .expect("workspace build")
            .expect("workspace present");
        let analytic = workspace
            .directional_derivative_operator(&u)
            .expect("dH op call")
            .expect("dH op present")
            .mul_vec(&v);

        for i in 0..p {
            let (fd_i, analytic_i) = (fd[i], analytic[i]);
            let tol = 5e-5 * fd_i.abs().max(1.0) + 5e-5;
            assert!(
                (fd_i - analytic_i).abs() <= tol,
                "BLSW dH FD mismatch at {i}: fd={:.6e}, analytic={:.6e}",
                fd_i,
                analytic_i
            );
        }
    }

    /// Checks that the BLS-wiggle workspace `D²H[u, v]` operator reproduces the
    /// dense second directional derivative of the joint Hessian.
    ///
    /// Probes are one unit vector at the first coordinate of each block plus
    /// one dense cosine vector; each matvec is compared entry by entry.
    #[test]
    fn binomial_location_scale_wiggle_workspace_d2h_operator_matches_dense() {
        let (family, states, specs, _xt, _xls, _xw) = bls_wiggle_workspace_fixture();
        let pt = states[0].beta.len();
        let pls = states[1].beta.len();
        let p = pt + pls + states[2].beta.len();
        let d_beta_u = Array1::from_shape_fn(p, |i| 0.05 * ((i + 1) as f64).cos());
        let d_beta_v = Array1::from_shape_fn(p, |i| 0.07 * ((i + 2) as f64).sin());

        // Reference: the dense matrix from the family itself.
        let dense_d2h = family
            .exact_newton_joint_hessiansecond_directional_derivative(&states, &d_beta_u, &d_beta_v)
            .expect("dense d2H build")
            .expect("dense d2H present");
        assert_eq!(dense_d2h.nrows(), p);

        // Candidate: the operator produced by the workspace.
        let workspace = family
            .exact_newton_joint_hessian_workspace(&states, &specs)
            .expect("workspace build")
            .expect("workspace present");
        let d2h_op = workspace
            .second_directional_derivative_operator(&d_beta_u, &d_beta_v)
            .expect("d2H op call")
            .expect("d2H op present");

        let unit = |hot: usize| -> Array1<f64> {
            Array1::from_shape_fn(p, |i| if i == hot { 1.0 } else { 0.0 })
        };
        let probes = [
            unit(0),
            unit(pt),
            unit(pt + pls),
            Array1::from_shape_fn(p, |i| 0.07 * ((i + 3) as f64).cos()),
        ];
        for (k, w) in probes.iter().enumerate() {
            let want = dense_d2h.dot(w);
            let got = d2h_op.mul_vec(w);
            for i in 0..p {
                let tol = 1e-9 * want[i].abs().max(1.0) + 1e-9;
                let gap = (want[i] - got[i]).abs();
                assert!(
                    gap <= tol,
                    "BLSW d2H op matvec[{k}, {i}] mismatch: dense={:.6e}, op={:.6e}",
                    want[i],
                    got[i]
                );
            }
        }
    }

    /// Central finite-difference check of the BLS-wiggle workspace `D²H` operator.
    ///
    /// FD check: [DH(β + ε u_fd) [u] v − DH(β − ε u_fd) [u] v] / (2ε)
    /// ≈ D²H[u_fd, u] v, where `DH` is the dense directional derivative of the
    /// joint Hessian and the right-hand side comes from the workspace's
    /// second-directional-derivative operator applied to `probe`.
    #[test]
    fn binomial_location_scale_wiggle_workspace_d2h_operator_finite_difference() {
        let (family, states, specs, xt, xls, _xw) = bls_wiggle_workspace_fixture();
        let p = states[0].beta.len() + states[1].beta.len() + states[2].beta.len();
        let u_fd = Array1::from_shape_fn(p, |i| 0.05 * ((i + 1) as f64).cos());
        let u = Array1::from_shape_fn(p, |i| 0.07 * ((i + 2) as f64).sin());
        let probe = Array1::from_shape_fn(p, |i| 0.04 * ((i + 3) as f64).sin());
        let pt = states[0].beta.len();
        let pls = states[1].beta.len();
        let eps = 1e-5;
        let perturb = |sign: f64| -> Vec<ParameterBlockState> {
            let mut out = states.clone();
            for j in 0..pt {
                out[0].beta[j] += sign * eps * u_fd[j];
            }
            for j in 0..pls {
                out[1].beta[j] += sign * eps * u_fd[pt + j];
            }
            for j in 0..(p - pt - pls) {
                out[2].beta[j] += sign * eps * u_fd[pt + pls + j];
            }
            out[0].eta = xt.dot(&out[0].beta);
            out[1].eta = xls.dot(&out[1].beta);
            // Wiggle geometry is dynamic: rebuild the basis at the perturbed
            // q0, exactly as the first-order BLS FD test in this module does.
            // Reusing the baseline design would hold the basis fixed while the
            // threshold and log-sigma coefficients move, so the perturbed state
            // would not be the one those tests differentiate.
            let q0 = Array1::from_iter(out[0].eta.iter().zip(out[1].eta.iter()).map(
                |(&eta_t, &eta_ls)| {
                    binomial_location_scale_q0(eta_t, exp_sigma_from_eta_scalar(eta_ls))
                },
            ));
            out[2].eta = family
                .wiggle_design(q0.view())
                .expect("perturbed wiggle basis")
                .dot(&out[2].beta);
            out
        };
        let states_plus = perturb(1.0);
        let states_minus = perturb(-1.0);
        let dh_plus = family
            .exact_newton_joint_hessian_directional_derivative(&states_plus, &u)
            .expect("dense dH+")
            .expect("dense dH+ present");
        let dh_minus = family
            .exact_newton_joint_hessian_directional_derivative(&states_minus, &u)
            .expect("dense dH-")
            .expect("dense dH- present");
        let fd = (dh_plus.dot(&probe) - dh_minus.dot(&probe)) / (2.0 * eps);

        let workspace = family
            .exact_newton_joint_hessian_workspace(&states, &specs)
            .expect("workspace build")
            .expect("workspace present");
        let d2h_op = workspace
            .second_directional_derivative_operator(&u_fd, &u)
            .expect("d2H op call")
            .expect("d2H op present");
        let analytic = d2h_op.mul_vec(&probe);

        // Mixed relative/absolute tolerance sized for a central difference.
        for i in 0..p {
            let tol = 5e-5 * fd[i].abs().max(1.0) + 5e-5;
            assert!(
                (fd[i] - analytic[i]).abs() <= tol,
                "BLSW d2H FD mismatch at {i}: fd={:.6e}, analytic={:.6e}",
                fd[i],
                analytic[i]
            );
        }
    }

    /// Checks that the GLS-wiggle workspace Hessian matvec agrees with the
    /// dense joint Hessian.
    ///
    /// The family, block states and block specs are built inline. Both sides
    /// are evaluated at the same `states`, whose wiggle predictor is computed
    /// from the dense design returned for the seed grid. Directions are one
    /// unit vector at the first coordinate of each block plus a dense sine
    /// vector.
    #[test]
    fn gaussian_location_scale_wiggle_workspace_matvec_matches_dense() {
        let n = 10usize;
        let p_mu = 3usize;
        let p_ls = 2usize;
        let xmu = Array2::from_shape_fn((n, p_mu), |(row, col)| {
            ((row as f64) * 0.13 + (col as f64) * 0.31).sin() * 0.4
        });
        let xls = Array2::from_shape_fn((n, p_ls), |(row, col)| {
            ((row as f64) * 0.21 + (col as f64) * 0.47).cos() * 0.3
        });
        let beta_mu = array![0.10, -0.20, 0.30];
        let beta_ls = array![0.40, -0.10];
        let eta_mu = xmu.dot(&beta_mu);
        let eta_ls = xls.dot(&beta_ls);

        // Wiggle block built on an evenly spaced seed grid.
        let q_seed = Array1::linspace(-1.0, 1.0, n);
        let (wiggle_block, knots) = BinomialLocationScaleWiggleFamily::buildwiggle_block_input(
            q_seed.view(),
            2,
            3,
            2,
            false,
        )
        .expect("wiggle block");
        let Some(seed_design) = wiggle_block.design.as_dense_ref() else {
            panic!("wiggle design must be dense for this test fixture")
        };
        let wiggle_design_dense = seed_design.clone();
        let pw = wiggle_design_dense.ncols();
        let beta_w = Array1::from_shape_fn(pw, |j| 0.05 * ((j + 1) as f64).sin());
        let eta_w = wiggle_design_dense.dot(&beta_w);

        let mu_design = DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(xmu.clone()));
        let log_sigma_design =
            DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(xls.clone()));
        let family = GaussianLocationScaleWiggleFamily {
            y: Array1::from_shape_fn(n, |i| 0.5 + 0.1 * (i as f64).cos()),
            weights: Array1::from_elem(n, 1.0),
            mu_design: Some(mu_design.clone()),
            log_sigma_design: Some(log_sigma_design.clone()),
            wiggle_knots: knots,
            wiggle_degree: 2,
            policy: crate::resource::ResourcePolicy::default_library(),
            cached_row_scalars: std::sync::RwLock::new(None),
        };

        let block_state =
            |beta: Array1<f64>, eta: Array1<f64>| ParameterBlockState { beta, eta };
        let states = vec![
            block_state(beta_mu, eta_mu),
            block_state(beta_ls, eta_ls),
            block_state(beta_w, eta_w),
        ];

        // All three specs are unpenalized and differ only in name and design.
        let unpenalized_spec = |name: &str, design: DesignMatrix| ParameterBlockSpec {
            name: name.to_string(),
            design,
            offset: Array1::zeros(n),
            penalties: Vec::new(),
            nullspace_dims: Vec::new(),
            initial_log_lambdas: Array1::zeros(0),
            initial_beta: None,
            gauge_priority: 100,
            jacobian_callback: None,
            stacked_design: None,
            stacked_offset: None,
        };
        let specs = vec![
            unpenalized_spec("mu", mu_design),
            unpenalized_spec("log_sigma", log_sigma_design),
            unpenalized_spec("wiggle", wiggle_block.design),
        ];

        let p = p_mu + p_ls + pw;
        let dense = family
            .exact_newton_joint_hessian(&states)
            .expect("dense joint Hessian build")
            .expect("dense joint Hessian present");
        assert_eq!(dense.nrows(), p);

        let workspace = family
            .exact_newton_joint_hessian_workspace(&states, &specs)
            .expect("workspace build")
            .expect("workspace present");

        let unit = |hot: usize| -> Array1<f64> {
            Array1::from_shape_fn(p, |i| if i == hot { 1.0 } else { 0.0 })
        };
        let directions = [
            unit(0),
            unit(p_mu),
            unit(p_mu + p_ls),
            Array1::from_shape_fn(p, |i| 0.1 * ((i + 1) as f64).sin()),
        ];
        for (k, v) in directions.iter().enumerate() {
            let want = dense.dot(v);
            let got = workspace
                .hessian_matvec(v)
                .expect("matvec call")
                .expect("matvec present");
            for i in 0..p {
                let tol = 1e-9 * want[i].abs().max(1.0) + 1e-9;
                let gap = (want[i] - got[i]).abs();
                assert!(
                    gap <= tol,
                    "GLSW matvec[{k}, {i}] mismatch: dense={:.6e}, workspace={:.6e}",
                    want[i],
                    got[i]
                );
            }
        }
    }

    /// Helper: build a GLS Wiggle family + states + specs fixture
    /// (mirrors the inline structure of
    /// `gaussian_location_scale_wiggle_workspace_matvec_matches_dense`).
    fn gls_wiggle_workspace_fixture() -> (
        GaussianLocationScaleWiggleFamily,
        Vec<ParameterBlockState>,
        Vec<ParameterBlockSpec>,
        Array2<f64>,
        Array2<f64>,
        Array2<f64>,
    ) {
        let n = 10usize;
        let p_mu = 3usize;
        let p_ls = 2usize;
        let xmu = Array2::from_shape_fn((n, p_mu), |(i, j)| {
            ((i as f64) * 0.13 + (j as f64) * 0.31).sin() * 0.4
        });
        let xls = Array2::from_shape_fn((n, p_ls), |(i, j)| {
            ((i as f64) * 0.21 + (j as f64) * 0.47).cos() * 0.3
        });
        let beta_mu = array![0.10, -0.20, 0.30];
        let beta_ls = array![0.40, -0.10];
        let eta_mu = xmu.dot(&beta_mu);
        let eta_ls = xls.dot(&beta_ls);
        let q_seed = Array1::linspace(-1.0, 1.0, n);
        let (wiggle_block, knots) = BinomialLocationScaleWiggleFamily::buildwiggle_block_input(
            q_seed.view(),
            2,
            3,
            2,
            false,
        )
        .expect("wiggle block");
        let pw = wiggle_block.design.ncols();
        let beta_w = Array1::from_shape_fn(pw, |j| 0.05 * ((j + 1) as f64).sin());
        let y = Array1::from_shape_fn(n, |i| 0.5 + 0.1 * (i as f64).cos());
        let weights = Array1::from_elem(n, 1.0);
        let mu_design = DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(xmu.clone()));
        let log_sigma_design =
            DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(xls.clone()));
        let family = GaussianLocationScaleWiggleFamily {
            y,
            weights,
            mu_design: Some(mu_design.clone()),
            log_sigma_design: Some(log_sigma_design.clone()),
            wiggle_knots: knots,
            wiggle_degree: 2,
            policy: crate::resource::ResourcePolicy::default_library(),
            cached_row_scalars: std::sync::RwLock::new(None),
        };
        // The wiggle block has dynamic geometry (q0-dependent basis): the
        // model is q = q0 + B(q0)·β_w, so η_w must be evaluated at the
        // *current* q0, not at the spec's static seed grid. Mirror what
        // `refresh_all_block_etas` does at fit time so the fixture state
        // satisfies the analytical formula's invariant.
        let xw_at_q0 = family
            .wiggle_design(eta_mu.view())
            .expect("wiggle basis at q0");
        let eta_w = xw_at_q0.dot(&beta_w);
        let states = vec![
            ParameterBlockState {
                beta: beta_mu,
                eta: eta_mu,
            },
            ParameterBlockState {
                beta: beta_ls,
                eta: eta_ls,
            },
            ParameterBlockState {
                beta: beta_w,
                eta: eta_w,
            },
        ];
        let specs = vec![
            ParameterBlockSpec {
                name: "mu".to_string(),
                design: mu_design,
                offset: Array1::zeros(n),
                penalties: Vec::new(),
                nullspace_dims: Vec::new(),
                initial_log_lambdas: Array1::zeros(0),
                initial_beta: None,
                gauge_priority: 100,
                jacobian_callback: None,
                stacked_design: None,
                stacked_offset: None,
            },
            ParameterBlockSpec {
                name: "log_sigma".to_string(),
                design: log_sigma_design,
                offset: Array1::zeros(n),
                penalties: Vec::new(),
                nullspace_dims: Vec::new(),
                initial_log_lambdas: Array1::zeros(0),
                initial_beta: None,
                gauge_priority: 100,
                jacobian_callback: None,
                stacked_design: None,
                stacked_offset: None,
            },
            ParameterBlockSpec {
                name: "wiggle".to_string(),
                design: wiggle_block.design,
                offset: Array1::zeros(n),
                penalties: Vec::new(),
                nullspace_dims: Vec::new(),
                initial_log_lambdas: Array1::zeros(0),
                initial_beta: None,
                gauge_priority: 100,
                jacobian_callback: None,
                stacked_design: None,
                stacked_offset: None,
            },
        ];
        (family, states, specs, xmu, xls, xw_at_q0)
    }

    /// Checks that the GLS-wiggle workspace `dH[d_beta]` operator reproduces
    /// the dense directional derivative of the joint Hessian.
    ///
    /// Probes are one unit vector at the first coordinate of each block plus
    /// one dense cosine vector; each matvec is compared entry by entry.
    #[test]
    fn gaussian_location_scale_wiggle_workspace_dh_operator_matches_dense() {
        let (family, states, specs, _xmu, _xls, _xw) = gls_wiggle_workspace_fixture();
        let pmu = states[0].beta.len();
        let pls = states[1].beta.len();
        let p = pmu + pls + states[2].beta.len();
        let d_beta = Array1::from_shape_fn(p, |i| 0.05 * ((i + 1) as f64).sin());

        // Reference: the dense matrix from the family itself.
        let dense_dh = family
            .exact_newton_joint_hessian_directional_derivative(&states, &d_beta)
            .expect("dense dH build")
            .expect("dense dH present");
        assert_eq!(dense_dh.nrows(), p);

        // Candidate: the operator produced by the workspace.
        let workspace = family
            .exact_newton_joint_hessian_workspace(&states, &specs)
            .expect("workspace build")
            .expect("workspace present");
        let dh_op = workspace
            .directional_derivative_operator(&d_beta)
            .expect("dH op call")
            .expect("dH op present");

        let unit = |hot: usize| -> Array1<f64> {
            Array1::from_shape_fn(p, |i| if i == hot { 1.0 } else { 0.0 })
        };
        let probes = [
            unit(0),
            unit(pmu),
            unit(pmu + pls),
            Array1::from_shape_fn(p, |i| 0.07 * ((i + 2) as f64).cos()),
        ];
        for (k, w) in probes.iter().enumerate() {
            let want = dense_dh.dot(w);
            let got = dh_op.mul_vec(w);
            for i in 0..p {
                let tol = 1e-9 * want[i].abs().max(1.0) + 1e-9;
                let gap = (want[i] - got[i]).abs();
                assert!(
                    gap <= tol,
                    "GLSW dH op matvec[{k}, {i}] mismatch: dense={:.6e}, op={:.6e}",
                    want[i],
                    got[i]
                );
            }
        }
    }

    /// Central finite-difference check of the GLS-wiggle workspace `dH[u]` operator.
    ///
    /// Forms `(H(β + εu)·v − H(β − εu)·v) / (2ε)` from the dense joint Hessian
    /// and compares it, entry by entry, with the workspace's
    /// directional-derivative operator (direction `u`) applied to `v`.
    #[test]
    fn gaussian_location_scale_wiggle_workspace_dh_operator_finite_difference() {
        let (family, states, specs, xmu, xls, _xw) = gls_wiggle_workspace_fixture();
        let pmu = states[0].beta.len();
        let pls = states[1].beta.len();
        let pw = states[2].beta.len();
        let p = pmu + pls + pw;
        let u = Array1::from_shape_fn(p, |i| 0.05 * ((i + 1) as f64).cos());
        let v = Array1::from_shape_fn(p, |i| 0.07 * ((i + 2) as f64).sin());
        let eps = 1e-5;

        // Move every block's coefficients by `sign·ε·u` (using that block's
        // slice of `u`), then rebuild all three linear predictors.
        let perturb = |sign: f64| -> Vec<ParameterBlockState> {
            let mut shifted = states.clone();
            for (block, start) in shifted.iter_mut().zip([0, pmu, pmu + pls]) {
                for (j, coef) in block.beta.iter_mut().enumerate() {
                    *coef += sign * eps * u[start + j];
                }
            }
            shifted[0].eta = xmu.dot(&shifted[0].beta);
            shifted[1].eta = xls.dot(&shifted[1].beta);
            // Dynamic wiggle geometry: η_w = B(q0)·β_w must be taken at the
            // perturbed q0, matching what `refresh_all_block_etas` would
            // produce. A static spec design would give the FD of a different
            // model from the one the analytical dH formula assumes (which
            // carries dq/dq0 = 1 + B'(q0)·β_w through the chain rule).
            let wiggle_basis = family
                .wiggle_design(shifted[0].eta.view())
                .expect("wiggle basis at perturbed q0");
            shifted[2].eta = wiggle_basis.dot(&shifted[2].beta);
            shifted
        };
        let forward = perturb(1.0);
        let backward = perturb(-1.0);
        let h_forward = family
            .exact_newton_joint_hessian(&forward)
            .expect("dense H+")
            .expect("dense H+ present");
        let h_backward = family
            .exact_newton_joint_hessian(&backward)
            .expect("dense H-")
            .expect("dense H- present");
        let fd = (h_forward.dot(&v) - h_backward.dot(&v)) / (2.0 * eps);

        let workspace = family
            .exact_newton_joint_hessian_workspace(&states, &specs)
            .expect("workspace build")
            .expect("workspace present");
        let analytic = workspace
            .directional_derivative_operator(&u)
            .expect("dH op call")
            .expect("dH op present")
            .mul_vec(&v);

        // Mixed relative/absolute tolerance sized for a central difference.
        for i in 0..p {
            let tol = 5e-5 * fd[i].abs().max(1.0) + 5e-5;
            let gap = (fd[i] - analytic[i]).abs();
            assert!(
                gap <= tol,
                "GLSW dH FD mismatch at {i}: fd={:.6e}, analytic={:.6e}",
                fd[i],
                analytic[i]
            );
        }
    }

    /// Checks that the GLS-wiggle workspace `D²H[u, v]` operator reproduces the
    /// dense second directional derivative of the joint Hessian.
    ///
    /// Probes are one unit vector at the first coordinate of each block plus
    /// one dense cosine vector; each matvec is compared entry by entry.
    #[test]
    fn gaussian_location_scale_wiggle_workspace_d2h_operator_matches_dense() {
        let (family, states, specs, _xmu, _xls, _xw) = gls_wiggle_workspace_fixture();
        let pmu = states[0].beta.len();
        let pls = states[1].beta.len();
        let p = pmu + pls + states[2].beta.len();
        let d_beta_u = Array1::from_shape_fn(p, |i| 0.05 * ((i + 1) as f64).sin());
        let d_beta_v = Array1::from_shape_fn(p, |i| 0.07 * ((i + 2) as f64).cos());

        // Reference: the dense matrix from the family itself.
        let dense_d2h = family
            .exact_newton_joint_hessiansecond_directional_derivative(&states, &d_beta_u, &d_beta_v)
            .expect("dense d2H build")
            .expect("dense d2H present");
        assert_eq!(dense_d2h.nrows(), p);

        // Candidate: the operator produced by the workspace.
        let workspace = family
            .exact_newton_joint_hessian_workspace(&states, &specs)
            .expect("workspace build")
            .expect("workspace present");
        let d2h_op = workspace
            .second_directional_derivative_operator(&d_beta_u, &d_beta_v)
            .expect("d2H op call")
            .expect("d2H op present");

        let unit = |hot: usize| -> Array1<f64> {
            Array1::from_shape_fn(p, |i| if i == hot { 1.0 } else { 0.0 })
        };
        let probes = [
            unit(0),
            unit(pmu),
            unit(pmu + pls),
            Array1::from_shape_fn(p, |i| 0.07 * ((i + 3) as f64).cos()),
        ];
        for (k, w) in probes.iter().enumerate() {
            let want = dense_d2h.dot(w);
            let got = d2h_op.mul_vec(w);
            for i in 0..p {
                let tol = 1e-9 * want[i].abs().max(1.0) + 1e-9;
                let gap = (want[i] - got[i]).abs();
                assert!(
                    gap <= tol,
                    "GLSW d2H op matvec[{k}, {i}] mismatch: dense={:.6e}, op={:.6e}",
                    want[i],
                    got[i]
                );
            }
        }
    }

    /// Central finite-difference check of the GLS-wiggle workspace `D²H` operator.
    ///
    /// Forms `(DH(β + ε·u_fd)[u]·probe − DH(β − ε·u_fd)[u]·probe) / (2ε)` from
    /// the dense directional derivative and compares it, entry by entry, with
    /// the workspace's second-directional-derivative operator for
    /// `(u_fd, u)` applied to `probe`.
    #[test]
    fn gaussian_location_scale_wiggle_workspace_d2h_operator_finite_difference() {
        let (family, states, specs, xmu, xls, _xw) = gls_wiggle_workspace_fixture();
        let pmu = states[0].beta.len();
        let pls = states[1].beta.len();
        let pw = states[2].beta.len();
        let p = pmu + pls + pw;
        let u_fd = Array1::from_shape_fn(p, |i| 0.05 * ((i + 1) as f64).cos());
        let u = Array1::from_shape_fn(p, |i| 0.07 * ((i + 2) as f64).sin());
        let probe = Array1::from_shape_fn(p, |i| 0.04 * ((i + 3) as f64).sin());
        let eps = 1e-5;

        // Move every block's coefficients by `sign·ε·u_fd` (using that block's
        // slice of `u_fd`), then rebuild all three linear predictors.
        let perturb = |sign: f64| -> Vec<ParameterBlockState> {
            let mut shifted = states.clone();
            for (block, start) in shifted.iter_mut().zip([0, pmu, pmu + pls]) {
                for (j, coef) in block.beta.iter_mut().enumerate() {
                    *coef += sign * eps * u_fd[start + j];
                }
            }
            shifted[0].eta = xmu.dot(&shifted[0].beta);
            shifted[1].eta = xls.dot(&shifted[1].beta);
            // Dynamic wiggle geometry: η_w = B(q0)·β_w at the perturbed q0.
            let wiggle_basis = family
                .wiggle_design(shifted[0].eta.view())
                .expect("wiggle basis at perturbed q0");
            shifted[2].eta = wiggle_basis.dot(&shifted[2].beta);
            shifted
        };
        let forward = perturb(1.0);
        let backward = perturb(-1.0);
        let dh_forward = family
            .exact_newton_joint_hessian_directional_derivative(&forward, &u)
            .expect("dense dH+")
            .expect("dense dH+ present");
        let dh_backward = family
            .exact_newton_joint_hessian_directional_derivative(&backward, &u)
            .expect("dense dH-")
            .expect("dense dH- present");
        let fd = (dh_forward.dot(&probe) - dh_backward.dot(&probe)) / (2.0 * eps);

        let workspace = family
            .exact_newton_joint_hessian_workspace(&states, &specs)
            .expect("workspace build")
            .expect("workspace present");
        let analytic = workspace
            .second_directional_derivative_operator(&u_fd, &u)
            .expect("d2H op call")
            .expect("d2H op present")
            .mul_vec(&probe);

        // Mixed relative/absolute tolerance sized for a central difference.
        for i in 0..p {
            let tol = 5e-5 * fd[i].abs().max(1.0) + 5e-5;
            let gap = (fd[i] - analytic[i]).abs();
            assert!(
                gap <= tol,
                "GLSW d2H FD mismatch at {i}: fd={:.6e}, analytic={:.6e}",
                fd[i],
                analytic[i]
            );
        }
    }

    /// A row with prior weight 0 must be inert in every built-in diagonal family.
    ///
    /// For the Gaussian location-scale (both blocks), Poisson-log and Gamma-log
    /// families, evaluated with weights `[0, 1]`, the test asserts that row 0
    /// has working weight exactly 0 and a working response equal to its own
    /// linear predictor, while row 1 keeps a strictly positive working weight.
    #[test]
    fn zeroweightrows_stay_inactive_in_builtin_diagonal_families() {
        let weights = Array1::from_vec(vec![0.0, 1.0]);

        // Gaussian location-scale: check the mu and log-sigma blocks.
        let gaussian = GaussianLocationScaleFamily {
            y: Array1::from_vec(vec![2.0, -1.0]),
            weights: weights.clone(),
            mu_design: None,
            log_sigma_design: None,
            policy: crate::resource::ResourcePolicy::default_library(),
            cached_row_scalars: std::sync::RwLock::new(None),
        };
        let gaussian_states = [
            ParameterBlockState {
                beta: Array1::zeros(0),
                eta: Array1::from_vec(vec![0.5, -0.25]),
            },
            ParameterBlockState {
                beta: Array1::zeros(0),
                eta: Array1::from_vec(vec![0.1, -0.2]),
            },
        ];
        let gaussian_eval = gaussian
            .evaluate(&gaussian_states)
            .expect("gaussian evaluate");
        {
            let BlockWorkingSet::Diagonal {
                working_response,
                working_weights,
            } = &gaussian_eval.blockworking_sets[GaussianLocationScaleFamily::BLOCK_MU]
            else {
                panic!("expected diagonal Gaussian mu block")
            };
            assert_eq!(working_weights[0], 0.0);
            assert_eq!(working_response[0], 0.5);
            assert!(working_weights[1] > 0.0);
        }
        {
            let BlockWorkingSet::Diagonal {
                working_response,
                working_weights,
            } = &gaussian_eval.blockworking_sets[GaussianLocationScaleFamily::BLOCK_LOG_SIGMA]
            else {
                panic!("expected diagonal Gaussian log-sigma block")
            };
            assert_eq!(working_weights[0], 0.0);
            assert_eq!(working_response[0], 0.1);
            assert!(working_weights[1] > 0.0);
        }

        // Poisson-log.
        let poisson = PoissonLogFamily {
            y: Array1::from_vec(vec![3.0, 1.0]),
            weights: weights.clone(),
        };
        let poisson_states = [ParameterBlockState {
            beta: Array1::zeros(0),
            eta: Array1::from_vec(vec![0.7, -0.4]),
        }];
        let poisson_eval = poisson
            .evaluate(&poisson_states)
            .expect("poisson evaluate");
        {
            let BlockWorkingSet::Diagonal {
                working_response,
                working_weights,
            } = &poisson_eval.blockworking_sets[PoissonLogFamily::BLOCK_ETA]
            else {
                panic!("expected diagonal Poisson block")
            };
            assert_eq!(working_weights[0], 0.0);
            assert_eq!(working_response[0], 0.7);
            assert!(working_weights[1] > 0.0);
        }

        // Gamma-log; takes ownership of `weights` as its last user.
        let gamma = GammaLogFamily {
            y: Array1::from_vec(vec![1.5, 0.8]),
            weights,
            shape: 2.5,
        };
        let gamma_states = [ParameterBlockState {
            beta: Array1::zeros(0),
            eta: Array1::from_vec(vec![0.2, -0.1]),
        }];
        let gamma_eval = gamma.evaluate(&gamma_states).expect("gamma evaluate");
        {
            let BlockWorkingSet::Diagonal {
                working_response,
                working_weights,
            } = &gamma_eval.blockworking_sets[GammaLogFamily::BLOCK_ETA]
            else {
                panic!("expected diagonal Gamma block")
            };
            assert_eq!(working_weights[0], 0.0);
            assert_eq!(working_response[0], 0.2);
            assert!(working_weights[1] > 0.0);
        }
    }

    /// Rows with extreme linear predictors must be flat in the Poisson-log and
    /// Gamma-log diagonal working sets.
    ///
    /// With predictors of ±35 (Poisson) and ±40 (Gamma) in rows 0 and 2, the
    /// test asserts those rows get working weight exactly 0 and a working
    /// response equal to their own predictor, while the moderate middle row
    /// keeps a strictly positive working weight.
    #[test]
    fn hard_clamped_poisson_and_gammarows_stay_locally_flat() {
        // Poisson-log.
        let poisson_eta = Array1::from_vec(vec![-35.0, 0.2, 35.0]);
        let poisson = PoissonLogFamily {
            y: Array1::from_vec(vec![1.0, 2.0, 3.0]),
            weights: Array1::from_vec(vec![1.0, 1.0, 1.0]),
        };
        let poisson_states = [ParameterBlockState {
            beta: Array1::zeros(0),
            eta: poisson_eta.clone(),
        }];
        let poisson_eval = poisson
            .evaluate(&poisson_states)
            .expect("poisson evaluate");
        {
            let BlockWorkingSet::Diagonal {
                working_response,
                working_weights,
            } = &poisson_eval.blockworking_sets[PoissonLogFamily::BLOCK_ETA]
            else {
                panic!("expected diagonal Poisson block")
            };
            assert_eq!(working_weights[0], 0.0);
            assert_eq!(working_response[0], poisson_eta[0]);
            assert!(working_weights[1] > 0.0);
            assert_eq!(working_weights[2], 0.0);
            assert_eq!(working_response[2], poisson_eta[2]);
        }

        // Gamma-log.
        let gamma_eta = Array1::from_vec(vec![-40.0, -0.3, 40.0]);
        let gamma = GammaLogFamily {
            y: Array1::from_vec(vec![0.8, 1.2, 2.5]),
            weights: Array1::from_vec(vec![1.0, 1.0, 1.0]),
            shape: 3.0,
        };
        let gamma_states = [ParameterBlockState {
            beta: Array1::zeros(0),
            eta: gamma_eta.clone(),
        }];
        let gamma_eval = gamma.evaluate(&gamma_states).expect("gamma evaluate");
        {
            let BlockWorkingSet::Diagonal {
                working_response,
                working_weights,
            } = &gamma_eval.blockworking_sets[GammaLogFamily::BLOCK_ETA]
            else {
                panic!("expected diagonal Gamma block")
            };
            assert_eq!(working_weights[0], 0.0);
            assert_eq!(working_response[0], gamma_eta[0]);
            assert!(working_weights[1] > 0.0);
            assert_eq!(working_weights[2], 0.0);
            assert_eq!(working_response[2], gamma_eta[2]);
        }
    }

    /// The Poisson-log diagonal working weight must equal `weight_i · exp(η_i)`.
    ///
    /// The test evaluates the family at two rows and asserts, to within 1e-12,
    /// that each working weight equals that Fisher weight; the assertion
    /// message records that observed and Fisher weights coincide for this
    /// canonical link.
    #[test]
    fn poisson_log_canonical_diagonal_weight_is_fisher_and_observed() {
        let family = PoissonLogFamily {
            y: array![0.0, 3.0],
            weights: array![1.5, 0.5],
        };
        let eta = array![-0.4_f64, 0.7_f64];
        let block_states = [ParameterBlockState {
            beta: Array1::zeros(0),
            eta: eta.clone(),
        }];
        let eval = family.evaluate(&block_states).expect("poisson evaluate");

        // Only the weights are inspected; the working response is ignored.
        let BlockWorkingSet::Diagonal {
            working_weights, ..
        } = &eval.blockworking_sets[PoissonLogFamily::BLOCK_ETA]
        else {
            panic!("expected diagonal Poisson block")
        };
        for (i, &eta_i) in eta.iter().enumerate() {
            let fisher_weight = family.weights[i] * eta_i.exp();
            let gap = (working_weights[i] - fisher_weight).abs();
            assert!(
                gap < 1e-12,
                "canonical Poisson-log observed and Fisher weights should coincide at row {i}: got {}, expected {}",
                working_weights[i],
                fisher_weight
            );
        }
    }

    // Gamma/log fixture. Checks three things against closed-form expectations:
    //   1. the diagonal working weight equals `w * shape * y / mu`,
    //   2. that value differs measurably from `w * shape` on every row,
    //   3. the directional derivative of the weights equals `-weight * d_eta`.
    #[test]
    fn gamma_log_noncanonical_diagonal_uses_observed_not_fisher_weight_and_dw() {
        let eta = array![0.0_f64, -0.5_f64];
        let family = GammaLogFamily {
            y: array![2.0, 0.25],
            weights: array![1.25, 0.75],
            shape: 3.0,
        };
        let states = vec![ParameterBlockState {
            beta: Array1::zeros(0),
            eta: eta.clone(),
        }];
        let eval = family.evaluate(&states).expect("gamma evaluate");

        let eta_block = &eval.blockworking_sets[GammaLogFamily::BLOCK_ETA];
        if let BlockWorkingSet::Diagonal {
            working_response,
            working_weights,
        } = eta_block
        {
            for i in 0..eta.len() {
                let mu = eta[i].exp();
                let fisher_weight = family.weights[i] * family.shape;
                let observed_weight = fisher_weight * family.y[i] / mu;

                let observed_gap = (working_weights[i] - observed_weight).abs();
                assert!(
                    observed_gap < 1e-12,
                    "Gamma-log row {i} should use observed weight: got {}, expected {}",
                    working_weights[i],
                    observed_weight
                );

                // Guard the fixture itself: the two candidate weights must differ.
                let fisher_gap = (working_weights[i] - fisher_weight).abs();
                assert!(
                    fisher_gap > 1e-6,
                    "fixture should distinguish observed from Fisher at row {i}: observed {}, fisher {}",
                    working_weights[i],
                    fisher_weight
                );

                let score = fisher_weight * (family.y[i] / mu - 1.0);
                let expected_response = eta[i] + score / observed_weight;
                let response_gap = (working_response[i] - expected_response).abs();
                assert!(
                    response_gap < 1e-12,
                    "Gamma-log row {i} working response should be consistent with observed Newton weight: got {}, expected {}",
                    working_response[i],
                    expected_response
                );
            }
        } else {
            panic!("expected diagonal Gamma block");
        }

        let d_eta = array![0.5_f64, -2.0_f64];
        let dw = family
            .diagonalworking_weights_directional_derivative(
                &states,
                GammaLogFamily::BLOCK_ETA,
                &d_eta,
            )
            .expect("gamma dW")
            .expect("gamma dW present");
        for i in 0..eta.len() {
            let observed_weight = family.weights[i] * family.shape * family.y[i] / eta[i].exp();
            let expected_dw = -observed_weight * d_eta[i];
            let dw_gap = (dw[i] - expected_dw).abs();
            assert!(
                dw_gap < 1e-12,
                "Gamma-log row {i} dW should differentiate observed weights: got {}, expected {}",
                dw[i],
                expected_dw
            );
        }
    }

    // Single-row Gaussian location-scale fixture with the log-sigma predictor at 35:
    // the directional derivative of the log-sigma working weight must be exactly 0.
    #[test]
    fn gaussian_log_sigmaweight_directional_derivative_iszero_on_active_floor_branch() {
        // Coefficient-free block state holding a single linear-predictor value.
        let single_row_state = |eta_value: f64| ParameterBlockState {
            beta: Array1::zeros(0),
            eta: array![eta_value],
        };
        let states = vec![single_row_state(0.0), single_row_state(35.0)];
        let family = GaussianLocationScaleFamily {
            y: array![0.3],
            weights: array![1.0],
            mu_design: None,
            log_sigma_design: None,
            policy: crate::resource::ResourcePolicy::default_library(),
            cached_row_scalars: std::sync::RwLock::new(None),
        };
        let d_eta = array![1.0];

        let dw = family
            .diagonalworking_weights_directional_derivative(
                &states,
                GaussianLocationScaleFamily::BLOCK_LOG_SIGMA,
                &d_eta,
            )
            .expect("gaussian directional derivative")
            .expect("gaussian log-sigma derivative");
        assert_eq!(dw[0], 0.0);
    }

    // Compares the analytic directional derivative of the log-sigma working weight
    // with a central finite difference of the weight returned by `evaluate`.
    #[test]
    fn gaussian_log_sigmaweight_directional_derivative_matches_finite_difference() {
        let family = GaussianLocationScaleFamily {
            y: array![1.2],
            weights: array![1.0],
            mu_design: None,
            log_sigma_design: None,
            policy: crate::resource::ResourcePolicy::default_library(),
            cached_row_scalars: std::sync::RwLock::new(None),
        };
        let states = vec![
            ParameterBlockState {
                beta: Array1::zeros(0),
                eta: array![0.1],
            },
            ParameterBlockState {
                beta: Array1::zeros(0),
                eta: array![0.4],
            },
        ];
        let d_eta = array![1.0];

        let dw = family
            .diagonalworking_weights_directional_derivative(
                &states,
                GaussianLocationScaleFamily::BLOCK_LOG_SIGMA,
                &d_eta,
            )
            .expect("gaussian directional derivative")
            .expect("gaussian log-sigma derivative");

        // Evaluates the family and extracts the first log-sigma working weight.
        let log_sigma_weight = |block_states: &[ParameterBlockState], label: &str| -> f64 {
            let eval = family.evaluate(block_states).expect(label);
            match &eval.blockworking_sets[GaussianLocationScaleFamily::BLOCK_LOG_SIGMA] {
                BlockWorkingSet::Diagonal {
                    working_weights, ..
                } => working_weights[0],
                BlockWorkingSet::ExactNewton { .. } => {
                    panic!("expected diagonal Gaussian log-sigma block")
                }
            }
        };

        let eps = 1e-6;
        let mut states_plus = states.clone();
        states_plus[GaussianLocationScaleFamily::BLOCK_LOG_SIGMA].eta[0] += eps;
        let w_plus = log_sigma_weight(&states_plus, "gaussian eval plus");

        let mut states_minus = states;
        states_minus[GaussianLocationScaleFamily::BLOCK_LOG_SIGMA].eta[0] -= eps;
        let w_minus = log_sigma_weight(&states_minus, "gaussian eval minus");

        let fd = (w_plus - w_minus) / (2.0 * eps);
        let gap = (dw[0] - fd).abs();
        assert!(gap < 1e-6, "dw={} fd={}", dw[0], fd);
    }

    // Checks the array sigma helper against `safe_exp` at a very large predictor
    // (eta = 701) and checks that each of its four derivative outputs equals sigma.
    //
    // All comparisons use a relative tolerance. At this magnitude an absolute
    // tolerance such as 1e-30 is far below one unit in the last place, so it could
    // only be met by bit-identical values and would reject results that agree to
    // full double precision.
    #[test]
    fn gaussian_sigma_helper_matches_exact_exp_link() {
        let eta0 = 701.0_f64;
        let eta = array![eta0];
        let (sigma, d1, d2, d3, d4) = exp_sigma_derivs_up_to_fourth_array(eta.view());
        let coded_sigma = safe_exp(eta0);

        // The relative comparisons below divide by sigma, so require it to be a
        // finite positive number first and fail with a readable message otherwise.
        assert!(
            sigma[0].is_finite() && sigma[0] > 0.0,
            "Gaussian sigma helper should return a finite positive sigma at eta={eta0}; got {}",
            sigma[0]
        );
        assert!(
            (sigma[0] - coded_sigma).abs() / sigma[0] < 1e-12,
            "Gaussian sigma helper should evaluate the exact exp sigma link at eta={eta0}; got {} vs {}",
            sigma[0],
            coded_sigma
        );

        // For the exp link every derivative in eta equals sigma itself.
        for (order, derivative) in [
            ("first", &d1),
            ("second", &d2),
            ("third", &d3),
            ("fourth", &d4),
        ] {
            assert!(
                (derivative[0] - sigma[0]).abs() / sigma[0] < 1e-12,
                "Gaussian sigma helper {order} derivative should equal exp(eta) at eta={eta0}; got {} vs {}",
                derivative[0],
                sigma[0]
            );
        }
    }

    // Passing the same dense design as both arguments of
    // `prepared_gaussian_log_sigma_design` must return a design whose dense form
    // equals the input entry by entry.
    #[test]
    fn gaussian_log_sigma_design_keeps_shared_mean_basis() {
        let shared = array![[1.0, -1.5], [1.0, -0.25], [1.0, 0.75], [1.0, 2.0]];
        let shared_design =
            DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(shared.clone()));
        let prepared = prepared_gaussian_log_sigma_design(&shared_design, &shared_design)
            .expect("gaussian log-sigma design should accept shared columns");
        let prepared_dense = prepared.as_dense_cow();

        // Walk the reference matrix in row-major order with explicit (row, col) indices.
        for ((i, j), &expected) in shared.indexed_iter() {
            let got = prepared_dense[[i, j]];
            assert!(
                (got - expected).abs() < 1e-12,
                "gaussian log-sigma design should preserve shared basis at ({i}, {j}): got {}, expected {}",
                got,
                expected
            );
        }
    }

    // Far-tail check (log-sigma predictor = 701) of the diagonal log-sigma working
    // set of the Gaussian location-scale family.
    //
    // Part 1 rebuilds the expected working weight `2 * w * dlog^2` and working
    // response `eta + score / info` from the sigma-link helpers and compares them
    // with what `evaluate` returns.
    // Part 2 differentiates `log_likelihood_only` numerically and checks that the
    // score is close to -1 and that the second difference is small.
    #[test]
    fn gaussian_diagonal_log_sigma_block_uses_fisher_score_step_in_far_tail() {
        let family = GaussianLocationScaleFamily {
            y: array![0.0],
            weights: array![1.0],
            mu_design: None,
            log_sigma_design: None,
            policy: crate::resource::ResourcePolicy::default_library(),
            cached_row_scalars: std::sync::RwLock::new(None),
        };
        let eta_mu = array![0.0];
        let eta_ls0 = 701.0_f64;
        // Builds the two-block state (mean, log-sigma) for a given log-sigma predictor.
        let states_at = |eta_ls: f64| {
            vec![
                ParameterBlockState {
                    beta: Array1::zeros(0),
                    eta: eta_mu.clone(),
                },
                ParameterBlockState {
                    beta: Array1::zeros(0),
                    eta: array![eta_ls],
                },
            ]
        };

        let eval = family.evaluate(&states_at(eta_ls0)).expect("evaluate");
        match &eval.blockworking_sets[GaussianLocationScaleFamily::BLOCK_LOG_SIGMA] {
            BlockWorkingSet::Diagonal {
                working_response,
                working_weights,
            } => {
                // logb link σ = b + e^η: at η ≫ log b the floor is dwarfed
                // (σ ≈ e^η ~ 1e304), so dlogσ/dη = 1 − b/σ → 1 to within
                // f64 precision and the IRLS step matches the pure-exp Fisher
                // step. Compute the expectation explicitly from the new link.
                let sigma = logb_sigma_from_eta_scalar(eta_ls0);
                let inv_s2 = (sigma * sigma).recip();
                let dlog = logb_dlog_sigma_deta(sigma, logb_sigma_jet1_scalar(eta_ls0).d1);
                let residual = family.y[0] - eta_mu[0];
                let expected_score =
                    family.weights[0] * (residual * residual * inv_s2 - 1.0) * dlog;
                let expected_info = 2.0 * family.weights[0] * dlog * dlog;
                let expected_response = eta_ls0 + expected_score / expected_info;

                assert!((working_weights[0] - expected_info).abs() < 1e-12);
                assert!(
                    (working_response[0] - expected_response).abs() < 1e-12,
                    "working response mismatch: got {}, expected {}",
                    working_response[0],
                    expected_response
                );
            }
            BlockWorkingSet::ExactNewton { .. } => {
                panic!("expected diagonal Gaussian log-sigma block")
            }
        }

        // Central finite differences of the log-likelihood in the log-sigma predictor.
        let loglik = |eta_ls: f64| family.log_likelihood_only(&states_at(eta_ls)).expect("ll");
        let h = 1e-4;
        let ll_plus = loglik(eta_ls0 + h);
        let ll0 = loglik(eta_ls0);
        let ll_minus = loglik(eta_ls0 - h);
        let score_fd = (ll_plus - ll_minus) / (2.0 * h);
        assert!(score_fd.is_finite());
        assert!(
            (score_fd + 1.0).abs() < 1e-6,
            "far-tail score should be -1, got {score_fd}"
        );
        // NOTE(review): the quantity below is the raw second difference; it is not
        // divided by h^2. With h = 1e-4 the 1e-5 bound therefore only limits the
        // implied curvature to roughly 1e3 in magnitude. Confirm whether a
        // normalized curvature bound was intended.
        assert!(
            (ll_plus - 2.0 * ll0 + ll_minus).abs() < 1e-5,
            "far-tail Gaussian log-sigma block should have near-zero observed curvature"
        );
    }

    // One-row, intercept-only Gaussian location-scale fit with the log-sigma
    // coefficient at 710: the exact joint Hessian and its directional derivative
    // must contain only finite entries.
    #[test]
    fn gaussian_exact_joint_path_stays_finite_in_exp_link_far_tail() {
        // 1x1 dense design holding a single intercept entry.
        let intercept_design =
            || DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(array![[1.0]]));
        let mu_design = intercept_design();
        let log_sigma_design = intercept_design();
        let family = GaussianLocationScaleFamily {
            y: array![0.0],
            weights: array![1.0],
            mu_design: Some(mu_design.clone()),
            log_sigma_design: Some(log_sigma_design.clone()),
            policy: crate::resource::ResourcePolicy::default_library(),
            cached_row_scalars: std::sync::RwLock::new(None),
        };

        let beta_mu = array![0.0];
        let beta_ls = array![710.0];
        let eta_mu = mu_design.matrixvectormultiply(&beta_mu);
        let eta_ls = log_sigma_design.matrixvectormultiply(&beta_ls);
        let states = vec![
            ParameterBlockState {
                beta: beta_mu.clone(),
                eta: eta_mu,
            },
            ParameterBlockState {
                beta: beta_ls.clone(),
                eta: eta_ls,
            },
        ];

        let hessian = family
            .exact_newton_joint_hessian(&states)
            .expect("joint hessian")
            .expect("expected Gaussian exact joint hessian");
        let hessian_has_bad_entry = hessian.iter().any(|value| !value.is_finite());
        assert!(
            !hessian_has_bad_entry,
            "far-tail Gaussian exact Hessian should stay finite; got {hessian:?}"
        );

        let direction = array![0.25, -0.5];
        let dh = family
            .exact_newton_joint_hessian_directional_derivative(&states, &direction)
            .expect("joint dH")
            .expect("expected Gaussian exact joint hessian directional derivative");
        let dh_has_bad_entry = dh.iter().any(|value| !value.is_finite());
        assert!(
            !dh_has_bad_entry,
            "far-tail Gaussian exact Hessian directional derivative should stay finite; got {dh:?}"
        );
    }

    // Runs two hand-written per-row loops ("legacy" and "optimized") over the same
    // synthetic inputs and asserts that the log-likelihood and the four working
    // vectors (mean response/weight, log-sigma response/weight) agree.
    //
    // NOTE(review): as written, the two closures evaluate the same expressions; the
    // only difference is where `weights[i]` is read inside the loop body. No timing
    // is measured despite the "is_faster_locally" suffix, and neither closure calls
    // the family's own evaluation routine, so the comparison cannot fail. Confirm
    // whether one side was meant to call the production hot loop.
    #[test]
    fn gaussian_location_scale_hotloop_optimized_matches_legacy_and_is_faster_locally() {
        let n = 4096usize;
        let y = Array1::from_shape_fn(n, |i| ((i as f64) * 0.003).sin() + 0.1);
        let mu = Array1::from_shape_fn(n, |i| ((i as f64) * 0.001).cos() - 0.2);
        let eta_ls = Array1::from_shape_fn(n, |i| ((i as f64) * 0.002).sin() * 0.8 - 0.1);
        // Every 37th row gets zero weight so the zero-weight branch is exercised.
        let weights = Array1::from_shape_fn(n, |i| if i % 37 == 0 { 0.0 } else { 1.0 });
        let ln2pi = (2.0 * std::f64::consts::PI).ln();

        // Reference loop: returns (log-likelihood, z_mu, w_mu, z_log_sigma, w_log_sigma).
        let legacy_eval = || {
            let mut ll = 0.0;
            let mut zmu = Array1::<f64>::zeros(n);
            let mut wmu = Array1::<f64>::zeros(n);
            let mut zls = Array1::<f64>::zeros(n);
            let mut wls = Array1::<f64>::zeros(n);
            for i in 0..n {
                let w = weights[i];
                let eta = eta_ls[i];
                let SigmaJet1 { sigma, d1 } = logb_sigma_jet1_scalar(eta);
                let inv_s2 = (sigma * sigma).recip();
                let r = y[i] - mu[i];
                ll += w * (-0.5 * (r * r * inv_s2 + ln2pi + 2.0 * sigma.ln()));
                if w == 0.0 {
                    // Zero-weight row: zero working weight, response pinned to the current mean.
                    wmu[i] = 0.0;
                    zmu[i] = mu[i];
                } else {
                    wmu[i] = floor_positiveweight(w * inv_s2, MIN_WEIGHT);
                    zmu[i] = mu[i] + r;
                }
                let dlogsigma_du = logb_dlog_sigma_deta(sigma, d1);
                let info_u =
                    floor_positiveweight(2.0 * w * dlogsigma_du * dlogsigma_du, MIN_WEIGHT);
                if info_u == 0.0 {
                    // No information on this row: leave the predictor where it is.
                    wls[i] = 0.0;
                    zls[i] = eta;
                } else {
                    wls[i] = info_u;
                    let score_ls = w * (r * r * inv_s2 - 1.0) * dlogsigma_du;
                    zls[i] = eta + score_ls / info_u;
                }
            }
            (ll, zmu, wmu, zls, wls)
        };

        // Second loop: same tuple layout as `legacy_eval`.
        let optimized_eval = || {
            let mut ll = 0.0;
            let mut zmu = Array1::<f64>::zeros(n);
            let mut wmu = Array1::<f64>::zeros(n);
            let mut zls = Array1::<f64>::zeros(n);
            let mut wls = Array1::<f64>::zeros(n);
            for i in 0..n {
                let eta = eta_ls[i];
                let SigmaJet1 { sigma, d1 } = logb_sigma_jet1_scalar(eta);
                let inv_s2 = (sigma * sigma).recip();
                let w = weights[i];
                let r = y[i] - mu[i];
                ll += w * (-0.5 * (r * r * inv_s2 + ln2pi + 2.0 * sigma.ln()));
                if w == 0.0 {
                    wmu[i] = 0.0;
                    zmu[i] = mu[i];
                } else {
                    wmu[i] = floor_positiveweight(w * inv_s2, MIN_WEIGHT);
                    zmu[i] = mu[i] + r;
                }
                let dlogsigma_du = logb_dlog_sigma_deta(sigma, d1);
                let info_u =
                    floor_positiveweight(2.0 * w * dlogsigma_du * dlogsigma_du, MIN_WEIGHT);
                if info_u == 0.0 {
                    wls[i] = 0.0;
                    zls[i] = eta;
                } else {
                    wls[i] = info_u;
                    let score_ls = w * (r * r * inv_s2 - 1.0) * dlogsigma_du;
                    zls[i] = eta + score_ls / info_u;
                }
            }
            (ll, zmu, wmu, zls, wls)
        };

        let (ll_legacy, zmu_legacy, wmu_legacy, zls_legacy, wls_legacy) = legacy_eval();
        let (ll_opt, zmu_opt, wmu_opt, zls_opt, wls_opt) = optimized_eval();
        // Scalar log-likelihood, then element-wise agreement of each working vector.
        assert!((ll_legacy - ll_opt).abs() < 1e-10);
        assert!((&zmu_legacy - &zmu_opt).iter().all(|v| v.abs() < 1e-12));
        assert!((&wmu_legacy - &wmu_opt).iter().all(|v| v.abs() < 1e-12));
        assert!((&zls_legacy - &zls_opt).iter().all(|v| v.abs() < 1e-12));
        assert!((&wls_legacy - &wls_opt).iter().all(|v| v.abs() < 1e-12));
    }

    // Test fixture: a term collection holding exactly one smooth term named
    // "spatial", a Matern basis over `feature_cols` with the given `length_scale`.
    // There are no linear or random-effect terms.
    fn simple_matern_term_collection(
        feature_cols: &[usize],
        length_scale: f64,
    ) -> TermCollectionSpec {
        let matern = MaternBasisSpec {
            periodic: None,
            center_strategy: CenterStrategy::EqualMass { num_centers: 6 },
            length_scale,
            nu: MaternNu::ThreeHalves,
            include_intercept: false,
            double_penalty: false,
            identifiability: MaternIdentifiability::CenterSumToZero,
            aniso_log_scales: None,
            nullspace_shrinkage_survived: None,
        };
        let spatial_term = SmoothTermSpec {
            name: String::from("spatial"),
            basis: SmoothBasisSpec::Matern {
                feature_cols: feature_cols.to_vec(),
                spec: matern,
                input_scales: None,
            },
            shape: ShapeConstraint::None,
            joint_null_rotation: None,
        };
        TermCollectionSpec {
            linear_terms: Vec::new(),
            random_effect_terms: Vec::new(),
            smooth_terms: vec![spatial_term],
        }
    }

    // Test fixture: a term collection with no linear, random-effect, or smooth terms.
    fn empty_term_collection() -> TermCollectionSpec {
        TermCollectionSpec {
            linear_terms: vec![],
            random_effect_terms: vec![],
            smooth_terms: vec![],
        }
    }

    // Test fixture: enabled spatial length-scale optimization options with a
    // 4-iteration outer cap, a ln(2) log-step, and length scales bounded to [0.1, 2.0].
    fn spatial_kappa_options() -> SpatialLengthScaleOptimizationOptions {
        let (min_length_scale, max_length_scale) = (0.1, 2.0);
        let log_step = std::f64::consts::LN_2;
        SpatialLengthScaleOptimizationOptions {
            enabled: true,
            max_outer_iter: 4,
            rel_tol: 1e-4,
            log_step,
            min_length_scale,
            max_length_scale,
            pilot_subsample_threshold: 10_000,
        }
    }

    // Test fixture: blockwise fit options for the spatial smoke tests. Only the
    // iteration caps and tolerances are overridden; everything else is the default.
    fn spatial_fit_smoke_options() -> BlockwiseFitOptions {
        // Inner and outer loops share the same convergence tolerance.
        let tolerance = 1e-4;
        // The location-scale-wiggle spatial smoke test may need more than 24
        // blockwise cycles after the final outer REML refit, so the cycle cap is
        // raised while the tolerance is left untouched; the same convergence
        // criterion is then reached deterministically.
        let inner_max_cycles = 48;
        BlockwiseFitOptions {
            inner_max_cycles,
            inner_tol: tolerance,
            outer_max_iter: 3,
            outer_tol: tolerance,
            ..BlockwiseFitOptions::default()
        }
    }

    // Probit binomial location-scale family with an intercept-only threshold
    // coefficient of 250: the log-likelihood, the exact joint Hessian, and its first
    // and second directional derivatives must all be finite.
    #[test]
    fn binomial_location_scale_exact_probit_tailobjects_stay_finite() {
        let n = 6usize;
        // n x 1 dense design of ones (intercept only).
        let intercept_design = || {
            DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(
                Array2::from_elem((n, 1), 1.0),
            ))
        };
        let threshold_design = intercept_design();
        let log_sigma_design = intercept_design();
        let family = BinomialLocationScaleFamily {
            y: array![0.0, 1.0, 0.0, 1.0, 0.0, 1.0],
            weights: Array1::from_elem(n, 1.0),
            link_kind: InverseLink::Standard(StandardLink::Probit),
            threshold_design: Some(threshold_design.clone()),
            log_sigma_design: Some(log_sigma_design.clone()),
            policy: crate::resource::ResourcePolicy::default_library(),
        };

        let beta_t = array![250.0];
        let beta_ls = array![0.0];
        let eta_t = threshold_design.matrixvectormultiply(&beta_t);
        let eta_ls = log_sigma_design.matrixvectormultiply(&beta_ls);
        let states = vec![
            ParameterBlockState {
                beta: beta_t.clone(),
                eta: eta_t,
            },
            ParameterBlockState {
                beta: beta_ls.clone(),
                eta: eta_ls,
            },
        ];

        let eval = family
            .evaluate(&states)
            .expect("evaluate tail-stable family");
        assert!(eval.log_likelihood.is_finite());

        let joint = family
            .exact_newton_joint_hessian(&states)
            .expect("joint hessian")
            .expect("expected exact joint hessian");
        assert!(!joint.iter().any(|v| !v.is_finite()));

        let direction = array![0.1, -0.2];
        let d_h = family
            .exact_newton_joint_hessian_directional_derivative(&states, &direction)
            .expect("joint dH")
            .expect("expected exact joint dH");
        assert!(!d_h.iter().any(|v| !v.is_finite()));

        let d2_h = family
            .exact_newton_joint_hessiansecond_directional_derivative(
                &states, &direction, &direction,
            )
            .expect("joint d2H")
            .expect("expected exact joint d2H");
        assert!(!d2_h.iter().any(|v| !v.is_finite()));
    }

    // With 2 + 11 penalties across the two blocks, the binomial location-scale family
    // must still report a second-order exact outer derivative and declare the
    // `Either` Hessian form.
    #[test]
    fn binomial_location_scale_many_smoothing_params_keeps_second_order_outer() {
        // Block spec with an n x p all-ones design and k identity penalties of size p.
        fn spec_with_penalties(name: &str, n: usize, p: usize, k: usize) -> ParameterBlockSpec {
            let all_ones = Array2::from_elem((n, p), 1.0);
            let penalties = std::iter::repeat_with(|| PenaltyMatrix::Dense(identity_penalty(p)))
                .take(k)
                .collect();
            ParameterBlockSpec {
                name: name.to_owned(),
                design: DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(all_ones)),
                offset: Array1::zeros(n),
                penalties,
                nullspace_dims: vec![0; k],
                initial_log_lambdas: Array1::zeros(k),
                initial_beta: None,
                gauge_priority: 100,
                jacobian_callback: None,
                stacked_design: None,
                stacked_offset: None,
            }
        }

        let n = 8usize;
        let family = BinomialLocationScaleFamily {
            y: array![0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0],
            weights: Array1::from_elem(n, 1.0),
            link_kind: InverseLink::Standard(StandardLink::Probit),
            threshold_design: None,
            log_sigma_design: None,
            policy: crate::resource::ResourcePolicy::default_library(),
        };
        let specs = vec![
            spec_with_penalties("threshold", n, 3, 2),
            spec_with_penalties("log_sigma", n, 6, 11),
        ];
        let options = BlockwiseFitOptions::default();

        let order = family.exact_outer_derivative_order(&specs, &options);
        assert_eq!(
            order,
            crate::custom_family::ExactOuterDerivativeOrder::Second
        );

        let (_gradient, hessian) =
            crate::custom_family::custom_family_outer_derivatives(&family, &specs, &options);
        assert_eq!(
            hessian,
            crate::solver::outer_strategy::DeclaredHessianForm::Either
        );
    }

    // The binomial location-scale term builder with Matern mean/noise specs must
    // report that the exact spatial joint path is supported and required, and the
    // family it builds must report exact joint support.
    #[test]
    fn binomial_location_scale_term_builder_requires_exact_spatial_joint_path() {
        let n = 8usize;
        let builder = BinomialLocationScaleTermBuilder {
            y: Array1::from_elem(n, 0.0),
            weights: Array1::from_elem(n, 1.0),
            link_kind: InverseLink::Standard(StandardLink::Probit),
            meanspec: simple_matern_term_collection(&[0, 1], 0.4),
            noisespec: simple_matern_term_collection(&[0, 1], 0.75),
            mean_offset: Array1::zeros(n),
            noise_offset: Array1::zeros(n),
        };
        assert!(builder.exact_spatial_joint_supported());
        assert!(builder.require_exact_spatial_joint());

        // Column 0 is an even grid on [0, 1]; column 1 is one sine period over it.
        let data = Array2::<f64>::from_shape_fn((n, 2), |(i, j)| {
            let t = i as f64 / (n as f64 - 1.0);
            match j {
                0 => t,
                _ => (2.0 * std::f64::consts::PI * t).sin(),
            }
        });
        let mean_design =
            build_term_collection_design(data.view(), builder.meanspec()).expect("mean design");
        let noise_design =
            build_term_collection_design(data.view(), builder.noisespec()).expect("noise design");

        let family = builder.build_family(&mean_design, &noise_design);
        assert!(family.exact_joint_supported());
    }

    // Wiggle variant of the builder check: the builder must support and require the
    // exact spatial joint path, and the built family must report exact joint support
    // and the joint outer hyper path.
    #[test]
    fn binomial_location_scalewiggle_term_builder_requires_exact_spatial_joint_path() {
        let n = 8usize;
        let q_seed = Array1::linspace(-1.25, 1.25, n);
        let (wiggle_block, wiggle_knots) =
            BinomialLocationScaleWiggleFamily::buildwiggle_block_input(
                q_seed.view(),
                2,
                3,
                2,
                false,
            )
            .expect("wiggle block");
        let builder = BinomialLocationScaleWiggleTermBuilder {
            y: Array1::from_elem(n, 0.0),
            weights: Array1::from_elem(n, 1.0),
            link_kind: InverseLink::Standard(StandardLink::Probit),
            meanspec: simple_matern_term_collection(&[0, 1], 0.4),
            noisespec: simple_matern_term_collection(&[0, 1], 0.75),
            mean_offset: Array1::zeros(n),
            noise_offset: Array1::zeros(n),
            wiggle_knots,
            wiggle_degree: 2,
            wiggle_block,
        };
        assert!(builder.exact_spatial_joint_supported());
        assert!(builder.require_exact_spatial_joint());

        // Column 0 is an even grid on [0, 1]; column 1 is one sine period over it.
        let data = Array2::<f64>::from_shape_fn((n, 2), |(i, j)| {
            let t = i as f64 / (n as f64 - 1.0);
            match j {
                0 => t,
                _ => (2.0 * std::f64::consts::PI * t).sin(),
            }
        });
        let mean_design =
            build_term_collection_design(data.view(), builder.meanspec()).expect("mean design");
        let noise_design =
            build_term_collection_design(data.view(), builder.noisespec()).expect("noise design");

        let family = builder.build_family(&mean_design, &noise_design);
        assert!(family.exact_joint_supported());
        assert!(family.requires_joint_outer_hyper_path());
    }

    // `build_blocks` on the binomial location-scale term builder must return two
    // blocks, each carrying an `initial_beta`.
    #[test]
    fn binomial_location_scale_builder_populateswarm_start_betas() {
        let n = 12usize;
        // Column 0 is an even grid on [0, 1]; column 1 is one sine period over it.
        let data = Array2::<f64>::from_shape_fn((n, 2), |(i, j)| {
            let t = i as f64 / (n as f64 - 1.0);
            match j {
                0 => t,
                _ => (2.0 * std::f64::consts::PI * t).sin(),
            }
        });
        // Response is 1 on rows whose index is a multiple of 3 or 5, else 0.
        let y = Array1::from_shape_fn(n, |i| if i % 3 == 0 || i % 5 == 0 { 1.0 } else { 0.0 });
        let n_obs = y.len();
        let builder = BinomialLocationScaleTermBuilder {
            mean_offset: Array1::zeros(n_obs),
            noise_offset: Array1::zeros(n_obs),
            y,
            weights: Array1::from_elem(n, 1.0),
            link_kind: InverseLink::Standard(StandardLink::Probit),
            meanspec: simple_matern_term_collection(&[0, 1], 0.45),
            noisespec: simple_matern_term_collection(&[0, 1], 0.8),
        };
        let mean_design =
            build_term_collection_design(data.view(), builder.meanspec()).expect("mean design");
        let noise_design =
            build_term_collection_design(data.view(), builder.noisespec()).expect("noise design");

        let mean_penalties = builder.mean_penalty_count(&mean_design);
        let noise_penalties = builder.noise_penalty_count(&noise_design);
        let rho = compose_theta_from_hints_test(
            mean_penalties,
            noise_penalties,
            &None,
            &None,
            &Array1::zeros(0),
        );
        let blocks = builder
            .build_blocks(&rho, &mean_design, &noise_design, None, None)
            .expect("build blocks");

        assert_eq!(blocks.len(), 2);
        for block in blocks.iter().take(2) {
            assert!(block.initial_beta.is_some());
        }
    }

    // `build_blocks` on the wiggle term builder must return three blocks, and the
    // first two must carry an `initial_beta`.
    #[test]
    fn binomial_location_scalewiggle_builder_populateswarm_start_betas() {
        let n = 12usize;
        // Column 0 is an even grid on [0, 1]; column 1 is one cosine period over it.
        let data = Array2::<f64>::from_shape_fn((n, 2), |(i, j)| {
            let t = i as f64 / (n as f64 - 1.0);
            match j {
                0 => t,
                _ => (2.0 * std::f64::consts::PI * t).cos(),
            }
        });
        // Response is 1 on rows whose index is a multiple of 4 or 5, else 0.
        let y = Array1::from_shape_fn(n, |i| if i % 4 == 0 || i % 5 == 0 { 1.0 } else { 0.0 });
        let n_obs = y.len();

        let q_seed = Array1::linspace(-1.25, 1.25, n);
        let (wiggle_block, wiggle_knots) =
            BinomialLocationScaleWiggleFamily::buildwiggle_block_input(
                q_seed.view(),
                2,
                3,
                2,
                false,
            )
            .expect("wiggle block");
        let builder = BinomialLocationScaleWiggleTermBuilder {
            mean_offset: Array1::zeros(n_obs),
            noise_offset: Array1::zeros(n_obs),
            y,
            weights: Array1::from_elem(n, 1.0),
            link_kind: InverseLink::Standard(StandardLink::Probit),
            meanspec: simple_matern_term_collection(&[0, 1], 0.45),
            noisespec: simple_matern_term_collection(&[0, 1], 0.8),
            wiggle_knots,
            wiggle_degree: 2,
            wiggle_block,
        };
        let mean_design =
            build_term_collection_design(data.view(), builder.meanspec()).expect("mean design");
        let noise_design =
            build_term_collection_design(data.view(), builder.noisespec()).expect("noise design");

        let mean_penalties = builder.mean_penalty_count(&mean_design);
        let noise_penalties = builder.noise_penalty_count(&noise_design);
        let rho = compose_theta_from_hints_test(
            mean_penalties,
            noise_penalties,
            &None,
            &None,
            &builder.extra_rho0().expect("extra rho0"),
        );
        let blocks = builder
            .build_blocks(&rho, &mean_design, &noise_design, None, None)
            .expect("build blocks");

        assert_eq!(blocks.len(), 3);
        // Only the first two blocks are checked for a warm start.
        for block in blocks.iter().take(2) {
            assert!(block.initial_beta.is_some());
        }
    }

    /// Runs `evaluate_custom_family_joint_hyper` in `ValueGradientHessian` mode
    /// for the binomial location-scale builder on a 12-row, two-column fixture
    /// whose term collections come from `simple_matern_term_collection`.
    ///
    /// Checks that the objective and gradient are finite and that the outer
    /// Hessian materializes densely as a square matrix of size
    /// `rho.len() + psi_dim`, matching the gradient length.
    #[test]
    fn binomial_location_scale_exact_newton_spatial_joint_hyper_returns_fullhessian() {
        use crate::solver::estimate::reml::unified::EvalMode;

        let n = 12usize;
        let last = n as f64 - 1.0;
        let omega = 2.0 * std::f64::consts::PI;
        // Column 0 is the grid t = i / (n - 1); column 1 is cos(2*pi*t).
        let data = Array2::<f64>::from_shape_fn((n, 2), |(row, col)| {
            let t = row as f64 / last;
            if col == 0 {
                t
            } else {
                (omega * t).cos()
            }
        });
        let response =
            Array1::from_iter((0..n).map(|i| if i % 3 == 0 || i % 5 == 0 { 1.0 } else { 0.0 }));
        let unit_weights = Array1::from_elem(n, 1.0);
        let meanspec = simple_matern_term_collection(&[0, 1], 0.45);
        let noisespec = simple_matern_term_collection(&[0, 1], 0.8);
        let builder = BinomialLocationScaleTermBuilder {
            mean_offset: Array1::zeros(n),
            noise_offset: Array1::zeros(n),
            y: response,
            weights: unit_weights,
            link_kind: InverseLink::Standard(StandardLink::Probit),
            meanspec: meanspec.clone(),
            noisespec: noisespec.clone(),
        };

        // Designs first, then the specs frozen against those designs.
        let design_mean =
            build_term_collection_design(data.view(), &meanspec).expect("build mean design");
        let design_noise =
            build_term_collection_design(data.view(), &noisespec).expect("build noise design");
        let frozen_meanspec =
            freeze_term_collection_from_design(&meanspec, &design_mean).expect("freeze mean spec");
        let frozen_noisespec = freeze_term_collection_from_design(&noisespec, &design_noise)
            .expect("freeze noise spec");

        let mean_penalties = builder.mean_penalty_count(&design_mean);
        let noise_penalties = builder.noise_penalty_count(&design_noise);
        let rho = compose_theta_from_hints_test(
            mean_penalties,
            noise_penalties,
            &None,
            &None,
            &Array1::zeros(0),
        );
        let blocks = builder
            .build_blocks(&rho, &design_mean, &design_noise, None, None)
            .expect("build blocks");
        let family = builder.build_family(&design_mean, &design_noise);
        let derivative_blocks = builder
            .build_psiderivative_blocks(
                data.view(),
                &frozen_meanspec,
                &frozen_noisespec,
                &design_mean,
                &design_noise,
            )
            .expect("psi derivative blocks");

        let options = BlockwiseFitOptions {
            use_remlobjective: true,
            outer_max_iter: 1,
            ..BlockwiseFitOptions::default()
        };
        let eval = evaluate_custom_family_joint_hyper(
            &family,
            &blocks,
            &options,
            &rho,
            &derivative_blocks,
            None,
            EvalMode::ValueGradientHessian,
        )
        .expect("exact spatial joint hyper eval");

        assert!(eval.objective.is_finite());
        assert!(eval.gradient.iter().all(|g| g.is_finite()));
        let dense_hessian = eval
            .outer_hessian
            .materialize_dense()
            .expect("exact spatial joint hyper path should materialize a full [rho, psi] hessian")
            .expect("exact spatial joint hyper path should return a full [rho, psi] hessian");
        // The hyper-parameter vector is [rho, psi]: one psi entry per derivative.
        let psi_dim: usize = derivative_blocks.iter().map(|block| block.len()).sum();
        let theta_dim = rho.len() + psi_dim;
        assert_eq!(eval.gradient.len(), theta_dim);
        assert_eq!(dense_hessian.nrows(), theta_dim);
        assert_eq!(dense_hessian.ncols(), theta_dim);
    }

    /// Wiggle variant of the joint-hyper shape check: builds a three-part
    /// binomial location-scale-wiggle builder on a 14-row fixture, evaluates
    /// `evaluate_custom_family_joint_hyper` in `ValueGradientHessian` mode, and
    /// verifies finiteness of objective/gradient plus a dense square outer
    /// Hessian of size `rho.len() + psi_dim`.
    #[test]
    fn binomial_location_scalewiggle_exact_newton_spatial_joint_hyper_returns_fullhessian() {
        use crate::solver::estimate::reml::unified::EvalMode;

        let n = 14usize;
        let last = n as f64 - 1.0;
        let omega = 2.25 * std::f64::consts::PI;
        // Column 0 is the grid t = i / (n - 1); column 1 is sin(2.25*pi*t).
        let data = Array2::<f64>::from_shape_fn((n, 2), |(row, col)| {
            let t = row as f64 / last;
            if col == 0 {
                t
            } else {
                (omega * t).sin()
            }
        });
        let response =
            Array1::from_iter((0..n).map(|i| if i % 3 == 0 || i % 5 == 0 { 1.0 } else { 0.0 }));
        let unit_weights = Array1::from_elem(n, 1.0);
        let meanspec = simple_matern_term_collection(&[0, 1], 0.45);
        let noisespec = simple_matern_term_collection(&[0, 1], 0.8);

        let seed_grid = Array1::linspace(-1.5, 1.5, n);
        let (wiggle_block, wiggle_knots) =
            BinomialLocationScaleWiggleFamily::buildwiggle_block_input(
                seed_grid.view(),
                2,
                4,
                2,
                false,
            )
            .expect("wiggle block");
        let builder = BinomialLocationScaleWiggleTermBuilder {
            mean_offset: Array1::zeros(n),
            noise_offset: Array1::zeros(n),
            y: response,
            weights: unit_weights,
            link_kind: InverseLink::Standard(StandardLink::Probit),
            meanspec: meanspec.clone(),
            noisespec: noisespec.clone(),
            wiggle_knots,
            wiggle_degree: 2,
            wiggle_block,
        };

        let design_mean =
            build_term_collection_design(data.view(), &meanspec).expect("build mean design");
        let design_noise =
            build_term_collection_design(data.view(), &noisespec).expect("build noise design");
        let frozen_meanspec =
            freeze_term_collection_from_design(&meanspec, &design_mean).expect("freeze mean spec");
        let frozen_noisespec = freeze_term_collection_from_design(&noisespec, &design_noise)
            .expect("freeze noise spec");

        let mean_penalties = builder.mean_penalty_count(&design_mean);
        let noise_penalties = builder.noise_penalty_count(&design_noise);
        let extra_rho = builder.extra_rho0().expect("wiggle rho0");
        let rho = compose_theta_from_hints_test(
            mean_penalties,
            noise_penalties,
            &None,
            &None,
            &extra_rho,
        );
        let blocks = builder
            .build_blocks(&rho, &design_mean, &design_noise, None, None)
            .expect("build blocks");
        let family = builder.build_family(&design_mean, &design_noise);
        let derivative_blocks = builder
            .build_psiderivative_blocks(
                data.view(),
                &frozen_meanspec,
                &frozen_noisespec,
                &design_mean,
                &design_noise,
            )
            .expect("psi derivative blocks");

        let options = BlockwiseFitOptions {
            use_remlobjective: true,
            outer_max_iter: 1,
            ..BlockwiseFitOptions::default()
        };
        let eval = evaluate_custom_family_joint_hyper(
            &family,
            &blocks,
            &options,
            &rho,
            &derivative_blocks,
            None,
            EvalMode::ValueGradientHessian,
        )
        .expect("exact wiggle spatial joint hyper eval");

        assert!(eval.objective.is_finite());
        assert!(eval.gradient.iter().all(|g| g.is_finite()));
        let dense_hessian = eval
            .outer_hessian
            .materialize_dense()
            .expect("exact wiggle spatial joint hyper path should materialize a full [rho, psi] hessian")
            .expect("exact wiggle spatial joint hyper path should return a full [rho, psi] hessian");
        // The hyper-parameter vector is [rho, psi]: one psi entry per derivative.
        let psi_dim: usize = derivative_blocks.iter().map(|block| block.len()).sum();
        let theta_dim = rho.len() + psi_dim;
        assert_eq!(eval.gradient.len(), theta_dim);
        assert_eq!(dense_hessian.nrows(), theta_dim);
        assert_eq!(dense_hessian.ncols(), theta_dim);
    }

    /// Gaussian counterpart of the joint-hyper shape check. The response is an
    /// exact affine function of the two covariate columns; the test evaluates
    /// `evaluate_custom_family_joint_hyper` in `ValueGradientHessian` mode and
    /// requires a finite objective, a finite gradient, and a finite dense
    /// square outer Hessian of size `rho.len() + psi_dim`.
    #[test]
    fn gaussian_location_scale_exact_newton_spatial_joint_hyper_returns_fullhessian() {
        use crate::solver::estimate::reml::unified::EvalMode;

        let n = 12usize;
        let last = n as f64 - 1.0;
        let omega = 2.0 * std::f64::consts::PI;
        // Column 0 is the grid t = i / (n - 1); column 1 is sin(2*pi*t).
        let data = Array2::<f64>::from_shape_fn((n, 2), |(row, col)| {
            let t = row as f64 / last;
            if col == 0 {
                t
            } else {
                (omega * t).sin()
            }
        });
        // Noise-free affine response in the two covariates.
        let response =
            Array1::from_shape_fn(n, |i| 0.4 * data[[i, 0]] - 0.2 * data[[i, 1]] + 0.15);
        let unit_weights = Array1::from_elem(n, 1.0);
        let meanspec = simple_matern_term_collection(&[0, 1], 0.45);
        let noisespec = simple_matern_term_collection(&[0, 1], 0.8);
        let builder = GaussianLocationScaleTermBuilder {
            y: response,
            weights: unit_weights,
            meanspec: meanspec.clone(),
            noisespec: noisespec.clone(),
            mean_offset: Array1::zeros(n),
            noise_offset: Array1::zeros(n),
        };

        let design_mean =
            build_term_collection_design(data.view(), &meanspec).expect("build mean design");
        let design_noise =
            build_term_collection_design(data.view(), &noisespec).expect("build noise design");
        let frozen_meanspec =
            freeze_term_collection_from_design(&meanspec, &design_mean).expect("freeze mean spec");
        let frozen_noisespec = freeze_term_collection_from_design(&noisespec, &design_noise)
            .expect("freeze noise spec");

        let mean_penalties = builder.mean_penalty_count(&design_mean);
        let noise_penalties = builder.noise_penalty_count(&design_noise);
        let rho = compose_theta_from_hints_test(
            mean_penalties,
            noise_penalties,
            &None,
            &None,
            &Array1::zeros(0),
        );
        let blocks = builder
            .build_blocks(&rho, &design_mean, &design_noise, None, None)
            .expect("build blocks");
        let family = builder.build_family(&design_mean, &design_noise);
        let derivative_blocks = builder
            .build_psiderivative_blocks(
                data.view(),
                &frozen_meanspec,
                &frozen_noisespec,
                &design_mean,
                &design_noise,
            )
            .expect("psi derivative blocks");

        let options = BlockwiseFitOptions {
            use_remlobjective: true,
            outer_max_iter: 1,
            ..BlockwiseFitOptions::default()
        };
        let eval = evaluate_custom_family_joint_hyper(
            &family,
            &blocks,
            &options,
            &rho,
            &derivative_blocks,
            None,
            EvalMode::ValueGradientHessian,
        )
        .expect("exact spatial joint hyper eval");

        assert!(eval.objective.is_finite());
        assert!(eval.gradient.iter().all(|g| g.is_finite()));
        let dense_hessian = eval
            .outer_hessian
            .materialize_dense()
            .expect("exact spatial joint hyper path should materialize a full [rho, psi] hessian")
            .expect("exact spatial joint hyper path should return a full [rho, psi] hessian");
        // The hyper-parameter vector is [rho, psi]: one psi entry per derivative.
        let psi_dim: usize = derivative_blocks.iter().map(|block| block.len()).sum();
        let theta_dim = rho.len() + psi_dim;
        assert_eq!(eval.gradient.len(), theta_dim);
        assert_eq!(dense_hessian.nrows(), theta_dim);
        assert_eq!(dense_hessian.ncols(), theta_dim);
        assert!(dense_hessian.iter().all(|h| h.is_finite()));
    }

    /// Common shape checks used by the `*_exposes_joint_psi_hook_surface` tests.
    ///
    /// Queries `family` for its joint ψ terms (index 0), its joint ψ
    /// second-order terms (indices 0, 0), and the mixed ψ-Hessian directional
    /// derivative along a probe direction, panicking if any hook returns `None`.
    /// With `total` the summed coefficient count over `block_states`:
    ///
    /// * score vectors must have length `total`;
    /// * a dense Hessian must be `total x total`, or `0 x 0` when the matching
    ///   operator field is populated;
    /// * the mixed directional derivative must be `total x total`.
    ///
    /// `slope` and `intercept` define the probe as `slope * beta + intercept`
    /// applied elementwise to each block's coefficients, concatenated in block
    /// order. `label` only appears in the panic messages.
    fn assert_joint_psi_hook_surface<F: CustomFamily>(
        family: &F,
        block_states: &[ParameterBlockState],
        blocks: &[ParameterBlockSpec],
        derivative_blocks: &[Vec<CustomFamilyBlockPsiDerivative>],
        slope: f64,
        intercept: f64,
        label: &str,
    ) {
        let first_order = match family
            .exact_newton_joint_psi_terms(block_states, blocks, derivative_blocks, 0)
            .expect("joint psi terms call")
        {
            Some(terms) => terms,
            None => panic!("{label} family should return joint psi terms"),
        };
        let second_order = match family
            .exact_newton_joint_psisecond_order_terms(block_states, blocks, derivative_blocks, 0, 0)
            .expect("joint psi second-order call")
        {
            Some(terms) => terms,
            None => panic!("{label} family should return joint psi second-order terms"),
        };

        let total: usize = block_states.iter().map(|state| state.beta.len()).sum();
        // When an operator form is supplied the dense matrix is expected empty.
        let expected_dense_dim = |has_operator: bool| {
            if has_operator {
                (0, 0)
            } else {
                (total, total)
            }
        };

        assert_eq!(first_order.score_psi.len(), total);
        assert_eq!(
            first_order.hessian_psi.dim(),
            expected_dense_dim(first_order.hessian_psi_operator.is_some())
        );
        assert_eq!(second_order.score_psi_psi.len(), total);
        assert_eq!(
            second_order.hessian_psi_psi.dim(),
            expected_dense_dim(second_order.hessian_psi_psi_operator.is_some())
        );

        // Probe direction: affine image of every coefficient, stacked by block.
        let d_beta_flat = Array1::<f64>::from_iter(block_states.iter().flat_map(|state| {
            state
                .beta
                .iter()
                .map(move |&coef| slope * coef + intercept)
        }));
        let mixed = match family
            .exact_newton_joint_psihessian_directional_derivative(
                block_states,
                blocks,
                derivative_blocks,
                0,
                &d_beta_flat,
            )
            .expect("joint psi mixed drift call")
        {
            Some(drift) => drift,
            None => panic!("{label} family should return joint psi mixed drift"),
        };
        assert_eq!(mixed.dim(), (total, total));
    }

    /// Builds a binomial location-scale-wiggle family on a 12-row fixture,
    /// hand-assembles block states (wiggle coefficients set to 0.04, others from
    /// each spec's `initial_beta` or zeros), confirms the state evaluates, and
    /// then runs the shared joint-ψ hook shape checks.
    #[test]
    fn binomial_location_scalewiggle_family_exposes_joint_psi_hook_surface() {
        let n = 12usize;
        let last = n as f64 - 1.0;
        let omega = 1.75 * std::f64::consts::PI;
        // Column 0 is the grid t = i / (n - 1); column 1 is cos(1.75*pi*t).
        let data = Array2::<f64>::from_shape_fn((n, 2), |(row, col)| {
            let t = row as f64 / last;
            if col == 0 {
                t
            } else {
                (omega * t).cos()
            }
        });
        let response =
            Array1::from_iter((0..n).map(|i| if i % 4 == 0 || i % 5 == 0 { 1.0 } else { 0.0 }));
        let unit_weights = Array1::from_elem(n, 1.0);
        let meanspec = simple_matern_term_collection(&[0, 1], 0.4);
        let noisespec = simple_matern_term_collection(&[0, 1], 0.7);

        let seed_grid = Array1::linspace(-1.25, 1.25, n);
        let (wiggle_block, wiggle_knots) =
            BinomialLocationScaleWiggleFamily::buildwiggle_block_input(
                seed_grid.view(),
                2,
                3,
                2,
                false,
            )
            .expect("wiggle block");
        let builder = BinomialLocationScaleWiggleTermBuilder {
            mean_offset: Array1::zeros(n),
            noise_offset: Array1::zeros(n),
            y: response,
            weights: unit_weights,
            link_kind: InverseLink::Standard(StandardLink::Probit),
            meanspec: meanspec.clone(),
            noisespec: noisespec.clone(),
            wiggle_knots,
            wiggle_degree: 2,
            wiggle_block,
        };

        let design_mean =
            build_term_collection_design(data.view(), &meanspec).expect("build mean design");
        let design_noise =
            build_term_collection_design(data.view(), &noisespec).expect("build noise design");
        let frozen_meanspec =
            freeze_term_collection_from_design(&meanspec, &design_mean).expect("freeze mean spec");
        let frozen_noisespec = freeze_term_collection_from_design(&noisespec, &design_noise)
            .expect("freeze noise spec");

        let mean_penalties = builder.mean_penalty_count(&design_mean);
        let noise_penalties = builder.noise_penalty_count(&design_noise);
        let extra_rho = builder.extra_rho0().expect("wiggle rho0");
        let rho = compose_theta_from_hints_test(
            mean_penalties,
            noise_penalties,
            &None,
            &None,
            &extra_rho,
        );
        let blocks = builder
            .build_blocks(&rho, &design_mean, &design_noise, None, None)
            .expect("build blocks");
        let family = builder.build_family(&design_mean, &design_noise);

        // States are built in block order: `block_geometry` receives the states
        // assembled so far, so this loop must stay sequential.
        let mut block_states = Vec::<ParameterBlockState>::with_capacity(blocks.len());
        for (position, spec) in blocks.iter().enumerate() {
            let mut beta = match spec.initial_beta.as_ref() {
                Some(initial) => initial.clone(),
                None => Array1::zeros(spec.design.ncols()),
            };
            if position == BinomialLocationScaleWiggleFamily::BLOCK_WIGGLE {
                beta.fill(0.04);
            }
            let (design, offset) = family
                .block_geometry(&block_states, spec)
                .expect("hook fixture block geometry");
            let eta = design.matrixvectormultiply(&beta) + &offset;
            block_states.push(ParameterBlockState { beta, eta });
        }
        family
            .evaluate(&block_states)
            .expect("hook fixture state should evaluate");

        let derivative_blocks = builder
            .build_psiderivative_blocks(
                data.view(),
                &frozen_meanspec,
                &frozen_noisespec,
                &design_mean,
                &design_noise,
            )
            .expect("psi derivative blocks");
        assert_joint_psi_hook_surface(
            &family,
            &block_states,
            &blocks,
            &derivative_blocks,
            0.25,
            0.1,
            "wiggle",
        );
    }

    /// Fits a Gaussian location-scale family (one outer iteration, REML
    /// objective enabled) on a 10-row fixture and feeds the fitted block states
    /// to the shared joint-ψ hook shape checks.
    #[test]
    fn gaussian_location_scale_family_exposes_joint_psi_hook_surface() {
        let n = 10usize;
        let last = n as f64 - 1.0;
        let omega = 2.0 * std::f64::consts::PI;
        // Column 0 is the grid t = i / (n - 1); column 1 is cos(2*pi*t).
        let data = Array2::<f64>::from_shape_fn((n, 2), |(row, col)| {
            let t = row as f64 / last;
            if col == 0 {
                t
            } else {
                (omega * t).cos()
            }
        });
        // Noise-free affine response in the two covariates.
        let response =
            Array1::from_shape_fn(n, |i| 0.3 * data[[i, 0]] - 0.15 * data[[i, 1]] + 0.2);
        let unit_weights = Array1::from_elem(n, 1.0);
        let meanspec = simple_matern_term_collection(&[0, 1], 0.4);
        let noisespec = simple_matern_term_collection(&[0, 1], 0.7);
        let builder = GaussianLocationScaleTermBuilder {
            y: response,
            weights: unit_weights,
            meanspec: meanspec.clone(),
            noisespec: noisespec.clone(),
            mean_offset: Array1::zeros(n),
            noise_offset: Array1::zeros(n),
        };

        let design_mean =
            build_term_collection_design(data.view(), &meanspec).expect("build mean design");
        let design_noise =
            build_term_collection_design(data.view(), &noisespec).expect("build noise design");
        let frozen_meanspec =
            freeze_term_collection_from_design(&meanspec, &design_mean).expect("freeze mean spec");
        let frozen_noisespec = freeze_term_collection_from_design(&noisespec, &design_noise)
            .expect("freeze noise spec");

        let mean_penalties = builder.mean_penalty_count(&design_mean);
        let noise_penalties = builder.noise_penalty_count(&design_noise);
        let rho = compose_theta_from_hints_test(
            mean_penalties,
            noise_penalties,
            &None,
            &None,
            &Array1::zeros(0),
        );
        let blocks = builder
            .build_blocks(&rho, &design_mean, &design_noise, None, None)
            .expect("build blocks");
        let family = builder.build_family(&design_mean, &design_noise);

        let options = BlockwiseFitOptions {
            use_remlobjective: true,
            outer_max_iter: 1,
            ..BlockwiseFitOptions::default()
        };
        let fit = fit_custom_family(&family, &blocks, &options)
            .expect("fit gaussian family for joint psi hooks");

        let derivative_blocks = builder
            .build_psiderivative_blocks(
                data.view(),
                &frozen_meanspec,
                &frozen_noisespec,
                &design_mean,
                &design_noise,
            )
            .expect("psi derivative blocks");
        assert_joint_psi_hook_surface(
            &family,
            &fit.block_states,
            &blocks,
            &derivative_blocks,
            0.2,
            0.15,
            "gaussian",
        );
    }

    /// A single negative weight (index 2) must make
    /// `fit_gaussian_location_scale_terms` return an error whose text mentions
    /// that weights must be finite and non-negative.
    #[test]
    fn gaussian_location_scale_terms_reject_invalidweights_early() {
        let n = 8usize;
        // Column 0 is the row index; column 1 is its sine.
        let data = Array2::<f64>::from_shape_fn((n, 2), |(row, col)| {
            let x = row as f64;
            if col == 0 {
                x
            } else {
                x.sin()
            }
        });
        // All-ones weights except for one invalid negative entry.
        let bad_weights = Array1::from_shape_fn(n, |i| if i == 2 { -0.5 } else { 1.0 });
        let spec = GaussianLocationScaleTermSpec {
            y: Array1::zeros(n),
            weights: bad_weights,
            meanspec: simple_matern_term_collection(&[0, 1], 0.35),
            log_sigmaspec: simple_matern_term_collection(&[0, 1], 0.6),
            mean_offset: Array1::zeros(n),
            log_sigma_offset: Array1::zeros(n),
        };

        let outcome = fit_gaussian_location_scale_terms(
            data.view(),
            spec,
            &BlockwiseFitOptions::default(),
            &spatial_kappa_options(),
        );
        let err = outcome
            .err()
            .unwrap_or_else(|| panic!("term API should reject negative weights"));
        assert!(err.contains("weights must be finite and non-negative"));
    }

    /// A response value outside `[0, 1]` (the `2.0` at index 3) must make
    /// `fit_binomial_location_scale_terms` return an error whose text mentions
    /// the `[0,1]` response requirement.
    #[test]
    fn binomial_location_scale_terms_reject_invalid_response_early() {
        let n = 8usize;
        // Column 0 is the row index; column 1 is its cosine.
        let data = Array2::<f64>::from_shape_fn((n, 2), |(row, col)| {
            let x = row as f64;
            if col == 0 {
                x
            } else {
                x.cos()
            }
        });
        let bad_response = Array1::from_vec(vec![0.0, 1.0, 0.0, 2.0, 1.0, 0.0, 1.0, 0.0]);
        let spec = BinomialLocationScaleTermSpec {
            y: bad_response,
            weights: Array1::from_elem(n, 1.0),
            link_kind: InverseLink::Standard(StandardLink::Probit),
            thresholdspec: simple_matern_term_collection(&[0, 1], 0.4),
            log_sigmaspec: simple_matern_term_collection(&[0, 1], 0.75),
            threshold_offset: Array1::zeros(n),
            log_sigma_offset: Array1::zeros(n),
        };

        let outcome = fit_binomial_location_scale_terms(
            data.view(),
            spec,
            &BlockwiseFitOptions::default(),
            &spatial_kappa_options(),
        );
        let err = outcome
            .err()
            .unwrap_or_else(|| panic!("term API should reject invalid binomial responses"));
        assert!(err.contains("binomial response must be finite in [0,1]"));
    }

    /// Supplying a non-empty `log_sigmaspec` to the binomial term API must be
    /// rejected; the error text is expected to explain that only the composite
    /// `q = -threshold / sigma` is identified and that `log_sigma` must be
    /// intercept-only/fixed.
    #[test]
    fn binomial_location_scale_terms_reject_free_log_sigma_terms_early() {
        let n = 8usize;
        let data = Array2::<f64>::zeros((n, 2));
        // Alternating 0/1 response: even rows 0, odd rows 1.
        let response = Array1::from_shape_fn(n, |i| if i % 2 == 0 { 0.0 } else { 1.0 });
        let spec = BinomialLocationScaleTermSpec {
            y: response,
            weights: Array1::from_elem(n, 1.0),
            link_kind: InverseLink::Standard(StandardLink::Logit),
            thresholdspec: simple_matern_term_collection(&[0, 1], 0.4),
            log_sigmaspec: simple_matern_term_collection(&[0, 1], 0.75),
            threshold_offset: Array1::zeros(n),
            log_sigma_offset: Array1::zeros(n),
        };

        let outcome = fit_binomial_location_scale_terms(
            data.view(),
            spec,
            &BlockwiseFitOptions::default(),
            &spatial_kappa_options(),
        );
        let err = outcome
            .err()
            .unwrap_or_else(|| panic!("Bernoulli free log_sigma terms must be rejected"));
        assert!(err.contains("identify only the composite q = -threshold / sigma"));
        assert!(err.contains("log_sigma must be intercept-only/fixed"));
    }

    /// A data matrix with one fewer row than the response must make
    /// `fit_binomial_location_scale_terms` return an error whose text mentions
    /// the row-count/response-length mismatch.
    #[test]
    fn binomial_location_scale_terms_reject_datarow_mismatch_early() {
        let n = 8usize;
        // Deliberately one row short of the response length.
        let short_data = Array2::<f64>::zeros((n - 1, 2));
        let spec = BinomialLocationScaleTermSpec {
            y: Array1::zeros(n),
            weights: Array1::from_elem(n, 1.0),
            link_kind: InverseLink::Standard(StandardLink::Probit),
            thresholdspec: simple_matern_term_collection(&[0, 1], 0.4),
            log_sigmaspec: simple_matern_term_collection(&[0, 1], 0.75),
            threshold_offset: Array1::zeros(n),
            log_sigma_offset: Array1::zeros(n),
        };

        let outcome = fit_binomial_location_scale_terms(
            short_data.view(),
            spec,
            &BlockwiseFitOptions::default(),
            &spatial_kappa_options(),
        );
        let err = outcome
            .err()
            .unwrap_or_else(|| panic!("term API should reject data/y row mismatches"));
        assert!(err.contains("data row count must match response length"));
    }

    /// End-to-end smoke test: `fit_gaussian_location_scale_terms` on a 32-row,
    /// two-covariate fixture must succeed with a finite penalized objective and
    /// exactly two block states.
    #[test]
    fn gaussian_location_scale_termswith_matern_spatial_blocks_fit_finitely() {
        let n = 32usize;
        let last = n as f64 - 1.0;
        let omega = 2.0 * std::f64::consts::PI;
        // Column 0 is the grid t = i / (n - 1); column 1 is sin(2*pi*t).
        let data = Array2::<f64>::from_shape_fn((n, 2), |(row, col)| {
            let t = row as f64 / last;
            if col == 0 {
                t
            } else {
                (omega * t).sin()
            }
        });
        // Noise-free affine response in the two covariates.
        let response =
            Array1::from_shape_fn(n, |i| 0.5 * data[[i, 0]] - 0.25 * data[[i, 1]] + 0.1);
        let unit_weights = Array1::from_elem(n, 1.0);
        let spec = GaussianLocationScaleTermSpec {
            y: response,
            weights: unit_weights,
            meanspec: simple_matern_term_collection(&[0, 1], 0.35),
            log_sigmaspec: simple_matern_term_collection(&[0, 1], 0.6),
            mean_offset: Array1::zeros(n),
            log_sigma_offset: Array1::zeros(n),
        };

        let fitted = fit_gaussian_location_scale_terms(
            data.view(),
            spec,
            &spatial_fit_smoke_options(),
            &spatial_kappa_options(),
        )
        .expect("gaussian location-scale spatial fit");
        assert!(fitted.fit.penalized_objective.is_finite());
        assert_eq!(fitted.fit.block_states.len(), 2);
    }

    /// Regression guard for issue #365: attaching a smooth log-σ block to a
    /// Gaussian model must not damage the mean fit when the data are actually
    /// homoscedastic.
    ///
    /// The reported repro used `y = 1 + 0.7x + sin(x) + N(0, σ²)` with constant
    /// σ, fitted with a smooth mean *and* a smooth log-σ block, and observed a
    /// mean RMSE of ~1.5 (fitted mean range shrunk toward the grand mean)
    /// against ~0.03 for a plain GAM. The explanation recorded with the issue
    /// is that a freely wiggling scale block can soak up mean-residual
    /// structure as variance, letting the joint REML over-smooth the mean.
    /// This test asserts the headline contract directly: the recovered mean
    /// must keep tracking the truth rather than flattening.
    ///
    /// The fixture is fully deterministic — LCG uniforms are mapped through
    /// the probit to obtain the Gaussian residuals — and it runs the real
    /// two-block joint solve end to end rather than a linear-algebra stub. A
    /// mean-flattening regression pushes the RMSE far past the asserted bound.
    #[test]
    fn gaussian_location_scale_smooth_noise_homoscedastic_recovers_mean() {
        let n = 300usize;

        // Deterministic LCG producing uniforms; the probit later turns them
        // into standard-normal draws.
        let two_pow_53 = (1u64 << 53) as f64;
        let mut state: u64 = 0x2545_F491_4F6C_DD1D;
        let mut next_unit = || {
            state = state
                .wrapping_mul(6364136223846793005)
                .wrapping_add(1442695040888963407);
            // Keep the top 53 bits as a fraction, then pull it strictly inside
            // (0, 1) so the probit cannot return an infinite value.
            let fraction = (state >> 11) as f64 / two_pow_53;
            fraction.clamp(1.0e-6, 1.0 - 1.0e-6)
        };

        // Covariate drawn on [-3, 3], the same range as the released repro.
        // All x draws are consumed before any residual draw.
        let xs: Vec<f64> = (0..n).map(|_| -3.0 + 6.0 * next_unit()).collect();
        let data = Array2::<f64>::from_shape_fn((n, 1), |(row, _)| xs[row]);
        let true_mean: Vec<f64> = xs.iter().map(|&x| 1.0 + 0.7 * x + x.sin()).collect();

        // One constant scale for every row: homoscedastic data (het = 0).
        let true_sigma = (-0.5_f64).exp();
        let response = Array1::from_iter(true_mean.iter().map(|&mu| {
            let z = standard_normal_quantile(next_unit()).expect("finite probit draw");
            mu + true_sigma * z
        }));
        let unit_weights = Array1::from_elem(n, 1.0);

        // Smooth mean together with a smooth log-σ block is precisely the
        // setup that failed in #365; linear noise terms were unaffected.
        let spec = GaussianLocationScaleTermSpec {
            y: response,
            weights: unit_weights,
            meanspec: simple_matern_term_collection(&[0], 0.6),
            log_sigmaspec: simple_matern_term_collection(&[0], 0.6),
            mean_offset: Array1::zeros(n),
            log_sigma_offset: Array1::zeros(n),
        };
        let fitted = fit_gaussian_location_scale_terms(
            data.view(),
            spec,
            &spatial_fit_smoke_options(),
            &spatial_kappa_options(),
        )
        .expect("gaussian location-scale smooth-noise homoscedastic fit");

        // With a zero mean offset and the identity link on the mean block
        // (BLOCK_MU = 0), that block's η is the fitted mean itself.
        let fitted_mean = &fitted.fit.block_states[GaussianLocationScaleFamily::BLOCK_MU].eta;
        assert_eq!(fitted_mean.len(), n);
        let sq_err = fitted_mean
            .iter()
            .zip(true_mean.iter())
            .fold(0.0, |acc, (estimate, truth)| {
                let d = estimate - truth;
                acc + d * d
            });
        let mean_rmse = (sq_err / n as f64).sqrt();

        // The #365 collapse yields RMSE ~1.5, while a properly converged mean
        // sits well inside the noise scale. The threshold is set far under the
        // failure level but above honest small-n sampling/penalty bias, so the
        // check separates bug from fix without being vacuous.
        assert!(
            mean_rmse < 0.5,
            "smooth noise_formula degraded the homoscedastic mean fit (issue #365): \
             mean RMSE = {mean_rmse:.4} (expected < 0.5; the regression produced ~1.5)"
        );
    }

    /// End-to-end smoke test: `fit_binomial_location_scale_terms` with a
    /// two-covariate threshold spec and an empty log-σ term collection must
    /// succeed with a finite penalized objective and exactly two block states.
    #[test]
    fn binomial_location_scale_termswith_matern_spatial_blocks_fit_finitely() {
        let n = 36usize;
        let last = n as f64 - 1.0;
        let omega = 3.0 * std::f64::consts::PI;
        // Column 0 is the grid t = i / (n - 1); column 1 is cos(3*pi*t).
        let data = Array2::<f64>::from_shape_fn((n, 2), |(row, col)| {
            let t = row as f64 / last;
            if col == 0 {
                t
            } else {
                (omega * t).cos()
            }
        });
        let response =
            Array1::from_iter((0..n).map(|i| if i % 5 == 0 || i % 7 == 0 { 1.0 } else { 0.0 }));
        let unit_weights = Array1::from_elem(n, 1.0);
        let spec = BinomialLocationScaleTermSpec {
            y: response,
            weights: unit_weights,
            link_kind: InverseLink::Standard(StandardLink::Probit),
            thresholdspec: simple_matern_term_collection(&[0, 1], 0.4),
            log_sigmaspec: empty_term_collection(),
            threshold_offset: Array1::zeros(n),
            log_sigma_offset: Array1::zeros(n),
        };

        let fitted = fit_binomial_location_scale_terms(
            data.view(),
            spec,
            &spatial_fit_smoke_options(),
            &spatial_kappa_options(),
        )
        .expect("binomial location-scale spatial fit");
        assert!(fitted.fit.penalized_objective.is_finite());
        assert_eq!(fitted.fit.block_states.len(), 2);
    }

    /// End-to-end smoke test: `fit_binomial_location_scalewiggle_terms` with a
    /// two-covariate threshold spec, an empty log-σ term collection, and a
    /// wiggle block must succeed with a finite penalized objective and exactly
    /// three block states.
    #[test]
    fn binomial_location_scalewiggle_termswith_matern_spatial_blocks_fit_finitely() {
        let n = 30usize;
        let last = n as f64 - 1.0;
        let omega = 2.5 * std::f64::consts::PI;
        // Column 0 is the grid t = i / (n - 1); column 1 is sin(2.5*pi*t).
        let data = Array2::<f64>::from_shape_fn((n, 2), |(row, col)| {
            let t = row as f64 / last;
            if col == 0 {
                t
            } else {
                (omega * t).sin()
            }
        });
        let response =
            Array1::from_iter((0..n).map(|i| if i % 4 == 0 || i % 9 == 0 { 1.0 } else { 0.0 }));
        let unit_weights = Array1::from_elem(n, 1.0);

        let seed_grid = Array1::linspace(-1.5, 1.5, n);
        let (wiggle_block, wiggle_knots) =
            BinomialLocationScaleWiggleFamily::buildwiggle_block_input(
                seed_grid.view(),
                2,
                4,
                2,
                false,
            )
            .expect("wiggle block");
        let spec = BinomialLocationScaleWiggleTermSpec {
            y: response,
            weights: unit_weights,
            link_kind: InverseLink::Standard(StandardLink::Probit),
            thresholdspec: simple_matern_term_collection(&[0, 1], 0.45),
            log_sigmaspec: empty_term_collection(),
            threshold_offset: Array1::zeros(n),
            log_sigma_offset: Array1::zeros(n),
            wiggle_knots,
            wiggle_degree: 2,
            wiggle_block,
        };

        let fitted = fit_binomial_location_scalewiggle_terms(
            data.view(),
            spec,
            &spatial_fit_smoke_options(),
            &spatial_kappa_options(),
        )
        .expect("binomial location-scale wiggle spatial fit");
        assert!(fitted.fit.penalized_objective.is_finite());
        assert_eq!(fitted.fit.block_states.len(), 3);
    }

    /// `evaluate` on the wiggle family must return one exact-Newton working
    /// set per block (threshold, log-sigma, wiggle), each with a gradient and
    /// dense Hessian of the expected size and with finite entries.
    #[test]
    fn wiggle_family_evaluate_returns_exact_newton_blocks() {
        let n = 6usize;
        let y = Array1::from_vec(vec![0.0, 1.0, 0.0, 1.0, 1.0, 0.0]);
        let weights = Array1::from_elem(n, 1.0);

        let threshold_block = intercept_block(n);
        let log_sigma_block = intercept_block(n);
        let q_seed = Array1::linspace(-1.5, 1.5, n);
        let (wiggle_block, knots) = BinomialLocationScaleWiggleFamily::buildwiggle_block_input(
            q_seed.view(),
            2,
            3,
            2,
            false,
        )
        .expect("wiggle block");

        let family = BinomialLocationScaleWiggleFamily {
            y: y.clone(),
            weights: weights.clone(),
            link_kind: InverseLink::Standard(StandardLink::Probit),
            threshold_design: Some(threshold_block.design.clone()),
            log_sigma_design: Some(log_sigma_block.design.clone()),
            wiggle_knots: knots,
            wiggle_degree: 2,
            policy: crate::resource::ResourcePolicy::default_library(),
        };

        // Constant linear predictors for the two single-coefficient blocks.
        let eta_t = Array1::from_elem(n, 0.4);
        let eta_ls = Array1::from_elem(n, -0.2);
        let core_for_q0 =
            binomial_location_scale_core(&y, &weights, &eta_t, &eta_ls, None, &family.link_kind)
                .expect("core q0");

        // The wiggle predictor is the wiggle design at q0 applied to betaw.
        let betaw = Array1::from_elem(wiggle_block.design.ncols(), 0.05);
        let wiggle_dim = betaw.len();
        let etaw = family
            .wiggle_design(core_for_q0.q0.view())
            .expect("wiggle design")
            .dot(&betaw);

        let states = [
            ParameterBlockState {
                beta: Array1::from_vec(vec![0.4]),
                eta: eta_t,
            },
            ParameterBlockState {
                beta: Array1::from_vec(vec![-0.2]),
                eta: eta_ls,
            },
            ParameterBlockState {
                beta: betaw,
                eta: etaw,
            },
        ];
        let eval = family.evaluate(&states).expect("evaluate");

        assert_eq!(eval.blockworking_sets.len(), 3);

        // Blocks 0 and 1 each carry a single coefficient, so both must be 1×1.
        let scalar_blocks = [
            (0usize, "threshold block should be exact newton"),
            (1usize, "log-sigma block should be exact newton"),
        ];
        for &(idx, not_exact_msg) in &scalar_blocks {
            match &eval.blockworking_sets[idx] {
                BlockWorkingSet::Diagonal { .. } => panic!("{}", not_exact_msg),
                BlockWorkingSet::ExactNewton { gradient, hessian } => {
                    let dense = hessian.to_dense();
                    assert_eq!(gradient.len(), 1);
                    assert_eq!(dense.dim(), (1, 1));
                    assert!(gradient[0].is_finite());
                    assert!(dense[[0, 0]].is_finite());
                }
            }
        }

        // The wiggle block is sized by its coefficient vector.
        match &eval.blockworking_sets[2] {
            BlockWorkingSet::Diagonal { .. } => panic!("wiggle block should be exact newton"),
            BlockWorkingSet::ExactNewton { gradient, hessian } => {
                let dense = hessian.to_dense();
                assert_eq!(gradient.len(), wiggle_dim);
                assert_eq!(dense.nrows(), wiggle_dim);
                assert_eq!(dense.ncols(), wiggle_dim);
                assert!(gradient.iter().all(|v| v.is_finite()));
                assert!(dense.iter().all(|v| v.is_finite()));
            }
        }
    }

    /// Checks `exact_newton_hessian_directional_derivative` for each of the
    /// three wiggle-family blocks against a forward finite difference of the
    /// block Hessians returned by `evaluate`.
    ///
    /// For every block the coefficients are shifted by `eps` along the all-ones
    /// direction. The threshold and log-sigma predictors are then rebuilt from
    /// their designs, and the wiggle predictor is rebuilt from the wiggle design
    /// at the perturbed `q0`, before the Hessian is re-evaluated.
    #[test]
    fn wiggle_family_exact_newton_directional_derivative_matches_finite_difference() {
        let n = 7usize;
        let y = Array1::from_vec(vec![0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0]);
        let weights = Array1::from_vec(vec![1.0; n]);
        let threshold_block = intercept_block(n);
        let log_sigma_block = intercept_block(n);
        let q_seed = Array1::linspace(-1.4, 1.4, n);
        let (wiggle_block, knots) = BinomialLocationScaleWiggleFamily::buildwiggle_block_input(
            q_seed.view(),
            3,
            4,
            2,
            false,
        )
        .expect("wiggle block");
        let threshold_design = threshold_block.design.clone();
        let log_sigma_design = log_sigma_block.design.clone();
        let family = BinomialLocationScaleWiggleFamily {
            y: y.clone(),
            weights: weights.clone(),
            link_kind: InverseLink::Standard(StandardLink::Probit),
            threshold_design: Some(threshold_design.clone()),
            log_sigma_design: Some(log_sigma_design.clone()),
            wiggle_knots: knots,
            wiggle_degree: 3,
            policy: crate::resource::ResourcePolicy::default_library(),
        };

        let beta_t = Array1::from_vec(vec![0.25]);
        let beta_ls = Array1::from_vec(vec![-0.15]);
        let eta_t = threshold_design.matrixvectormultiply(&beta_t);
        let eta_ls = log_sigma_design.matrixvectormultiply(&beta_ls);
        let core_for_q0 =
            binomial_location_scale_core(&y, &weights, &eta_t, &eta_ls, None, &family.link_kind)
                .expect("core q0");
        let betaw = Array1::from_vec(vec![0.04; wiggle_block.design.ncols()]);
        let etaw = family
            .wiggle_design(core_for_q0.q0.view())
            .expect("wiggle design")
            .dot(&betaw);

        // None of these vectors is read again, so they are moved into the
        // states rather than cloned.
        let states = vec![
            ParameterBlockState {
                beta: beta_t,
                eta: eta_t,
            },
            ParameterBlockState {
                beta: beta_ls,
                eta: eta_ls,
            },
            ParameterBlockState {
                beta: betaw,
                eta: etaw,
            },
        ];

        // Borrows the evaluation so the base evaluation can be reused for every
        // block without deep-cloning it on each iteration.
        let extract = |eval: &FamilyEvaluation, idx: usize| -> Array2<f64> {
            match &eval.blockworking_sets[idx] {
                BlockWorkingSet::ExactNewton { hessian, .. } => hessian.to_dense(),
                BlockWorkingSet::Diagonal { .. } => panic!("expected exact newton"),
            }
        };

        let base_eval = family.evaluate(&states).expect("base eval");
        let eps = 1e-6;
        for block_idx in 0..3 {
            let d_beta = Array1::ones(states[block_idx].beta.len());
            let analytic = family
                .exact_newton_hessian_directional_derivative(&states, block_idx, &d_beta)
                .expect("analytic dH")
                .expect("expected derivative");

            // Perturb only the current block's coefficients, then rebuild all
            // three linear predictors so the state stays self-consistent.
            let mut plus_states = states.clone();
            plus_states[block_idx].beta = &plus_states[block_idx].beta + &(eps * &d_beta);
            plus_states[BinomialLocationScaleWiggleFamily::BLOCK_T].eta = threshold_design
                .matrixvectormultiply(
                    &plus_states[BinomialLocationScaleWiggleFamily::BLOCK_T].beta,
                );
            plus_states[BinomialLocationScaleWiggleFamily::BLOCK_LOG_SIGMA].eta = log_sigma_design
                .matrixvectormultiply(
                    &plus_states[BinomialLocationScaleWiggleFamily::BLOCK_LOG_SIGMA].beta,
                );
            let plus_core_q0 = binomial_location_scale_core(
                &y,
                &weights,
                &plus_states[BinomialLocationScaleWiggleFamily::BLOCK_T].eta,
                &plus_states[BinomialLocationScaleWiggleFamily::BLOCK_LOG_SIGMA].eta,
                None,
                &family.link_kind,
            )
            .expect("plus core q0");
            plus_states[BinomialLocationScaleWiggleFamily::BLOCK_WIGGLE].eta = family
                .wiggle_design(plus_core_q0.q0.view())
                .expect("plus wiggle design")
                .dot(&plus_states[BinomialLocationScaleWiggleFamily::BLOCK_WIGGLE].beta);

            let plus_eval = family.evaluate(&plus_states).expect("plus eval");
            let h_plus = extract(&plus_eval, block_idx);
            let h_base = extract(&base_eval, block_idx);
            // Forward difference of the block Hessian along `d_beta`.
            let fd = (h_plus - h_base) / eps;
            crate::test_support::assert_matrix_derivativefd(
                &fd,
                &analytic,
                5e-4,
                &format!("block {} dH", block_idx),
            );
        }
    }

    /// The threshold block's exact-Newton Hessian reported by `evaluate`
    /// (logit link, degree-3 wiggle) must agree within 5e-6 with the second
    /// derivative of `wiggle_negloglik_threshold_numdual` taken with respect to
    /// the single threshold coefficient.
    #[test]
    fn wiggle_threshold_block_exacthessian_matches_autodiffobjective() {
        let n = 7usize;
        let y = Array1::from_vec(vec![0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0]);
        let weights = Array1::from_elem(n, 1.0);

        let threshold_block = intercept_block(n);
        let log_sigma_block = intercept_block(n);
        let q_seed = Array1::linspace(-1.4, 1.4, n);
        let (wiggle_block, knots) = BinomialLocationScaleWiggleFamily::buildwiggle_block_input(
            q_seed.view(),
            3,
            4,
            2,
            false,
        )
        .expect("wiggle block");
        let threshold_design = threshold_block.design.clone();
        let log_sigma_design = log_sigma_block.design.clone();

        let family = BinomialLocationScaleWiggleFamily {
            y: y.clone(),
            weights: weights.clone(),
            link_kind: InverseLink::Standard(StandardLink::Logit),
            threshold_design: Some(threshold_design.clone()),
            log_sigma_design: Some(log_sigma_design.clone()),
            wiggle_knots: knots.clone(),
            wiggle_degree: 3,
            policy: crate::resource::ResourcePolicy::default_library(),
        };

        // Scalar coefficients are kept around because the autodiff oracle
        // below is evaluated at the same point.
        let beta_t0 = 0.25;
        let beta_ls0 = -0.15;
        let threshold_coef = array![beta_t0];
        let log_sigma_coef = array![beta_ls0];
        let eta_t = threshold_design.matrixvectormultiply(&threshold_coef);
        let eta_ls = log_sigma_design.matrixvectormultiply(&log_sigma_coef);

        let core_for_q0 =
            binomial_location_scale_core(&y, &weights, &eta_t, &eta_ls, None, &family.link_kind)
                .expect("core q0");
        let betaw = Array1::from_elem(wiggle_block.design.ncols(), 0.04);
        let etaw = family
            .wiggle_design(core_for_q0.q0.view())
            .expect("wiggle design")
            .dot(&betaw);

        let states = vec![
            ParameterBlockState {
                beta: threshold_coef,
                eta: eta_t,
            },
            ParameterBlockState {
                beta: log_sigma_coef,
                eta: eta_ls,
            },
            ParameterBlockState {
                beta: betaw.clone(),
                eta: etaw,
            },
        ];

        // Analytic side: dense Hessian of the threshold block.
        let eval = family.evaluate(&states).expect("evaluate wiggle family");
        let threshold_set = &eval.blockworking_sets[BinomialLocationScaleWiggleFamily::BLOCK_T];
        let blockhessian = match threshold_set {
            BlockWorkingSet::Diagonal { .. } => panic!("expected exact newton threshold block"),
            BlockWorkingSet::ExactNewton { hessian, .. } => hessian.to_dense(),
        };

        // Reference side: second derivative of the scalar oracle in β_t.
        let (_, _, hess_ad) = second_derivative(
            |bt| wiggle_negloglik_threshold_numdual(bt, beta_ls0, &betaw, &y, &weights, &knots, 3),
            beta_t0,
        );

        let mismatch = (blockhessian[[0, 0]] - hess_ad).abs();
        assert!(
            mismatch <= 5e-6,
            "wiggle threshold exact hessian mismatch: evaluate()={} autodiff={}",
            blockhessian[[0, 0]],
            hess_ad
        );
    }

    /// Checks the first-order ψ terms returned by
    /// `exact_newton_joint_psi_terms` (objective derivative, score derivative,
    /// and the `[[0,0]]`, `[[0,1]]`, `[[1,1]]` entries of the Hessian
    /// ψ-derivative) for a one-column Gaussian location-scale family against
    /// derivatives of the scalar `*_numdual` oracles evaluated at ψ = 0.
    ///
    /// Only the log-σ block carries a ψ derivative here; the μ block's
    /// derivative list is empty. The (μ, ls) and (ls, ls) Hessian entries are
    /// compared after adding hand-derived corrections to the autodiff value, as
    /// explained in the comments next to each correction.
    #[test]
    fn gaussian_log_sigma_psi_terms_match_autodiff_scalar_objective() {
        // Fixture: three observations, single-column μ and log-σ designs, plus
        // first and second ψ-derivatives of the log-σ design column.
        let y = array![0.25, -0.4, 1.1];
        let weights = array![1.0, 0.7, 1.3];
        let x_mu0 = array![1.0, -0.35, 0.6];
        let x_ls0 = array![0.8, -0.25, 0.45];
        let x_ls_psi = array![0.2, -0.15, 0.1];
        let x_ls_psi_psi = array![0.05, -0.03, 0.04];
        let beta_mu0 = 0.35_f64;
        let beta_ls0 = -0.2_f64;

        // Column-matrix (n×1) versions of the base designs for the family.
        let x_mu0_mat = x_mu0.clone().insert_axis(Axis(1));
        let x_ls0_mat = x_ls0.clone().insert_axis(Axis(1));
        let family = GaussianLocationScaleFamily {
            y: y.clone(),
            weights: weights.clone(),
            mu_design: Some(DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(
                x_mu0_mat.clone(),
            ))),
            log_sigma_design: Some(DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(
                x_ls0_mat.clone(),
            ))),
            policy: crate::resource::ResourcePolicy::default_library(),
            cached_row_scalars: std::sync::RwLock::new(None),
        };
        let specs = vec![
            gaussian_psi_test_spec("mu", x_mu0_mat.clone()),
            gaussian_psi_test_spec("log_sigma", x_ls0_mat.clone()),
        ];
        let states = vec![
            ParameterBlockState {
                beta: array![beta_mu0],
                eta: x_mu0_mat.column(0).to_owned() * beta_mu0,
            },
            ParameterBlockState {
                beta: array![beta_ls0],
                eta: x_ls0_mat.column(0).to_owned() * beta_ls0,
            },
        ];
        // One entry per block: no ψ derivative for μ, a single design-only
        // derivative (zero penalty derivatives) for log-σ.
        let derivative_blocks = vec![
            Vec::new(),
            vec![CustomFamilyBlockPsiDerivative {
                penalty_index: None,
                x_psi: x_ls_psi.clone().insert_axis(Axis(1)),
                s_psi: Array2::zeros((1, 1)),
                s_psi_components: None,
                s_psi_penalty_components: None,
                x_psi_psi: Some(vec![x_ls_psi_psi.clone().insert_axis(Axis(1))]),
                s_psi_psi: Some(vec![Array2::zeros((1, 1))]),
                s_psi_psi_components: None,
                s_psi_psi_penalty_components: None,
                implicit_operator: None,
                implicit_axis: 0,
                implicit_group_id: None,
            }],
        ];

        // Analytic side under test.
        let psi_terms = family
            .exact_newton_joint_psi_terms(&states, &specs, &derivative_blocks, 0)
            .expect("joint psi terms")
            .expect("expected gaussian psi terms");

        // Evaluation point for the vector oracle. The index triples passed to
        // `third_partial_derivative_vec` below treat slot 2 as ψ; presumably
        // the packing is (β_μ, β_ls, ψ) — confirm against the oracle.
        let vars = [beta_mu0, beta_ls0, 0.0_f64];
        // Reference ∂/∂ψ of the objective at ψ = 0.
        let (_, dpsi, _) = second_derivative(
            |psi| {
                gaussian_negloglik_log_sigma_psi_only_numdual(
                    psi,
                    beta_mu0,
                    beta_ls0,
                    &y,
                    &weights,
                    &x_mu0,
                    &x_ls0,
                    &x_ls_psi,
                    &x_ls_psi_psi,
                )
            },
            0.0,
        );
        // Reference mixed second derivatives in (β_μ, ψ) and (β_ls, ψ).
        let (_, _, _, score_mu_psi) = second_partial_derivative(
            |(beta_mu, psi)| {
                gaussian_negloglik_log_sigma_mu_psi_numdual(
                    beta_mu,
                    psi,
                    beta_ls0,
                    &y,
                    &weights,
                    &x_mu0,
                    &x_ls0,
                    &x_ls_psi,
                    &x_ls_psi_psi,
                )
            },
            (beta_mu0, 0.0),
        );
        let (_, _, _, score_ls_psi) = second_partial_derivative(
            |(beta_ls, psi)| {
                gaussian_negloglik_log_sigma_ls_psi_numdual(
                    beta_ls,
                    psi,
                    beta_mu0,
                    &y,
                    &weights,
                    &x_mu0,
                    &x_ls0,
                    &x_ls_psi,
                    &x_ls_psi_psi,
                )
            },
            (beta_ls0, 0.0),
        );
        // Reference third mixed derivatives; only the last tuple slot is used.
        let (_, _, _, _, _, _, _, h_mu_mu_psi) = third_partial_derivative_vec(
            |v| {
                gaussian_negloglik_log_sigma_beta_vec_numdual(
                    v,
                    &y,
                    &weights,
                    &x_mu0,
                    &x_ls0,
                    &x_ls_psi,
                    &x_ls_psi_psi,
                )
            },
            &vars,
            0,
            0,
            2,
        );
        let (_, _, _, _, _, _, _, h_mu_ls_psi) = third_partial_derivative_vec(
            |v| {
                gaussian_negloglik_log_sigma_beta_vec_numdual(
                    v,
                    &y,
                    &weights,
                    &x_mu0,
                    &x_ls0,
                    &x_ls_psi,
                    &x_ls_psi_psi,
                )
            },
            &vars,
            0,
            1,
            2,
        );
        let (_, _, _, _, _, _, _, h_ls_ls_psi) = third_partial_derivative_vec(
            |v| {
                gaussian_negloglik_log_sigma_beta_vec_numdual(
                    v,
                    &y,
                    &weights,
                    &x_mu0,
                    &x_ls0,
                    &x_ls_psi,
                    &x_ls_psi_psi,
                )
            },
            &vars,
            1,
            1,
            2,
        );

        assert!(
            (psi_terms.objective_psi - dpsi).abs() <= 1e-10,
            "Gaussian log-sigma psi objective derivative mismatch: analytic={} autodiff={}",
            psi_terms.objective_psi,
            dpsi
        );
        assert!(
            (psi_terms.score_psi[0] - score_mu_psi).abs() <= 1e-10,
            "Gaussian log-sigma psi score_mu mismatch: analytic={} autodiff={}",
            psi_terms.score_psi[0],
            score_mu_psi
        );
        assert!(
            (psi_terms.score_psi[1] - score_ls_psi).abs() <= 1e-10,
            "Gaussian log-sigma psi score_ls mismatch: analytic={} autodiff={}",
            psi_terms.score_psi[1],
            score_ls_psi
        );
        assert!(
            (psi_terms.hessian_psi[[0, 0]] - h_mu_mu_psi).abs() <= 1e-9,
            "Gaussian log-sigma psi hessian(mu,mu) mismatch: analytic={} autodiff={}",
            psi_terms.hessian_psi[[0, 0]],
            h_mu_mu_psi
        );
        // The (μ, log-σ) cross block of the analytic coefficient Hessian uses
        // Fisher information `E[H_{μ,ls}] = 2κ·E[m] = 0` (`hmu_ls[i] = 0` in
        // `gaussian_joint_psi_firstweights`; #684), so its ψ-derivative is
        // identically 0. The AD reference is the observed `∂³N/∂β_μ∂β_ls∂ψ`,
        // which carries the observed contribution `Σ_i X_μ_i · (2 m_i κ_i)·
        // X_ls,i(ψ)`. Subtracting that observed ψ-drift puts the AD reference
        // back on the same Fisher footing as the analytic block. Per-row,
        // `∂(2mκ)/∂η_ls = -2 m·P` with `P = 2κ² − κ'` (from `dm/dη_ls = -2κm`
        // and `dκ/dη_ls = κ'`), and `dX_ls/dψ = x_ls_psi`, so the chain rule
        // gives `∂(observed cross)/∂ψ = Σ_i X_μ_i·[-2 m P·z_ls_psi·X_ls,i
        // + 2 m κ·x_ls_psi,i]` with `z_ls_psi = X_ls_psi·β_ls`.
        let rows_gap =
            gaussian_jointrow_scalars(&y, &(&x_mu0 * beta_mu0), &(&x_ls0 * beta_ls0), &weights)
                .expect("gaussian row scalars for psi corrections");
        let mu_ls_psi_correction: f64 = (0..y.len())
            .map(|i| {
                let m = rows_gap.m[i];
                let k = rows_gap.kappa[i];
                let kp = rows_gap.kappa_prime[i];
                let p = 2.0 * k * k - kp;
                let xm = x_mu0[i];
                let xl = x_ls0[i];
                let xp = x_ls_psi[i];
                let z_ls_psi = xp * beta_ls0;
                // Fisher − observed = 0 − ∂(2mκ·X_ls)/∂ψ at ψ=0
                xm * (2.0 * m * p * z_ls_psi * xl - 2.0 * m * k * xp)
            })
            .sum();
        assert!(
            (psi_terms.hessian_psi[[0, 1]] - (h_mu_ls_psi + mu_ls_psi_correction)).abs() <= 1e-9,
            "Gaussian log-sigma psi hessian(mu,ls) mismatch: analytic={} reference={} (ad={} + Fisher correction={})",
            psi_terms.hessian_psi[[0, 1]],
            h_mu_ls_psi + mu_ls_psi_correction,
            h_mu_ls_psi,
            mu_ls_psi_correction
        );
        // The (ls,ls) coefficient-Hessian block uses the Fisher curvature
        // `2κ²a` (#566), so its ψ-derivative `hessian_psi[[1,1]]` is the Fisher
        // ψ-drift, while the AD reference `∂³N/∂β_ls²∂ψ` is the observed drift.
        // They differ by the ψ-derivative of the Fisher−observed coefficient
        // gap `H^gap_lsls(ψ) = Σ_i Δ_i(η_ls(ψ))·X_ls,i(ψ)²` with
        // `Δ = (a−n)·P`, `P = 2κ² − κ'`, `P' = 4κκ' − κ''`,
        // `∂Δ/∂η_ls = 2κn·P + (a−n)P'`. With `dη_ls/dψ = X_ls_psi·β_ls` and
        // `dX_ls/dψ = x_ls_psi` (the ψ-drift of the log-σ design), product rule
        // gives the per-row correction below. The η-drift is the code's own
        // `z_ls_psi = X_ls_psi·β_ls` (the η_ls induced by the design ψ-drift)
        // and the design drift is `dX_ls/dψ = x_ls_psi`. η_μ is ψ-independent.
        let ls_ls_psi_correction: f64 = (0..y.len())
            .map(|i| {
                let a = rows_gap.obs_weight[i];
                let n = rows_gap.n[i];
                let k = rows_gap.kappa[i];
                let kp = rows_gap.kappa_prime[i];
                let kdp = rows_gap.kappa_dprime[i];
                let p = 2.0 * k * k - kp;
                let p1 = 4.0 * k * kp - kdp;
                let delta = (a - n) * p;
                let ddelta_deta = 2.0 * k * n * p + (a - n) * p1;
                let x0 = x_ls0[i];
                let xp = x_ls_psi[i];
                let z_ls_psi = xp * beta_ls0; // dη_ls/dψ = X_ls_psi·β_ls
                ddelta_deta * z_ls_psi * x0 * x0 + delta * 2.0 * x0 * xp
            })
            .sum();
        assert!(
            (psi_terms.hessian_psi[[1, 1]] - (h_ls_ls_psi + ls_ls_psi_correction)).abs() <= 1e-9,
            "Gaussian log-sigma psi hessian(ls,ls) mismatch: analytic={} reference={} (ad={} + Fisher correction={})",
            psi_terms.hessian_psi[[1, 1]],
            h_ls_ls_psi + ls_ls_psi_correction,
            h_ls_ls_psi,
            ls_ls_psi_correction
        );
    }

    /// Second-order ψ check for the Gaussian location-scale family: the
    /// objective and score pieces of `exact_newton_joint_psisecond_order_terms`
    /// are compared with derivatives of the scalar `*_numdual` oracles at
    /// ψ = 0. Only the log-σ block carries ψ derivatives; the μ block's
    /// derivative list is empty.
    #[test]
    fn gaussian_log_sigma_psi_second_order_terms_match_autodiff_scalar_objective() {
        // Fixture: three observations, one column per design, plus first and
        // second ψ-derivatives of the log-σ design column.
        let y = array![0.25, -0.4, 1.1];
        let weights = array![1.0, 0.7, 1.3];
        let x_mu0 = array![1.0, -0.35, 0.6];
        let x_ls0 = array![0.8, -0.25, 0.45];
        let x_ls_psi = array![0.2, -0.15, 0.1];
        let x_ls_psi_psi = array![0.05, -0.03, 0.04];
        let beta_mu0 = 0.35_f64;
        let beta_ls0 = -0.2_f64;

        // Lifts a length-n vector to an n×1 column matrix.
        let as_column = |v: &Array1<f64>| v.clone().insert_axis(Axis(1));
        let mu_column = as_column(&x_mu0);
        let ls_column = as_column(&x_ls0);

        let family = GaussianLocationScaleFamily {
            y: y.clone(),
            weights: weights.clone(),
            mu_design: Some(DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(
                mu_column.clone(),
            ))),
            log_sigma_design: Some(DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(
                ls_column.clone(),
            ))),
            policy: crate::resource::ResourcePolicy::default_library(),
            cached_row_scalars: std::sync::RwLock::new(None),
        };
        let specs = vec![
            gaussian_psi_test_spec("mu", mu_column.clone()),
            gaussian_psi_test_spec("log_sigma", ls_column.clone()),
        ];

        let mu_state = ParameterBlockState {
            beta: array![beta_mu0],
            eta: mu_column.column(0).to_owned() * beta_mu0,
        };
        let ls_state = ParameterBlockState {
            beta: array![beta_ls0],
            eta: ls_column.column(0).to_owned() * beta_ls0,
        };
        let states = vec![mu_state, ls_state];

        // Design-only ψ derivative for the log-σ block (zero penalty parts).
        let ls_psi_derivative = CustomFamilyBlockPsiDerivative {
            penalty_index: None,
            x_psi: as_column(&x_ls_psi),
            s_psi: Array2::zeros((1, 1)),
            s_psi_components: None,
            s_psi_penalty_components: None,
            x_psi_psi: Some(vec![as_column(&x_ls_psi_psi)]),
            s_psi_psi: Some(vec![Array2::zeros((1, 1))]),
            s_psi_psi_components: None,
            s_psi_psi_penalty_components: None,
            implicit_operator: None,
            implicit_axis: 0,
            implicit_group_id: None,
        };
        let derivative_blocks = vec![Vec::new(), vec![ls_psi_derivative]];

        // Analytic side under test.
        let psi2_terms = family
            .exact_newton_joint_psisecond_order_terms(&states, &specs, &derivative_blocks, 0, 0)
            .expect("joint psi psi terms")
            .expect("expected gaussian psi psi terms");

        // Reference ∂²/∂ψ² of the objective at ψ = 0.
        let (_, _, d2psi) = second_derivative(
            |psi| {
                gaussian_negloglik_log_sigma_psi_only_numdual(
                    psi,
                    beta_mu0,
                    beta_ls0,
                    &y,
                    &weights,
                    &x_mu0,
                    &x_ls0,
                    &x_ls_psi,
                    &x_ls_psi_psi,
                )
            },
            0.0,
        );

        // Reference third mixed derivative with indices (coef_slot, 2, 2) of
        // the vector oracle; slot 2 of `vars` is the ψ entry (set to 0).
        let vars = [beta_mu0, beta_ls0, 0.0_f64];
        let score_psi_psi_reference = |coef_slot: usize| {
            let (.., mixed) = third_partial_derivative_vec(
                |v| {
                    gaussian_negloglik_log_sigma_beta_vec_numdual(
                        v,
                        &y,
                        &weights,
                        &x_mu0,
                        &x_ls0,
                        &x_ls_psi,
                        &x_ls_psi_psi,
                    )
                },
                &vars,
                coef_slot,
                2,
                2,
            );
            mixed
        };
        let score_mu_psi_psi = score_psi_psi_reference(0);
        let score_ls_psi_psi = score_psi_psi_reference(1);

        // Shared comparison; `label` fills the varying part of the message.
        let expect_close = |label: &str, analytic: f64, autodiff: f64, tol: f64| {
            assert!(
                (analytic - autodiff).abs() <= tol,
                "Gaussian log-sigma psi second {} mismatch: analytic={} autodiff={}",
                label,
                analytic,
                autodiff
            );
        };
        expect_close("objective", psi2_terms.objective_psi_psi, d2psi, 1e-10);
        expect_close(
            "score_mu",
            psi2_terms.score_psi_psi[0],
            score_mu_psi_psi,
            1e-9,
        );
        expect_close(
            "score_ls",
            psi2_terms.score_psi_psi[1],
            score_ls_psi_psi,
            1e-9,
        );
    }

    // Sibling scalar oracle in which the μ design depends on ψ as well as the
    // log-σ design (the original oracle leaves μ fixed in ψ). Used by the
    // joint psi-second-order guardrail.
    //
    // Each design entry is the quadratic `x0 + ψ·x_psi + ½·ψ²·x_psi_psi`; per
    // row, σ = LOGB_SIGMA_FLOOR + exp(η_ls) and the contribution is
    // `w·(½·((y − η_μ)/σ)² + ln σ)`. Returns the sum over rows.
    fn gaussian_negloglik_log_sigma_psi_full_numdual<D: DualNum<f64> + Copy>(
        beta_mu: D,
        beta_ls: D,
        psi: D,
        y: &Array1<f64>,
        weights: &Array1<f64>,
        x_mu0: &Array1<f64>,
        x_ls0: &Array1<f64>,
        x_mu_psi: &Array1<f64>,
        x_ls_psi: &Array1<f64>,
        x_mu_psi_psi: &Array1<f64>,
        x_ls_psi_psi: &Array1<f64>,
    ) -> D {
        let half = D::from(0.5);
        let floor = D::from(LOGB_SIGMA_FLOOR);
        // Second-order expansion of one design entry around ψ = 0.
        let design_entry = |base: f64, first: f64, second: f64| -> D {
            D::from(base) + psi * D::from(first) + half * psi * psi * D::from(second)
        };
        (0..y.len()).fold(D::zero(), |total, row| {
            let eta_mu = design_entry(x_mu0[row], x_mu_psi[row], x_mu_psi_psi[row]) * beta_mu;
            let eta_ls = design_entry(x_ls0[row], x_ls_psi[row], x_ls_psi_psi[row]) * beta_ls;
            let sigma = floor + eta_ls.exp();
            let resid = D::from(y[row]) - eta_mu;
            total + D::from(weights[row]) * (half * (resid / sigma).powi(2) + sigma.ln())
        })
    }

    // Scalar oracle for multi-column dense designs with coefficient vectors.
    // Used by the joint static-Hessian guardrail and its directional
    // derivatives.
    //
    // Per row: η_μ = Σ_k xmu[i,k]·β_μ[k], η_ls = Σ_k x_ls[i,k]·β_ls[k],
    // σ = LOGB_SIGMA_FLOOR + exp(η_ls), contribution
    // `w·(½·((y − η_μ)/σ)² + ln σ)`. Returns the sum over rows.
    fn gaussian_negloglik_logb_dense_numdual<D: DualNum<f64> + Copy>(
        beta_mu: &[D],
        beta_ls: &[D],
        y: &Array1<f64>,
        weights: &Array1<f64>,
        xmu: &Array2<f64>,
        x_ls: &Array2<f64>,
    ) -> D {
        // Row `row` of `design` dotted with `beta`, accumulated left to right
        // over the coefficient indices.
        let linear_predictor = |design: &Array2<f64>, beta: &[D], row: usize| -> D {
            beta.iter()
                .enumerate()
                .fold(D::zero(), |acc, (col, &coef)| {
                    acc + D::from(design[[row, col]]) * coef
                })
        };

        let half = D::from(0.5);
        let floor = D::from(LOGB_SIGMA_FLOOR);
        (0..y.len()).fold(D::zero(), |total, row| {
            let eta_mu = linear_predictor(xmu, beta_mu, row);
            let eta_ls = linear_predictor(x_ls, beta_ls, row);
            let sigma = floor + eta_ls.exp();
            let resid = D::from(y[row]) - eta_mu;
            total + D::from(weights[row]) * (half * (resid / sigma).powi(2) + sigma.ln())
        })
    }

    fn gaussian_logb_design_test_data() -> (
        Array1<f64>,
        Array1<f64>,
        Array2<f64>,
        Array2<f64>,
        Array1<f64>,
        Array1<f64>,
    ) {
        // n=5, two-column designs (intercept + smooth feature). β_ls0 chosen so
        // that η_ls ≈ −0.4 on the central row → κ ≈ 0.985, which is noticeably
        // less than 1 so κ' chain-rule contributions register at strict tolerance.
        let y = array![0.25, -0.4, 1.1, 0.05, -0.2];
        let weights = array![1.0, 0.7, 1.3, 0.9, 1.1];
        let xmu = ndarray::arr2(&[[1.0, -0.6], [1.0, -0.2], [1.0, 0.1], [1.0, 0.4], [1.0, 0.7]]);
        let x_ls = ndarray::arr2(&[[1.0, 0.5], [1.0, -0.1], [1.0, 0.3], [1.0, -0.4], [1.0, 0.2]]);
        // β_ls = (−0.4, 0.05): η_ls hovers around −0.4, so σ ≈ 0.68 and κ ≈ 0.985.
        let beta_mu = array![0.35, -0.25];
        let beta_ls = array![-0.4, 0.05];
        (y, weights, xmu, x_ls, beta_mu, beta_ls)
    }

    #[test]
    fn gaussian_joint_static_hessian_matches_autodiff() {
        let (y, weights, xmu, x_ls, beta_mu, beta_ls) = gaussian_logb_design_test_data();
        let etamu = xmu.dot(&beta_mu);
        let eta_ls = x_ls.dot(&beta_ls);

        let rows =
            gaussian_jointrow_scalars(&y, &etamu, &eta_ls, &weights).expect("gaussian row scalars");
        let weights0 = gaussian_joint_psi_firstweights(
            &rows,
            &Array1::zeros(y.len()),
            &Array1::zeros(y.len()),
        );
        let xmu_dense = DenseOrOperator::Borrowed(&xmu);
        let xls_dense = DenseOrOperator::Borrowed(&x_ls);
        let analytic = gaussian_joint_hessian_from_designs(
            &xmu_dense,
            &xls_dense,
            &weights0.hmumu,
            &weights0.hmu_ls,
            &weights0.h_ls_ls,
        )
        .expect("gaussian joint static hessian from designs");

        // AD ground truth: full p×p Hessian via second_partial_derivative,
        // packing β_full = (β_μ, β_ls) and stepping (i, j) pairs.
        let pmu = beta_mu.len();
        let p_ls = beta_ls.len();
        let total = pmu + p_ls;
        let mut beta_full = vec![0.0_f64; total];
        for k in 0..pmu {
            beta_full[k] = beta_mu[k];
        }
        for k in 0..p_ls {
            beta_full[pmu + k] = beta_ls[k];
        }

        // AD ground truth: full p×p Hessian. Diagonal (i==i) via second_derivative
        // (1D second derivative); off-diagonal (i<j) via second_partial_derivative
        // on a closure that injects two HyperDual variables into β.
        let mut ad = Array2::<f64>::zeros((total, total));
        for i in 0..total {
            for j in i..total {
                let val = if i == j {
                    let g = |x: num_dual::Dual2<f64, f64>| {
                        let mut bm = vec![num_dual::Dual2::from_re(0.0); pmu];
                        let mut bl = vec![num_dual::Dual2::from_re(0.0); p_ls];
                        for k in 0..pmu {
                            bm[k] = num_dual::Dual2::from_re(beta_full[k]);
                        }
                        for k in 0..p_ls {
                            bl[k] = num_dual::Dual2::from_re(beta_full[pmu + k]);
                        }
                        if i < pmu {
                            bm[i] = x;
                        } else {
                            bl[i - pmu] = x;
                        }
                        gaussian_negloglik_logb_dense_numdual(&bm, &bl, &y, &weights, &xmu, &x_ls)
                    };
                    let (_, _, d2) = second_derivative(g, beta_full[i]);
                    d2
                } else {
                    let f =
                        |(a, b): (num_dual::HyperDual<f64, f64>, num_dual::HyperDual<f64, f64>)| {
                            let mut bm = vec![num_dual::HyperDual::from_re(0.0); pmu];
                            let mut bl = vec![num_dual::HyperDual::from_re(0.0); p_ls];
                            for k in 0..pmu {
                                bm[k] = num_dual::HyperDual::from_re(beta_full[k]);
                            }
                            for k in 0..p_ls {
                                bl[k] = num_dual::HyperDual::from_re(beta_full[pmu + k]);
                            }
                            if i < pmu {
                                bm[i] = a;
                            } else {
                                bl[i - pmu] = a;
                            }
                            if j < pmu {
                                bm[j] = b;
                            } else {
                                bl[j - pmu] = b;
                            }
                            gaussian_negloglik_logb_dense_numdual(
                                &bm, &bl, &y, &weights, &xmu, &x_ls,
                            )
                        };
                    let (_, _, _, d2xy) =
                        second_partial_derivative(f, (beta_full[i], beta_full[j]));
                    d2xy
                };
                ad[[i, j]] = val;
                if i != j {
                    ad[[j, i]] = val;
                }
            }
        }

        // Both the (log-σ, log-σ) and (μ, log-σ) blocks ship the Fisher/expected
        // information by deliberate design (#566 / #684): the score stays the
        // exact observed gradient so the joint Newton lands on the true MLE, but
        // the curvature feeding the REML log-determinant / EDF is the
        // expectation, exactly as gamlss/mgcv `gaulss` Fisher-scores the scale
        // channel and as `gaussian_joint_psi_firstweights` already pins
        // (`hmu_ls = 0`, `h_ls_ls = 2κ²a`). The AD reference computes the
        // observed Hessian, so on each Fisher-replaced block the analytic value
        // differs from AD by the per-row amount `fisher − observed`. We add
        // those exact, separately derived corrections to the AD observed
        // Hessian so the comparison both
        //   (a) validates the AD machinery against the analytic mean blocks,
        //   (b) pins each analytic Fisher block to its closed form via a
        //       non-circular `observed + (Fisher − observed)` reference.
        let mut reference = ad.clone();
        let fisher_minus_observed_ls_ls: Array1<f64> = Array1::from_shape_fn(y.len(), |i| {
            let a = rows.obs_weight[i];
            let n = rows.n[i];
            let k = rows.kappa[i];
            let kp = rows.kappa_prime[i];
            let fisher = 2.0 * k * k * a;
            let observed = 2.0 * k * k * n + kp * (a - n);
            fisher - observed
        });
        let ls_correction = x_ls
            .t()
            .dot(&Array2::from_diag(&fisher_minus_observed_ls_ls).dot(&x_ls));
        for a in 0..p_ls {
            for b in 0..p_ls {
                reference[[pmu + a, pmu + b]] += ls_correction[[a, b]];
            }
        }
        // (μ, log-σ) cross block: observed ∂²ℓ/∂η_μ∂η_ls = 2 m κ (zero in
        // expectation since E[m] = 0 under correct model), Fisher = 0.
        // Correction = fisher − observed = −2 m κ.
        let fisher_minus_observed_mu_ls: Array1<f64> = Array1::from_shape_fn(y.len(), |i| {
            let m = rows.m[i];
            let k = rows.kappa[i];
            -2.0 * m * k
        });
        let mu_ls_correction = xmu
            .t()
            .dot(&Array2::from_diag(&fisher_minus_observed_mu_ls).dot(&x_ls));
        for a in 0..pmu {
            for b in 0..p_ls {
                reference[[a, pmu + b]] += mu_ls_correction[[a, b]];
                reference[[pmu + b, a]] += mu_ls_correction[[a, b]];
            }
        }

        for i in 0..total {
            for j in 0..total {
                let diff = (analytic[[i, j]] - reference[[i, j]]).abs();
                assert!(
                    diff <= 1e-10,
                    "Gaussian static joint H[{i},{j}] mismatch (κ < 1 case): analytic={} reference={} (ad={}) diff={}",
                    analytic[[i, j]],
                    reference[[i, j]],
                    ad[[i, j]],
                    diff
                );
            }
        }
        // Symmetry guardrail: floating-point skew must be at the noise floor.
        let skew = (&analytic - &analytic.t())
            .mapv(f64::abs)
            .fold(0.0_f64, |acc, &v| acc.max(v));
        assert!(
            skew <= 1e-12,
            "Gaussian static joint Hessian skew exceeds noise floor: {skew}"
        );
    }

    #[test]
    fn gaussian_joint_first_directional_hessian_matches_autodiff() {
        // Compares the analytic first-directional joint Hessian, assembled from
        // `gaussian_joint_first_directionalweights`, against a num-dual third
        // partial derivative of the dense negative log-likelihood oracle.
        type Hhd = num_dual::HyperHyperDual<f64, f64>;

        let (y, weights, xmu, x_ls, beta_mu, beta_ls) = gaussian_logb_design_test_data();
        let pmu = beta_mu.len();
        let p_ls = beta_ls.len();
        let total = pmu + p_ls;

        let etamu = xmu.dot(&beta_mu);
        let eta_ls = x_ls.dot(&beta_ls);

        // Joint direction v over β = (β_μ, β_ls) and its per-row images.
        let v: Array1<f64> = Array1::from_shape_fn(total, |k| 0.13 + 0.07 * (k as f64));
        let ximu = xmu.dot(&v.slice(s![0..pmu]).to_owned());
        let xi_ls = x_ls.dot(&v.slice(s![pmu..total]).to_owned());

        // Analytic side: directional weights pushed through the designs.
        let rows =
            gaussian_jointrow_scalars(&y, &etamu, &eta_ls, &weights).expect("gaussian row scalars");
        let (dhmumu, dhmu_ls, dh_ls_ls) =
            gaussian_joint_first_directionalweights(&rows, &ximu, &xi_ls);
        let analytic = gaussian_joint_hessian_from_designs(
            &DenseOrOperator::Borrowed(&xmu),
            &DenseOrOperator::Borrowed(&x_ls),
            &dhmumu,
            &dhmu_ls,
            &dh_ls_ls,
        )
        .expect("gaussian joint first-directional H from designs");

        // AD side: augmented variable vector [β_μ, β_ls, ε] of length
        // total + 1 with ε = 0; the oracle is evaluated at β + ε·v and we read
        // ∂³N/∂β_i ∂β_j ∂ε from `third_partial_derivative_vec`.
        let vars: Vec<f64> = beta_mu
            .iter()
            .chain(beta_ls.iter())
            .copied()
            .chain(std::iter::once(0.0))
            .collect();

        let objective = |z: &[Hhd]| {
            // The last slot carries ε; the first `total` slots carry β.
            let eps = z[total];
            let bm: Vec<Hhd> = (0..pmu)
                .map(|k| z[k] + eps * Hhd::from_re(v[k]))
                .collect();
            let bl: Vec<Hhd> = (pmu..total)
                .map(|k| z[k] + eps * Hhd::from_re(v[k]))
                .collect();
            gaussian_negloglik_logb_dense_numdual(&bm, &bl, &y, &weights, &xmu, &x_ls)
        };

        // Only the upper triangle is differentiated; the lower one is mirrored.
        let mut ad = Array2::<f64>::zeros((total, total));
        for i in 0..total {
            for j in i..total {
                let (.., d3) = third_partial_derivative_vec(objective, &vars, i, j, total);
                ad[[i, j]] = d3;
                if i != j {
                    ad[[j, i]] = d3;
                }
            }
        }

        // The (ls,ls) block is the Fisher curvature `2κ²a` (#566) rather than
        // the observed `2κ²n + κ'(a−n)` that AD differentiates. The per-row
        // Fisher−observed gap is `Δ = (a−n)·P`, `P = 2κ² − κ'` (η_ls only).
        // With G = a−n, ∂G/∂η_μ = 2m, ∂G/∂η_ls = 2κn and P' = 4κκ' − κ'',
        // its directional derivative along (ξ_μ, ξ_ls) is
        //   dΔ = 2m·P·ξ_μ + (2κn·P + (a−n)·P')·ξ_ls.
        // Adding X_lsᵀ diag(dΔ) X_ls to the (ls,ls) block of the AD result
        // yields the Fisher reference; the other blocks are left as AD gave them.
        let d_fisher_minus_observed: Array1<f64> = Array1::from_shape_fn(y.len(), |row| {
            let obs_w = rows.obs_weight[row];
            let n_row = rows.n[row];
            let m_row = rows.m[row];
            let kappa = rows.kappa[row];
            let kappa_p = rows.kappa_prime[row];
            let kappa_pp = rows.kappa_dprime[row];
            let p = 2.0 * kappa * kappa - kappa_p;
            let p1 = 4.0 * kappa * kappa_p - kappa_pp;
            2.0 * m_row * p * ximu[row]
                + (2.0 * kappa * n_row * p + (obs_w - n_row) * p1) * xi_ls[row]
        });
        let weighted_x_ls = Array2::from_diag(&d_fisher_minus_observed).dot(&x_ls);
        let ls_correction = x_ls.t().dot(&weighted_x_ls);

        let mut reference = ad.clone();
        {
            let mut ls_block = reference.slice_mut(s![pmu..total, pmu..total]);
            ls_block += &ls_correction;
        }

        for ((i, j), &expected) in reference.indexed_iter() {
            let diff = (analytic[[i, j]] - expected).abs();
            assert!(
                diff <= 1e-10,
                "Gaussian dH (first-directional) [{i},{j}] mismatch: analytic={} reference={} (ad={}) diff={}",
                analytic[[i, j]],
                expected,
                ad[[i, j]],
                diff
            );
        }

        // Largest absolute asymmetry of the analytic matrix.
        let skew = (&analytic - &analytic.t())
            .iter()
            .fold(0.0_f64, |acc, d| acc.max(d.abs()));
        assert!(
            skew <= 1e-12,
            "Gaussian first-directional dH skew exceeds noise floor: {skew}"
        );
    }

    #[test]
    fn gaussian_joint_second_directional_hessian_matches_autodiff() {
        // Compares the analytic second-directional joint Hessian, assembled
        // from `gaussian_jointsecond_directionalweights`, against a reference
        // made of an AD third partial plus one central finite difference.
        type Hhd = num_dual::HyperHyperDual<f64, f64>;

        let (y, weights, xmu, x_ls, beta_mu, beta_ls) = gaussian_logb_design_test_data();
        let pmu = beta_mu.len();
        let p_ls = beta_ls.len();
        let total = pmu + p_ls;

        let etamu = xmu.dot(&beta_mu);
        let eta_ls = x_ls.dot(&beta_ls);

        // Two joint directions u, v and their per-row images under each design.
        let u: Array1<f64> = Array1::from_shape_fn(total, |k| 0.18 - 0.05 * (k as f64));
        let v: Array1<f64> = Array1::from_shape_fn(total, |k| -0.11 + 0.09 * (k as f64));
        let ximu_u = xmu.dot(&u.slice(s![0..pmu]).to_owned());
        let xi_ls_u = x_ls.dot(&u.slice(s![pmu..total]).to_owned());
        let ximuv = xmu.dot(&v.slice(s![0..pmu]).to_owned());
        let xi_lsv = x_ls.dot(&v.slice(s![pmu..total]).to_owned());

        let rows =
            gaussian_jointrow_scalars(&y, &etamu, &eta_ls, &weights).expect("gaussian row scalars");
        let (d2hmumu, d2hmu_ls, d2h_ls_ls) =
            gaussian_jointsecond_directionalweights(&rows, &ximu_u, &xi_ls_u, &ximuv, &xi_lsv);
        let analytic = gaussian_joint_hessian_from_designs(
            &DenseOrOperator::Borrowed(&xmu),
            &DenseOrOperator::Borrowed(&x_ls),
            &d2hmumu,
            &d2hmu_ls,
            &d2h_ls_ls,
        )
        .expect("gaussian joint second-directional H from designs");

        // Reference for ∂⁴N/∂β_i ∂β_j ∂ε_u ∂ε_v at (ε_u, ε_v) = (0, 0): the
        // third partial in (β_i, β_j, ε_u) comes from num-dual, and the ε_v
        // derivative is a central difference of it at β ± h·v. The augmented
        // AD vector is [β_μ ; β_ls ; ε_u] of length total + 1 with ε_u = 0;
        // ε_v never enters the AD variables.
        let vars_base: Vec<f64> = beta_mu
            .iter()
            .chain(beta_ls.iter())
            .copied()
            .chain(std::iter::once(0.0))
            .collect();

        let h = 1e-4;
        // Oracle at β + ε_u·u ± h·v; `forward` picks the sign of the h·v shift.
        let shifted_objective = |z: &[Hhd], forward: bool| -> Hhd {
            let eps_u = z[total];
            let coord = |k: usize| {
                let along_u = z[k] + eps_u * Hhd::from_re(u[k]);
                let offset = Hhd::from_re(h * v[k]);
                if forward {
                    along_u + offset
                } else {
                    along_u - offset
                }
            };
            let bm: Vec<Hhd> = (0..pmu).map(&coord).collect();
            let bl: Vec<Hhd> = (pmu..total).map(&coord).collect();
            gaussian_negloglik_logb_dense_numdual(&bm, &bl, &y, &weights, &xmu, &x_ls)
        };

        // Upper triangle only; the lower triangle is mirrored.
        let mut ad = Array2::<f64>::zeros((total, total));
        for i in 0..total {
            for j in i..total {
                let (.., d3_plus) = third_partial_derivative_vec(
                    |z: &[Hhd]| shifted_objective(z, true),
                    &vars_base,
                    i,
                    j,
                    total,
                );
                let (.., d3_minus) = third_partial_derivative_vec(
                    |z: &[Hhd]| shifted_objective(z, false),
                    &vars_base,
                    i,
                    j,
                    total,
                );
                let val = (d3_plus - d3_minus) / (2.0 * h);
                ad[[i, j]] = val;
                if i != j {
                    ad[[j, i]] = val;
                }
            }
        }

        // (ls,ls) is the Fisher curvature `2κ²a` (#566); AD differentiates the
        // observed `2κ²n + κ'(a−n)`. The second-directional correction is the
        // η-Hessian of the Fisher−observed gap `Δ = (a−n)·P`, `P = 2κ² − κ'`
        // (η_ls only), contracted with the two η-directions. With G = a−n
        // (G' = ∂_η_ls G = 2κn, G'' = 2n(κ'−2κ²), ∂_η_μ G = 2m, ∂²_η_μ G = −2w,
        // ∂²_{η_μ,η_ls} G = −4κm), P' = 4κκ' − κ'' and
        // P'' = 6κ'² + κ''(6κ − 1) (using κ''' = κ''(1−2κ) − 2κ'²):
        //   Δ_{μμ}  = −2w·P
        //   Δ_{μls} = 2m(P' − 2κP)
        //   Δ_{lsls}= G''·P + 2G'·P' + G·P''
        // Both η-directions are linear in β, so the second directional
        // derivative is the bilinear contraction below. It is added to the
        // (ls,ls) block of the AD result; the other blocks stay as computed.
        let d2_fisher_minus_observed: Array1<f64> = Array1::from_shape_fn(y.len(), |row| {
            let obs_w = rows.obs_weight[row];
            let n_row = rows.n[row];
            let m_row = rows.m[row];
            let w_row = rows.w[row];
            let kappa = rows.kappa[row];
            let kappa_p = rows.kappa_prime[row];
            let kappa_pp = rows.kappa_dprime[row];
            let gap = obs_w - n_row;
            let p = 2.0 * kappa * kappa - kappa_p;
            let p1 = 4.0 * kappa * kappa_p - kappa_pp;
            let p2 = 6.0 * kappa_p * kappa_p + kappa_pp * (6.0 * kappa - 1.0);
            let g1 = 2.0 * kappa * n_row;
            let g2 = 2.0 * n_row * (kappa_p - 2.0 * kappa * kappa);
            let d_mumu = -2.0 * w_row * p;
            let d_muls = 2.0 * m_row * (p1 - 2.0 * kappa * p);
            let d_lsls = g2 * p + 2.0 * g1 * p1 + gap * p2;
            d_mumu * ximu_u[row] * ximuv[row]
                + d_muls * (ximu_u[row] * xi_lsv[row] + xi_ls_u[row] * ximuv[row])
                + d_lsls * xi_ls_u[row] * xi_lsv[row]
        });
        let weighted_x_ls = Array2::from_diag(&d2_fisher_minus_observed).dot(&x_ls);
        let ls_correction = x_ls.t().dot(&weighted_x_ls);

        let mut reference = ad.clone();
        {
            let mut ls_block = reference.slice_mut(s![pmu..total, pmu..total]);
            ls_block += &ls_correction;
        }

        // Tolerance: the fourth-order reference stacks one FD step on an AD
        // third partial, so it is relaxed from 1e-10 to a value compatible
        // with the central-difference truncation (O(h²) ≈ 1e-8) and the
        // rounding floor of the AD third partial (≈ 1e-10 / h ≈ 1e-6).
        let tol = 5e-6;
        for ((i, j), &expected) in reference.indexed_iter() {
            let diff = (analytic[[i, j]] - expected).abs();
            assert!(
                diff <= tol,
                "Gaussian d2H (second-directional) [{i},{j}] mismatch: analytic={} reference={} (ad={}) diff={}",
                analytic[[i, j]],
                expected,
                ad[[i, j]],
                diff
            );
        }

        // Largest absolute asymmetry of the analytic matrix.
        let skew = (&analytic - &analytic.t())
            .iter()
            .fold(0.0_f64, |acc, d| acc.max(d.abs()));
        assert!(
            skew <= 1e-10,
            "Gaussian second-directional d2H skew exceeds noise floor: {skew}"
        );
    }

    #[test]
    fn gaussian_joint_psi_second_order_terms_match_autodiff() {
        // ψ-coupled scenario: both linear predictors move with ψ through
        // per-row first- and second-order design drifts, each scaled by a
        // single scalar coefficient.
        let y = array![0.25, -0.4, 1.1, 0.05, -0.2];
        let weights = array![1.0, 0.7, 1.3, 0.9, 1.1];
        let x_mu0 = array![1.0, -0.35, 0.6, 0.1, 0.45];
        let x_ls0 = array![0.8, -0.25, 0.45, -0.1, 0.3];
        let x_mu_psi = array![0.2, 0.15, -0.1, 0.05, 0.3];
        let x_ls_psi = array![0.18, -0.12, 0.25, -0.2, 0.07];
        let x_mu_psi_psi = array![0.04, -0.03, 0.05, 0.06, -0.02];
        let x_ls_psi_psi = array![0.05, -0.03, 0.04, 0.07, -0.04];
        // NOTE(review): the coefficients are presumably picked to land in the
        // κ < 1 regime named in the assertion message — not verifiable from
        // this test alone.
        let beta_mu0 = 0.35_f64;
        let beta_ls0 = -0.4_f64;

        // Per-row predictors and ψ-drifts: each design column times its scalar β.
        let scaled = |column: &Array1<f64>, beta: f64| column * beta;
        let etamu = scaled(&x_mu0, beta_mu0);
        let eta_ls = scaled(&x_ls0, beta_ls0);
        let zmu_psi = scaled(&x_mu_psi, beta_mu0);
        let z_ls_psi = scaled(&x_ls_psi, beta_ls0);
        let zmu_psi_psi = scaled(&x_mu_psi_psi, beta_mu0);
        let z_ls_psi_psi = scaled(&x_ls_psi_psi, beta_ls0);

        // Analytic side: sum of the per-row ψψ objective terms. The same
        // first-order drift is passed for both ψ slots (single-ψ case).
        let rows =
            gaussian_jointrow_scalars(&y, &etamu, &eta_ls, &weights).expect("gaussian row scalars");
        let analytic = gaussian_joint_psisecondweights(
            &rows,
            &zmu_psi,
            &z_ls_psi,
            &zmu_psi,
            &z_ls_psi,
            &zmu_psi_psi,
            &z_ls_psi_psi,
        )
        .objective_psi_psirow
        .sum();

        // AD side: second derivative in ψ of the full ψ-dependent oracle at
        // ψ = 0, with both coefficients held constant.
        let oracle = |psi: num_dual::Dual2<f64, f64>| {
            gaussian_negloglik_log_sigma_psi_full_numdual(
                num_dual::Dual2::from_re(beta_mu0),
                num_dual::Dual2::from_re(beta_ls0),
                psi,
                &y,
                &weights,
                &x_mu0,
                &x_ls0,
                &x_mu_psi,
                &x_ls_psi,
                &x_mu_psi_psi,
                &x_ls_psi_psi,
            )
        };
        let (_, _, ad) = second_derivative(oracle, 0.0);

        let diff = (analytic - ad).abs();
        assert!(
            diff <= 1e-10,
            "Gaussian joint ψ-ψ objective mismatch (κ < 1, μ and σ both ψ-dependent): analytic={} ad={} diff={}",
            analytic,
            ad,
            diff
        );
    }

    #[test]
    fn wiggle_family_block_hessians_match_jointhessian_principal_blocks() {
        // For intercept-only threshold / log-σ blocks plus a wiggle block,
        // each per-block ExactNewton Hessian from `evaluate` is compared with
        // the matching diagonal block of `exact_newton_joint_hessian`.
        let n = 7usize;
        let y = array![0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0];
        let weights = Array1::<f64>::ones(n);

        let threshold_design = intercept_block(n).design.clone();
        let log_sigma_design = intercept_block(n).design.clone();
        let q_seed = Array1::linspace(-1.4, 1.4, n);
        let (wiggle_block, knots) = BinomialLocationScaleWiggleFamily::buildwiggle_block_input(
            q_seed.view(),
            3,
            4,
            2,
            false,
        )
        .expect("wiggle block");

        let family = BinomialLocationScaleWiggleFamily {
            y: y.clone(),
            weights: weights.clone(),
            link_kind: InverseLink::Standard(StandardLink::Probit),
            threshold_design: Some(threshold_design.clone()),
            log_sigma_design: Some(log_sigma_design.clone()),
            wiggle_knots: knots,
            wiggle_degree: 3,
            policy: crate::resource::ResourcePolicy::default_library(),
        };

        // Coefficients and linear predictors; the wiggle predictor is built on
        // the q0 reported by the core evaluated without a wiggle term.
        let beta_t = array![0.25];
        let beta_ls = array![-0.15];
        let betaw = Array1::from_elem(wiggle_block.design.ncols(), 0.04);
        let eta_t = threshold_design.matrixvectormultiply(&beta_t);
        let eta_ls = log_sigma_design.matrixvectormultiply(&beta_ls);
        let core_for_q0 =
            binomial_location_scale_core(&y, &weights, &eta_t, &eta_ls, None, &family.link_kind)
                .expect("core q0");
        let wiggle_basis = family
            .wiggle_design(core_for_q0.q0.view())
            .expect("wiggle design");
        let etaw = wiggle_basis.dot(&betaw);

        let states = vec![
            ParameterBlockState {
                beta: beta_t.clone(),
                eta: eta_t,
            },
            ParameterBlockState {
                beta: beta_ls.clone(),
                eta: eta_ls,
            },
            ParameterBlockState {
                beta: betaw.clone(),
                eta: etaw,
            },
        ];

        let eval = family.evaluate(&states).expect("evaluate wiggle family");
        let joint = family
            .exact_newton_joint_hessian(&states)
            .expect("joint hessian")
            .expect("expected joint exact hessian");

        // Half-open column ranges of the three blocks inside the joint matrix.
        let beta_layout = GamlssBetaLayout::withwiggle(beta_t.len(), beta_ls.len(), betaw.len());
        let ls_start = beta_layout.pt;
        let w_start = ls_start + beta_layout.pls;
        let w_end = w_start + beta_layout.pw;
        let ranges = [(0usize, ls_start), (ls_start, w_start), (w_start, w_end)];

        for (block_idx, (start, end)) in ranges.into_iter().enumerate() {
            let working_set = &eval.blockworking_sets[block_idx];
            let blockhessian = match working_set {
                BlockWorkingSet::Diagonal { .. } => panic!("expected exact newton block"),
                BlockWorkingSet::ExactNewton { hessian, .. } => hessian.to_dense(),
            };
            let joint_block = joint.slice(s![start..end, start..end]).to_owned();
            let label = format!("wiggle block {block_idx} principal block");
            crate::test_support::assert_matrix_derivativefd(
                &joint_block,
                &blockhessian,
                1e-10,
                &label,
            );
        }
    }

    /// Build the nontrivial-design BLS Wiggle family + designs + wiggle block
    /// shared by the FD-gradient and FD-joint-Hessian tests below.
    fn wiggle_nontrivial_fixture() -> (
        BinomialLocationScaleWiggleFamily,
        DesignMatrix,
        DesignMatrix,
        ParameterBlockInput,
        Array1<f64>,
        Array1<f64>,
    ) {
        let n = 9usize;
        let y = Array1::from_vec(vec![0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0]);
        let weights = Array1::from_vec(vec![1.0; n]);
        let t_grid = Array1::linspace(0.0, 1.0, n);
        let threshold_x = Array2::from_shape_fn((n, 3), |(i, j)| match j {
            0 => 1.0,
            1 => t_grid[i] - 0.5,
            2 => (2.0 * std::f64::consts::PI * t_grid[i]).sin(),
            _ => unreachable!(),
        });
        let log_sigma_x = Array2::from_shape_fn((n, 2), |(i, j)| match j {
            0 => 1.0,
            1 => (3.0 * std::f64::consts::PI * t_grid[i]).cos(),
            _ => unreachable!(),
        });
        let threshold_design =
            DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(threshold_x.clone()));
        let log_sigma_design =
            DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(log_sigma_x.clone()));
        let q_seed = Array1::linspace(-1.3, 1.1, n);
        let (wiggle_block, knots) = BinomialLocationScaleWiggleFamily::buildwiggle_block_input(
            q_seed.view(),
            3,
            4,
            2,
            false,
        )
        .expect("wiggle block");
        let family = BinomialLocationScaleWiggleFamily {
            y: y.clone(),
            weights: weights.clone(),
            link_kind: InverseLink::Standard(StandardLink::Probit),
            threshold_design: Some(threshold_design.clone()),
            log_sigma_design: Some(log_sigma_design.clone()),
            wiggle_knots: knots,
            wiggle_degree: 3,
            policy: crate::resource::ResourcePolicy::default_library(),
        };
        (
            family,
            threshold_design,
            log_sigma_design,
            wiggle_block,
            y,
            weights,
        )
    }

    /// Assemble the `[threshold, log-σ, wiggle]` block states for the given
    /// coefficients.
    ///
    /// The threshold and log-σ predictors come from their designs; the wiggle
    /// predictor is the wiggle design evaluated at the `q0` returned by
    /// `binomial_location_scale_core` (called with no wiggle term), times
    /// `betaw`. Panics if the core or the wiggle design fails.
    fn rebuild_wiggle_nontrivial_states(
        family: &BinomialLocationScaleWiggleFamily,
        threshold_design: &DesignMatrix,
        log_sigma_design: &DesignMatrix,
        y: &Array1<f64>,
        weights: &Array1<f64>,
        beta_t: &Array1<f64>,
        beta_ls: &Array1<f64>,
        betaw: &Array1<f64>,
    ) -> Vec<ParameterBlockState> {
        let eta_t = threshold_design.matrixvectormultiply(beta_t);
        let eta_ls = log_sigma_design.matrixvectormultiply(beta_ls);

        let core_q0 =
            binomial_location_scale_core(y, weights, &eta_t, &eta_ls, None, &family.link_kind)
                .expect("core q0");
        let wiggle_basis = family
            .wiggle_design(core_q0.q0.view())
            .expect("wiggle design");
        let etaw = wiggle_basis.dot(betaw);

        // Pair each coefficient vector with its predictor, in block order.
        [(beta_t, eta_t), (beta_ls, eta_ls), (betaw, etaw)]
            .into_iter()
            .map(|(beta, eta)| ParameterBlockState {
                beta: beta.clone(),
                eta,
            })
            .collect()
    }

    /// Clone the gradient stored in block `block_idx` of `eval`.
    ///
    /// Panics with "expected exact newton" when that block's working set is
    /// the `Diagonal` variant instead of `ExactNewton`.
    fn extract_wiggle_gradient(eval: &FamilyEvaluation, block_idx: usize) -> Array1<f64> {
        let working_set = &eval.blockworking_sets[block_idx];
        match working_set {
            BlockWorkingSet::Diagonal { .. } => panic!("expected exact newton"),
            BlockWorkingSet::ExactNewton { gradient, .. } => gradient.clone(),
        }
    }

    #[test]
    fn wiggle_familygradients_match_finite_differencewith_nontrivial_designs() {
        // Central finite differences of the negative log-likelihood, one
        // coordinate at a time, are compared with the negated per-block
        // gradients reported by `evaluate`.
        let (family, threshold_design, log_sigma_design, wiggle_block, y, weights) =
            wiggle_nontrivial_fixture();

        let states_at = |bt: &Array1<f64>,
                         bls: &Array1<f64>,
                         bw: &Array1<f64>|
         -> Vec<ParameterBlockState> {
            rebuild_wiggle_nontrivial_states(
                &family,
                &threshold_design,
                &log_sigma_design,
                &y,
                &weights,
                bt,
                bls,
                bw,
            )
        };

        let neg_loglik = |bt: &Array1<f64>, bls: &Array1<f64>, bw: &Array1<f64>| {
            let states = states_at(bt, bls, bw);
            -family.evaluate(&states).expect("evaluate").log_likelihood
        };

        let beta_t = array![0.15, -0.3, 0.2];
        let beta_ls = array![-0.2, 0.1];
        let betaw = Array1::from_elem(wiggle_block.design.ncols(), 0.04);
        let eval = family
            .evaluate(&states_at(&beta_t, &beta_ls, &betaw))
            .expect("evaluate");
        let eps = 1e-6;

        // Objective with coordinate `coord` of block `block_idx` moved by `delta`.
        let shifted_neg_loglik = |block_idx: usize, coord: usize, delta: f64| -> f64 {
            let mut bt = beta_t.clone();
            let mut bls = beta_ls.clone();
            let mut bw = betaw.clone();
            match block_idx {
                BinomialLocationScaleWiggleFamily::BLOCK_T => bt[coord] += delta,
                BinomialLocationScaleWiggleFamily::BLOCK_LOG_SIGMA => bls[coord] += delta,
                BinomialLocationScaleWiggleFamily::BLOCK_WIGGLE => bw[coord] += delta,
                _ => unreachable!(),
            }
            neg_loglik(&bt, &bls, &bw)
        };

        for block_idx in 0..3 {
            let analytic = extract_wiggle_gradient(&eval, block_idx);
            let mut fd = Array1::<f64>::zeros(analytic.len());
            for (j, slot) in fd.iter_mut().enumerate() {
                let f_plus = shifted_neg_loglik(block_idx, j, eps);
                let f_minus = shifted_neg_loglik(block_idx, j, -eps);
                *slot = (f_plus - f_minus) / (2.0 * eps);
            }
            // The FD is of the negative log-likelihood, so it is compared with
            // the negated block gradient, both as single-column matrices.
            let fd_column = fd.insert_axis(Axis(1));
            let negated_gradient = (-&analytic).insert_axis(Axis(1));
            crate::test_support::assert_matrix_derivativefd(
                &fd_column,
                &negated_gradient,
                2e-4,
                &format!("wiggle block {block_idx} score"),
            );
        }
    }

    /// Checks the analytic joint Hessian of the wiggle family against central
    /// finite differences of the per-block gradients on the non-trivial fixture.
    ///
    /// Every coefficient of every block is shifted by `±eps`; the differenced,
    /// negated gradients of all three blocks are stacked into the matching column
    /// of the finite-difference matrix, which is then compared with the analytic
    /// joint Hessian.
    #[test]
    fn wiggle_family_joint_hessian_matches_fd_gradients_with_nontrivial_designs() {
        let (family, threshold_design, log_sigma_design, wiggle_block, y, weights) =
            wiggle_nontrivial_fixture();

        // Block states for an arbitrary coefficient triple.
        let states_for = |bt: &Array1<f64>,
                          bls: &Array1<f64>,
                          bw: &Array1<f64>|
         -> Vec<ParameterBlockState> {
            rebuild_wiggle_nontrivial_states(
                &family,
                &threshold_design,
                &log_sigma_design,
                &y,
                &weights,
                bt,
                bls,
                bw,
            )
        };

        let beta_t = Array1::from_vec(vec![0.15, -0.3, 0.2]);
        let beta_ls = Array1::from_vec(vec![-0.2, 0.1]);
        let betaw = Array1::from_vec(vec![0.04; wiggle_block.design.ncols()]);
        let states = states_for(&beta_t, &beta_ls, &betaw);
        let h_joint = family
            .exact_newton_joint_hessian(&states)
            .expect("joint hessian")
            .expect("expected joint exact hessian");

        // Block states after shifting coordinate `coord` of block `block` by `delta`.
        let shifted_states = |block: usize, coord: usize, delta: f64| -> Vec<ParameterBlockState> {
            let mut bt = beta_t.clone();
            let mut bls = beta_ls.clone();
            let mut bw = betaw.clone();
            match block {
                BinomialLocationScaleWiggleFamily::BLOCK_T => bt[coord] += delta,
                BinomialLocationScaleWiggleFamily::BLOCK_LOG_SIGMA => bls[coord] += delta,
                BinomialLocationScaleWiggleFamily::BLOCK_WIGGLE => bw[coord] += delta,
                _ => unreachable!(),
            }
            states_for(&bt, &bls, &bw)
        };

        let eps = 1e-6;
        let two_eps = 2.0 * eps;
        let pt = beta_t.len();
        let pls = beta_ls.len();
        let total = pt + pls + betaw.len();
        // First joint-Hessian column owned by each source block.
        let source_offsets = [0usize, pt, pt + pls];
        let mut fd = Array2::<f64>::zeros((total, total));

        for (source_block, &col_start) in source_offsets.iter().enumerate() {
            for j in 0..states[source_block].beta.len() {
                let eval_plus = family
                    .evaluate(&shifted_states(source_block, j, eps))
                    .expect("eval plus");
                let eval_minus = family
                    .evaluate(&shifted_states(source_block, j, -eps))
                    .expect("eval minus");

                let col_idx = col_start + j;
                let mut row_offset = 0usize;
                for target_block in 0..3 {
                    let grad_plus = extract_wiggle_gradient(&eval_plus, target_block);
                    let grad_minus = extract_wiggle_gradient(&eval_minus, target_block);
                    let row_end = row_offset + grad_plus.len();
                    // The finite-difference entry is the negated central
                    // difference of the reported gradients.
                    let col = (&grad_plus - &grad_minus).mapv(|v| -v / two_eps);
                    fd.slice_mut(s![row_offset..row_end, col_idx]).assign(&col);
                    row_offset = row_end;
                }
            }
        }

        crate::test_support::assert_matrix_derivativefd(
            &fd,
            &h_joint,
            4e-4,
            "wiggle joint hessian",
        );
    }

    /// Verifies the analytic directional derivative of the wiggle-family joint
    /// Hessian against a one-sided finite difference of the Hessian itself.
    ///
    /// The direction is the all-ones vector over the stacked coefficients. The
    /// perturbed states are rebuilt from the shifted coefficients, so the wiggle
    /// predictor is recomputed from the shifted threshold / log-sigma predictors.
    #[test]
    fn wiggle_family_joint_exacthessian_directional_derivative_matches_finite_difference() {
        assert!(file!().ends_with(".rs"));
        let n = 7usize;
        let y = Array1::from_vec(vec![0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0]);
        let weights = Array1::from_vec(vec![1.0; n]);
        let threshold_design = intercept_block(n).design.clone();
        let log_sigma_design = intercept_block(n).design.clone();
        let q_seed = Array1::linspace(-1.4, 1.4, n);
        let (wiggle_block, knots) = BinomialLocationScaleWiggleFamily::buildwiggle_block_input(
            q_seed.view(),
            3,
            4,
            2,
            false,
        )
        .expect("wiggle block");
        let family = BinomialLocationScaleWiggleFamily {
            y: y.clone(),
            weights: weights.clone(),
            link_kind: InverseLink::Standard(StandardLink::Probit),
            threshold_design: Some(threshold_design.clone()),
            log_sigma_design: Some(log_sigma_design.clone()),
            wiggle_knots: knots,
            wiggle_degree: 3,
            policy: crate::resource::ResourcePolicy::default_library(),
        };

        // Base point: coefficients, linear predictors and the wiggle predictor
        // evaluated on the base q0.
        let beta_t = Array1::from_vec(vec![0.25]);
        let beta_ls = Array1::from_vec(vec![-0.15]);
        let betaw = Array1::from_vec(vec![0.04; wiggle_block.design.ncols()]);
        let eta_t = threshold_design.matrixvectormultiply(&beta_t);
        let eta_ls = log_sigma_design.matrixvectormultiply(&beta_ls);
        let core_for_q0 =
            binomial_location_scale_core(&y, &weights, &eta_t, &eta_ls, None, &family.link_kind)
                .expect("core q0");
        let etaw = family
            .wiggle_design(core_for_q0.q0.view())
            .expect("wiggle design")
            .dot(&betaw);
        let states = vec![
            ParameterBlockState {
                beta: beta_t,
                eta: eta_t,
            },
            ParameterBlockState {
                beta: beta_ls,
                eta: eta_ls,
            },
            ParameterBlockState {
                beta: betaw,
                eta: etaw,
            },
        ];

        let base_h = family
            .exact_newton_joint_hessian(&states)
            .expect("joint hessian")
            .expect("expected joint exact hessian");
        let direction = Array1::ones(base_h.nrows());
        let analytic = family
            .exact_newton_joint_hessian_directional_derivative(&states, &direction)
            .expect("joint dH")
            .expect("expected joint exact dH");

        let eps = 1e-6;
        let idx_t = BinomialLocationScaleWiggleFamily::BLOCK_T;
        let idx_ls = BinomialLocationScaleWiggleFamily::BLOCK_LOG_SIGMA;
        let idx_w = BinomialLocationScaleWiggleFamily::BLOCK_WIGGLE;
        let beta_layout = GamlssBetaLayout::withwiggle(
            states[idx_t].beta.len(),
            states[idx_ls].beta.len(),
            states[idx_w].beta.len(),
        );
        let (dir_t, dir_ls, dirw) = beta_layout
            .split_three(&direction, "wiggle test direction split")
            .expect("split wiggle test direction");

        // Shift every block's coefficients by `eps` along its part of the direction.
        let beta_t_plus = &states[idx_t].beta + &(eps * dir_t);
        let beta_ls_plus = &states[idx_ls].beta + &(eps * dir_ls);
        let betaw_plus = &states[idx_w].beta + &(eps * dirw);
        let eta_t_plus = threshold_design.matrixvectormultiply(&beta_t_plus);
        let eta_ls_plus = log_sigma_design.matrixvectormultiply(&beta_ls_plus);
        let plus_core_q0 = binomial_location_scale_core(
            &y,
            &weights,
            &eta_t_plus,
            &eta_ls_plus,
            None,
            &family.link_kind,
        )
        .expect("plus core q0");
        let etaw_plus = family
            .wiggle_design(plus_core_q0.q0.view())
            .expect("plus wiggle design")
            .dot(&betaw_plus);

        let mut plus_states = states.clone();
        plus_states[idx_t] = ParameterBlockState {
            beta: beta_t_plus,
            eta: eta_t_plus,
        };
        plus_states[idx_ls] = ParameterBlockState {
            beta: beta_ls_plus,
            eta: eta_ls_plus,
        };
        plus_states[idx_w] = ParameterBlockState {
            beta: betaw_plus,
            eta: etaw_plus,
        };

        let h_plus = family
            .exact_newton_joint_hessian(&plus_states)
            .expect("plus joint hessian")
            .expect("expected plus joint hessian");
        // Forward difference of the Hessian along `direction`.
        let fd = (h_plus - base_h) / eps;
        crate::test_support::assert_matrix_derivativefd(&fd, &analytic, 2e-3, "joint dH");
    }

    /// Verifies the analytic second directional derivative of the wiggle-family
    /// joint Hessian (directions `u`, `v`) against a central finite difference, in
    /// direction `v`, of the analytic first directional derivative along `u`.
    #[test]
    fn wiggle_family_joint_exacthessiansecond_directional_derivative_matches_finite_difference() {
        assert!(file!().ends_with(".rs"));
        let n = 7usize;
        let y = Array1::from_vec(vec![0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0]);
        let weights = Array1::from_vec(vec![1.0; n]);
        let threshold_design = intercept_block(n).design.clone();
        let log_sigma_design = intercept_block(n).design.clone();
        let q_seed = Array1::linspace(-1.4, 1.4, n);
        let (wiggle_block, knots) = BinomialLocationScaleWiggleFamily::buildwiggle_block_input(
            q_seed.view(),
            4,
            4,
            2,
            false,
        )
        .expect("wiggle block");
        let family = BinomialLocationScaleWiggleFamily {
            y: y.clone(),
            weights: weights.clone(),
            link_kind: InverseLink::Standard(StandardLink::Probit),
            threshold_design: Some(threshold_design.clone()),
            log_sigma_design: Some(log_sigma_design.clone()),
            wiggle_knots: knots,
            wiggle_degree: 4,
            policy: crate::resource::ResourcePolicy::default_library(),
        };

        // Builds the three block states for a coefficient triple, recomputing the
        // wiggle predictor from the q0 implied by the other two predictors.
        let states_at = |bt: &Array1<f64>,
                         bls: &Array1<f64>,
                         bw: &Array1<f64>|
         -> Vec<ParameterBlockState> {
            let eta_t = threshold_design.matrixvectormultiply(bt);
            let eta_ls = log_sigma_design.matrixvectormultiply(bls);
            let core_q0 = binomial_location_scale_core(
                &y,
                &weights,
                &eta_t,
                &eta_ls,
                None,
                &family.link_kind,
            )
            .expect("core q0");
            let wiggle_basis = family
                .wiggle_design(core_q0.q0.view())
                .expect("wiggle design");
            let etaw = wiggle_basis.dot(bw);
            vec![
                ParameterBlockState {
                    beta: bt.clone(),
                    eta: eta_t,
                },
                ParameterBlockState {
                    beta: bls.clone(),
                    eta: eta_ls,
                },
                ParameterBlockState {
                    beta: bw.clone(),
                    eta: etaw,
                },
            ]
        };

        let beta_t = Array1::from_vec(vec![0.25]);
        let beta_ls = Array1::from_vec(vec![-0.15]);
        let betaw = Array1::from_vec(vec![0.03; wiggle_block.design.ncols()]);
        let states = states_at(&beta_t, &beta_ls, &betaw);

        let (pt, pls, pw) = (beta_t.len(), beta_ls.len(), betaw.len());
        let total = pt + pls + pw;
        let direction_u = Array1::from_shape_fn(total, |k| 0.2 + 0.1 * (k as f64));
        let directionv = Array1::from_shape_fn(total, |k| -0.15 + 0.07 * (k as f64));

        let analytic = family
            .exact_newton_joint_hessiansecond_directional_derivative(
                &states,
                &direction_u,
                &directionv,
            )
            .expect("joint d2H")
            .expect("expected joint exact d2H");

        let eps = 1e-6;
        let (step_t, step_ls, stepw) = GamlssBetaLayout::withwiggle(pt, pls, pw)
            .split_three(&directionv, "wiggle d2H test directionv")
            .expect("split wiggle test direction");

        // Coefficients shifted by `+eps * v` and `-eps * v`.
        let beta_t_fwd = &beta_t + &(eps * &step_t);
        let beta_ls_fwd = &beta_ls + &(eps * &step_ls);
        let betaw_fwd = &betaw + &(eps * &stepw);
        let states_plus = states_at(&beta_t_fwd, &beta_ls_fwd, &betaw_fwd);

        let beta_t_bwd = &beta_t - &(eps * &step_t);
        let beta_ls_bwd = &beta_ls - &(eps * &step_ls);
        let betaw_bwd = &betaw - &(eps * &stepw);
        let states_minus = states_at(&beta_t_bwd, &beta_ls_bwd, &betaw_bwd);

        let d_h_plus = family
            .exact_newton_joint_hessian_directional_derivative(&states_plus, &direction_u)
            .expect("joint dH plus")
            .expect("expected joint exact dH plus");
        let d_h_minus = family
            .exact_newton_joint_hessian_directional_derivative(&states_minus, &direction_u)
            .expect("joint dH minus")
            .expect("expected joint exact dH minus");
        let fd = (d_h_plus - d_h_minus) / (2.0 * eps);

        crate::test_support::assert_matrix_derivativefd(&fd, &analytic, 4e-3, "joint d2H");
    }

    /// Checks the off-diagonal blocks of the analytic wiggle-family joint Hessian
    /// (threshold/log-sigma, threshold/wiggle, log-sigma/wiggle) against central
    /// finite differences of the target block's gradient with respect to the
    /// source block's coefficients.
    #[test]
    fn wiggle_family_joint_hessian_cross_blocks_match_finite_difference_of_gradients() {
        let n = 7usize;
        let y = Array1::from_vec(vec![0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0]);
        let weights = Array1::from_vec(vec![1.0; n]);
        let threshold_design = intercept_block(n).design.clone();
        let log_sigma_design = intercept_block(n).design.clone();
        let q_seed = Array1::linspace(-1.4, 1.4, n);
        let (wiggle_block, knots) = BinomialLocationScaleWiggleFamily::buildwiggle_block_input(
            q_seed.view(),
            3,
            4,
            2,
            false,
        )
        .expect("wiggle block");
        let family = BinomialLocationScaleWiggleFamily {
            y: y.clone(),
            weights: weights.clone(),
            link_kind: InverseLink::Standard(StandardLink::Probit),
            threshold_design: Some(threshold_design.clone()),
            log_sigma_design: Some(log_sigma_design.clone()),
            wiggle_knots: knots,
            wiggle_degree: 3,
            policy: crate::resource::ResourcePolicy::default_library(),
        };

        // Builds the three block states for a coefficient triple, recomputing the
        // wiggle predictor from the q0 implied by the other two predictors.
        let states_at = |bt: &Array1<f64>,
                         bls: &Array1<f64>,
                         bw: &Array1<f64>|
         -> Vec<ParameterBlockState> {
            let eta_t = threshold_design.matrixvectormultiply(bt);
            let eta_ls = log_sigma_design.matrixvectormultiply(bls);
            let core_q0 = binomial_location_scale_core(
                &y,
                &weights,
                &eta_t,
                &eta_ls,
                None,
                &family.link_kind,
            )
            .expect("core q0");
            let wiggle_basis = family
                .wiggle_design(core_q0.q0.view())
                .expect("wiggle design");
            let etaw = wiggle_basis.dot(bw);
            vec![
                ParameterBlockState {
                    beta: bt.clone(),
                    eta: eta_t,
                },
                ParameterBlockState {
                    beta: bls.clone(),
                    eta: eta_ls,
                },
                ParameterBlockState {
                    beta: bw.clone(),
                    eta: etaw,
                },
            ]
        };

        // Pulls the gradient out of an exact-Newton working set; any other kind of
        // working set is a test failure.
        let gradient_of = |eval: &FamilyEvaluation, block_idx: usize| -> Array1<f64> {
            match &eval.blockworking_sets[block_idx] {
                BlockWorkingSet::ExactNewton { gradient, .. } => gradient.clone(),
                BlockWorkingSet::Diagonal { .. } => panic!("expected exact newton"),
            }
        };

        let beta_t = Array1::from_vec(vec![0.25]);
        let beta_ls = Array1::from_vec(vec![-0.15]);
        let betaw = Array1::from_vec(vec![0.04; wiggle_block.design.ncols()]);
        let states = states_at(&beta_t, &beta_ls, &betaw);

        let h_joint = family
            .exact_newton_joint_hessian(&states)
            .expect("joint hessian")
            .expect("expected joint exact hessian");

        let (pt, pls, pw) = (beta_t.len(), beta_ls.len(), betaw.len());
        let eps = 1e-6;

        // Block states after shifting coordinate `coord` of block `block` by `delta`.
        let shifted_states = |block: usize, coord: usize, delta: f64| -> Vec<ParameterBlockState> {
            let mut bt = beta_t.clone();
            let mut bls = beta_ls.clone();
            let mut bw = betaw.clone();
            match block {
                BinomialLocationScaleWiggleFamily::BLOCK_T => bt[coord] += delta,
                BinomialLocationScaleWiggleFamily::BLOCK_LOG_SIGMA => bls[coord] += delta,
                BinomialLocationScaleWiggleFamily::BLOCK_WIGGLE => bw[coord] += delta,
                _ => panic!("unexpected block"),
            }
            states_at(&bt, &bls, &bw)
        };

        // Finite-difference estimate of the (target, source) Hessian block: one
        // column per source coefficient.
        let fd_cross_block = |target_block: usize, source_block: usize| -> Array2<f64> {
            let n_rows = states[target_block].beta.len();
            let n_cols = states[source_block].beta.len();
            let mut out = Array2::<f64>::zeros((n_rows, n_cols));
            for j in 0..n_cols {
                let eval_plus = family
                    .evaluate(&shifted_states(source_block, j, eps))
                    .expect("eval plus");
                let eval_minus = family
                    .evaluate(&shifted_states(source_block, j, -eps))
                    .expect("eval minus");
                let grad_plus = gradient_of(&eval_plus, target_block);
                let grad_minus = gradient_of(&eval_minus, target_block);
                let col = (&grad_plus - &grad_minus).mapv(|v| -v / (2.0 * eps));
                out.column_mut(j).assign(&col);
            }
            out
        };

        let fd_t_ls = fd_cross_block(
            BinomialLocationScaleWiggleFamily::BLOCK_T,
            BinomialLocationScaleWiggleFamily::BLOCK_LOG_SIGMA,
        );
        let fd_tw = fd_cross_block(
            BinomialLocationScaleWiggleFamily::BLOCK_T,
            BinomialLocationScaleWiggleFamily::BLOCK_WIGGLE,
        );
        let fd_lsw = fd_cross_block(
            BinomialLocationScaleWiggleFamily::BLOCK_LOG_SIGMA,
            BinomialLocationScaleWiggleFamily::BLOCK_WIGGLE,
        );

        // Index ranges of the three blocks inside the stacked joint Hessian.
        let t_range = 0..pt;
        let ls_range = pt..pt + pls;
        let w_range = pt + pls..pt + pls + pw;
        let h_t_ls = h_joint
            .slice(ndarray::s![t_range.clone(), ls_range.clone()])
            .to_owned();
        let h_tw = h_joint
            .slice(ndarray::s![t_range, w_range.clone()])
            .to_owned();
        let h_lsw = h_joint.slice(ndarray::s![ls_range, w_range]).to_owned();

        crate::test_support::assert_matrix_derivativefd(&fd_t_ls, &h_t_ls, 2e-4, "H_t_ls");
        crate::test_support::assert_matrix_derivativefd(&fd_tw, &h_tw, 4e-4, "H_tw");
        crate::test_support::assert_matrix_derivativefd(&fd_lsw, &h_lsw, 6e-4, "H_lsw");
    }

    /// With both designs attached, `evaluate` on the non-wiggle family must return
    /// two exact-Newton working sets whose dense Hessians agree with the matching
    /// principal blocks of the joint exact Hessian.
    #[test]
    fn nonwiggle_family_evaluate_returns_exact_newton_blockswhen_designs_are_present() {
        let n = 6usize;
        let y = Array1::from_vec(vec![0.0, 1.0, 0.0, 1.0, 1.0, 0.0]);
        let weights = Array1::from_vec(vec![1.0; n]);
        // Row position mapped onto [0, 1].
        let unit_t = |i: usize| i as f64 / (n as f64 - 1.0);
        let threshold_design = DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(
            Array2::from_shape_fn((n, 2), |(i, j)| match j {
                0 => 1.0,
                1 => unit_t(i) - 0.5,
                _ => unreachable!(),
            }),
        ));
        let log_sigma_design = DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(
            Array2::from_shape_fn((n, 2), |(i, j)| match j {
                0 => 1.0,
                1 => (2.0 * std::f64::consts::PI * unit_t(i)).cos(),
                _ => unreachable!(),
            }),
        ));
        let family = BinomialLocationScaleFamily {
            y: y.clone(),
            weights: weights.clone(),
            link_kind: InverseLink::Standard(StandardLink::Probit),
            threshold_design: Some(threshold_design.clone()),
            log_sigma_design: Some(log_sigma_design.clone()),
            policy: crate::resource::ResourcePolicy::default_library(),
        };

        let beta_t = array![0.2, -0.15];
        let beta_ls = array![-0.1, 0.05];
        let eta_t = threshold_design.matrixvectormultiply(&beta_t);
        let eta_ls = log_sigma_design.matrixvectormultiply(&beta_ls);
        let states = vec![
            ParameterBlockState {
                beta: beta_t.clone(),
                eta: eta_t,
            },
            ParameterBlockState {
                beta: beta_ls.clone(),
                eta: eta_ls,
            },
        ];

        let eval = family.evaluate(&states).expect("evaluate nonwiggle family");
        assert_eq!(eval.blockworking_sets.len(), 2);
        let joint = family
            .exact_newton_joint_hessian(&states)
            .expect("joint hessian")
            .expect("expected joint exact hessian");
        let pt = beta_t.len();
        let pls = beta_ls.len();

        // (start, end) of each block's coefficients inside the joint Hessian.
        let spans = [(0usize, pt), (pt, pt + pls)];
        for (block_idx, (working_set, (start, end))) in
            eval.blockworking_sets.iter().zip(spans).enumerate()
        {
            let blockhessian = match working_set {
                BlockWorkingSet::ExactNewton { hessian, .. } => hessian.to_dense(),
                BlockWorkingSet::Diagonal { .. } => panic!("expected exact newton block"),
            };
            let joint_block = joint.slice(s![start..end, start..end]).to_owned();
            crate::test_support::assert_matrix_derivativefd(
                &joint_block,
                &blockhessian,
                1e-10,
                &format!("nonwiggle block {block_idx} principal block"),
            );
        }
    }

    /// Verifies the analytic directional derivative of the non-wiggle joint
    /// Hessian against a one-sided finite difference of the Hessian along the same
    /// direction.
    #[test]
    fn nonwiggle_family_joint_exacthessian_directional_derivative_matches_finite_difference() {
        let n = 8usize;
        let y = Array1::from_vec(vec![0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0]);
        let weights = Array1::from_vec(vec![1.0; n]);
        // Row position mapped onto [0, 1].
        let unit_t = |i: usize| i as f64 / (n as f64 - 1.0);
        let threshold_design = DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(
            Array2::from_shape_fn((n, 2), |(i, j)| match j {
                0 => 1.0,
                1 => (2.0 * std::f64::consts::PI * unit_t(i)).sin(),
                _ => unreachable!(),
            }),
        ));
        let log_sigma_design = DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(
            Array2::from_shape_fn((n, 2), |(i, j)| match j {
                0 => 1.0,
                1 => unit_t(i) - 0.5,
                _ => unreachable!(),
            }),
        ));
        let family = BinomialLocationScaleFamily {
            y: y.clone(),
            weights: weights.clone(),
            link_kind: InverseLink::Standard(StandardLink::Probit),
            threshold_design: Some(threshold_design.clone()),
            log_sigma_design: Some(log_sigma_design.clone()),
            policy: crate::resource::ResourcePolicy::default_library(),
        };

        // Both block states for a coefficient pair.
        let states_at = |bt: &Array1<f64>, bls: &Array1<f64>| {
            let eta_t = threshold_design.matrixvectormultiply(bt);
            let eta_ls = log_sigma_design.matrixvectormultiply(bls);
            vec![
                ParameterBlockState {
                    beta: bt.clone(),
                    eta: eta_t,
                },
                ParameterBlockState {
                    beta: bls.clone(),
                    eta: eta_ls,
                },
            ]
        };

        let beta_t = array![0.2, -0.1];
        let beta_ls = array![-0.15, 0.08];
        let states = states_at(&beta_t, &beta_ls);
        let base_h = family
            .exact_newton_joint_hessian(&states)
            .expect("joint hessian")
            .expect("expected joint exact hessian");
        let direction = array![0.2, 0.3, -0.15, 0.1];
        let analytic = family
            .exact_newton_joint_hessian_directional_derivative(&states, &direction)
            .expect("joint dH")
            .expect("expected joint exact dH");

        let eps = 1e-6;
        // Split the stacked direction into its threshold and log-sigma parts.
        let pt = beta_t.len();
        let dir_t = direction.slice(s![..pt]).to_owned();
        let dir_ls = direction.slice(s![pt..]).to_owned();
        let beta_t_plus = &beta_t + &(eps * &dir_t);
        let beta_ls_plus = &beta_ls + &(eps * &dir_ls);
        let states_plus = states_at(&beta_t_plus, &beta_ls_plus);
        let h_plus = family
            .exact_newton_joint_hessian(&states_plus)
            .expect("plus joint hessian")
            .expect("expected plus joint hessian");
        // Forward difference of the Hessian along `direction`.
        let fd = (h_plus - base_h) / eps;
        crate::test_support::assert_matrix_derivativefd(&fd, &analytic, 2e-3, "nonwiggle joint dH");
    }

    /// Verifies the analytic second directional derivative of the non-wiggle
    /// joint Hessian (directions `u`, `v`) against a central finite difference, in
    /// direction `v`, of the analytic first directional derivative along `u`.
    #[test]
    fn nonwiggle_family_joint_exacthessiansecond_directional_derivative_matches_finite_difference()
    {
        let n = 8usize;
        let y = Array1::from_vec(vec![0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0]);
        let weights = Array1::from_vec(vec![1.0; n]);
        // Row position mapped onto [0, 1].
        let unit_t = |i: usize| i as f64 / (n as f64 - 1.0);
        let threshold_design = DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(
            Array2::from_shape_fn((n, 2), |(i, j)| match j {
                0 => 1.0,
                1 => (2.0 * std::f64::consts::PI * unit_t(i)).sin(),
                _ => unreachable!(),
            }),
        ));
        let log_sigma_design = DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(
            Array2::from_shape_fn((n, 2), |(i, j)| match j {
                0 => 1.0,
                1 => unit_t(i) - 0.5,
                _ => unreachable!(),
            }),
        ));
        let family = BinomialLocationScaleFamily {
            y: y.clone(),
            weights: weights.clone(),
            link_kind: InverseLink::Standard(StandardLink::Probit),
            threshold_design: Some(threshold_design.clone()),
            log_sigma_design: Some(log_sigma_design.clone()),
            policy: crate::resource::ResourcePolicy::default_library(),
        };

        // Both block states for a coefficient pair.
        let states_at = |bt: &Array1<f64>, bls: &Array1<f64>| {
            let eta_t = threshold_design.matrixvectormultiply(bt);
            let eta_ls = log_sigma_design.matrixvectormultiply(bls);
            vec![
                ParameterBlockState {
                    beta: bt.clone(),
                    eta: eta_t,
                },
                ParameterBlockState {
                    beta: bls.clone(),
                    eta: eta_ls,
                },
            ]
        };

        let beta_t = array![0.2, -0.1];
        let beta_ls = array![-0.15, 0.08];
        let states = states_at(&beta_t, &beta_ls);
        let direction_u = array![0.2, 0.3, -0.15, 0.1];
        let directionv = array![-0.05, 0.12, 0.08, -0.09];
        let analytic = family
            .exact_newton_joint_hessiansecond_directional_derivative(
                &states,
                &direction_u,
                &directionv,
            )
            .expect("joint d2H")
            .expect("expected joint exact d2H");

        let eps = 1e-6;
        // Split `v` into its threshold and log-sigma parts.
        let pt = beta_t.len();
        let step_t = directionv.slice(s![..pt]).to_owned();
        let step_ls = directionv.slice(s![pt..]).to_owned();

        let beta_t_fwd = &beta_t + &(eps * &step_t);
        let beta_ls_fwd = &beta_ls + &(eps * &step_ls);
        let states_plus = states_at(&beta_t_fwd, &beta_ls_fwd);

        let beta_t_bwd = &beta_t - &(eps * &step_t);
        let beta_ls_bwd = &beta_ls - &(eps * &step_ls);
        let states_minus = states_at(&beta_t_bwd, &beta_ls_bwd);

        let d_h_plus = family
            .exact_newton_joint_hessian_directional_derivative(&states_plus, &direction_u)
            .expect("joint dH plus")
            .expect("expected joint exact dH plus");
        let d_h_minus = family
            .exact_newton_joint_hessian_directional_derivative(&states_minus, &direction_u)
            .expect("joint dH minus")
            .expect("expected joint exact dH minus");
        let fd = (d_h_plus - d_h_minus) / (2.0 * eps);
        crate::test_support::assert_matrix_derivativefd(
            &fd,
            &analytic,
            4e-3,
            "nonwiggle joint d2H",
        );
    }

    /// With every wiggle coefficient set to the same positive value, the
    /// first-derivative basis applied to those coefficients must be non-negative
    /// (up to a `1e-12` tolerance) at every seed point.
    #[test]
    fn wiggle_basis_is_structurally_monotone_for_nonnegative_coefficients() {
        let degree = 3usize;
        let num_internal_knots = 6usize;
        let penalty_order = 2usize;
        let q_seed = Array1::linspace(-2.0, 2.0, 17);

        let (block, knots) = BinomialLocationScaleWiggleFamily::buildwiggle_block_input(
            q_seed.view(),
            degree,
            num_internal_knots,
            penalty_order,
            false,
        )
        .expect("wiggle block");
        // Only the column count of the (dense) design is needed here.
        let n_coef = match &block.design {
            DesignMatrix::Dense(x) => x.to_dense_arc().ncols(),
            DesignMatrix::Sparse(_) => panic!("expected dense wiggle design"),
        };
        let beta = Array1::from_elem(n_coef, 0.2);

        let derivative_basis =
            monotone_wiggle_basis_with_derivative_order(q_seed.view(), &knots, degree, 1)
                .expect("wiggle derivative basis");
        let derivative = derivative_basis.dot(&beta);

        let all_nonnegative = derivative.iter().all(|&value| value >= -1e-12);
        assert!(
            all_nonnegative,
            "I-spline wiggle derivative must stay non-negative for non-negative coefficients: min={}",
            derivative.iter().fold(f64::INFINITY, |acc, &v| acc.min(v))
        );
    }

    /// An all-zero seed must still produce knots whose boundary entries (read at
    /// offset `bs_degree` from each end) reach at least ±2.9.
    #[test]
    fn degeneratewiggle_seed_uses_broad_fallback_domain() {
        let degree = 3usize;
        let q_seed = Array1::zeros(9);
        let knots = initializewiggle_knots_from_seed(q_seed.view(), degree, 5)
            .expect("initialize degenerate wiggle knots");

        let bs_degree = 1 + monotone_wiggle_internal_degree(degree).expect("cubic wiggle degree");
        let left_idx = bs_degree;
        let right_idx = knots.len() - bs_degree - 1;
        let domain_min = knots[left_idx];
        let domain_max = knots[right_idx];

        assert!(
            domain_min <= -2.9,
            "unexpected left fallback boundary: {domain_min}"
        );
        assert!(
            domain_max >= 2.9,
            "unexpected right fallback boundary: {domain_max}"
        );
    }

    /// The dense design of the wiggle block must equal, entry by entry (tolerance
    /// `1e-10`), the I-spline basis built directly from the returned knots at the
    /// wiggle's internal degree.
    #[test]
    fn wiggle_block_design_matches_ispline_basis() {
        let degree = 2usize;
        let num_internal_knots = 4usize;
        let penalty_order = 2usize;
        let q_seed = Array1::linspace(-1.0, 1.0, 11);

        let (block, knots) = BinomialLocationScaleWiggleFamily::buildwiggle_block_input(
            q_seed.view(),
            degree,
            num_internal_knots,
            penalty_order,
            false,
        )
        .expect("wiggle block");

        // Reference basis evaluated on the same seed with the same knots.
        let internal_degree = monotone_wiggle_internal_degree(degree).expect("wiggle degree");
        let (basis, _) = create_basis::<Dense>(
            q_seed.view(),
            KnotSource::Provided(knots.view()),
            internal_degree,
            BasisOptions::i_spline(),
        )
        .expect("I-spline basis");
        let expected = &*basis;

        let got = match &block.design {
            DesignMatrix::Dense(x) => x.to_dense_arc(),
            DesignMatrix::Sparse(_) => panic!("expected dense wiggle design"),
        };
        assert_eq!(got.dim(), expected.dim());

        let (n_rows, n_cols) = got.dim();
        for i in 0..n_rows {
            for j in 0..n_cols {
                let actual = got[[i, j]];
                let wanted = expected[[i, j]];
                assert!(
                    (actual - wanted).abs() < 1e-10,
                    "wiggle design mismatch at ({}, {}): got {}, expected {}",
                    i,
                    j,
                    actual,
                    wanted
                );
            }
        }
    }

    /// Called with `2` and the order list `[1, 2, 3, 3]`, the split must report
    /// `1` as the primary order and `[2, 3]` as the extras.
    #[test]
    fn split_wiggle_penalty_orders_uses_requested_order_one_as_primary() {
        let requested_orders = [1, 2, 3, 3];
        let (primary, extras) = split_wiggle_penalty_orders(2, &requested_orders);
        assert_eq!(primary, 1);
        assert_eq!(extras, vec![2, 3]);
    }

    /// Selecting a wiggle basis with penalty orders `[1, 3]` must yield a block
    /// carrying two penalties whose nullspace dimensions are `[1, 3]`.
    #[test]
    fn append_selected_wiggle_penalty_orders_keeps_order_one() {
        let cfg = WiggleBlockConfig {
            degree: 3usize,
            num_internal_knots: 5usize,
            penalty_order: 1,
            double_penalty: false,
        };
        let q_seed = Array1::linspace(-1.0, 1.0, 11);
        let requested_orders = [1, 3];

        let selected = select_wiggle_basis_from_seed(q_seed.view(), &cfg, &requested_orders)
            .expect("selected wiggle basis");
        let wiggle = &selected.block;

        assert_eq!(wiggle.penalties.len(), 2);
        assert_eq!(wiggle.nullspace_dims, vec![1, 3]);
    }

    /// The mean reported by `BinomialLocationScaleFamily::generativespec` must
    /// match, within 1e-7 per observation, the `mu` computed by
    /// `binomial_location_scale_core` from the same threshold and log-sigma
    /// predictors (no wiggle term, probit link).
    #[test]
    fn binomial_location_scale_generative_matches_coremu() {
        let n = 7usize;
        let y = array![0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0];
        let weights = Array1::from_elem(n, 1.0);
        let eta_t = array![0.8, -0.4, 0.2, -1.1, 0.0, 0.5, -0.7];
        let eta_ls = array![-3.0, -1.2, -0.1, 0.3, 1.1, 2.0, 4.0];

        let family = BinomialLocationScaleFamily {
            y: y.clone(),
            weights: weights.clone(),
            link_kind: InverseLink::Standard(StandardLink::Probit),
            threshold_design: None,
            log_sigma_design: None,
            policy: crate::resource::ResourcePolicy::default_library(),
        };

        // One state per block; only the linear predictors carry information here.
        let threshold_state = ParameterBlockState {
            beta: Array1::zeros(1),
            eta: eta_t.clone(),
        };
        let log_sigma_state = ParameterBlockState {
            beta: Array1::zeros(1),
            eta: eta_ls.clone(),
        };
        let states = vec![threshold_state, log_sigma_state];

        let spec = family.generativespec(&states).expect("generative spec");
        let core =
            binomial_location_scale_core(&y, &weights, &eta_t, &eta_ls, None, &family.link_kind)
                .expect("core");

        for i in 0..n {
            let gap = (spec.mean[i] - core.mu[i]).abs();
            assert!(
                gap < 1e-7,
                "mean mismatch at {i}: got {}, expected {}",
                spec.mean[i],
                core.mu[i]
            );
        }
    }

    /// Two consistency checks on `BinomialLocationScaleWiggleFamily`:
    ///
    /// 1. the design returned by `block_geometry` for the wiggle block equals
    ///    (abs tol 1e-12) `wiggle_design` evaluated at the `q0` that
    ///    `binomial_location_scale_core` computes without a wiggle term;
    /// 2. the mean from `generativespec` equals (abs tol 1e-7) the `mu` that
    ///    the core computes when given the wiggle predictor.
    #[test]
    fn wiggle_geometry_and_generative_use_same_sigma_link_as_core() {
        let n = 8usize;
        let y = array![0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0];
        let weights = Array1::from_elem(n, 1.0);
        let eta_t = array![0.5, -0.6, 0.1, -0.3, 0.9, -0.2, 0.4, -0.8];
        let eta_ls = array![-2.5, -1.5, -0.5, 0.0, 0.7, 1.4, 2.2, 3.0];

        let q_seed = Array1::linspace(-1.5, 1.5, n);
        let (wiggle_block, knots) = BinomialLocationScaleWiggleFamily::buildwiggle_block_input(
            q_seed.view(),
            2,
            3,
            2,
            false,
        )
        .expect("wiggle block");

        let family = BinomialLocationScaleWiggleFamily {
            y: y.clone(),
            weights: weights.clone(),
            link_kind: InverseLink::Standard(StandardLink::Probit),
            threshold_design: None,
            log_sigma_design: None,
            wiggle_knots: knots,
            wiggle_degree: 2,
            policy: crate::resource::ResourcePolicy::default_library(),
        };

        // Core evaluation without a wiggle term supplies the base index q0.
        let core_for_q0 =
            binomial_location_scale_core(&y, &weights, &eta_t, &eta_ls, None, &family.link_kind)
                .expect("core q0");

        // Wiggle predictor: basis at q0 times a constant coefficient vector.
        let betaw = Array1::from_elem(wiggle_block.design.ncols(), 0.15);
        let basis_at_q0 = family
            .wiggle_design(core_for_q0.q0.view())
            .expect("wiggle design");
        let etaw = basis_at_q0.dot(&betaw);

        let threshold_state = ParameterBlockState {
            beta: Array1::zeros(1),
            eta: eta_t.clone(),
        };
        let log_sigma_state = ParameterBlockState {
            beta: Array1::zeros(1),
            eta: eta_ls.clone(),
        };
        let wiggle_state = ParameterBlockState {
            beta: betaw.clone(),
            eta: etaw.clone(),
        };
        let states = vec![threshold_state, log_sigma_state, wiggle_state];

        // --- Check 1: block geometry reproduces the wiggle design at q0. ---
        let wigglespec = wiggle_block
            .clone()
            .intospec("wiggle")
            .expect("wiggle spec");
        let (geom_x, _) = family
            .block_geometry(&states, &wigglespec)
            .expect("block geometry");
        let DesignMatrix::Dense(geom_dense) = geom_x else {
            panic!("expected dense wiggle geometry design")
        };
        let geom = geom_dense.to_dense();
        let expected_geom = family
            .wiggle_design(core_for_q0.q0.view())
            .expect("expected wiggle geometry");
        assert_eq!(geom.dim(), expected_geom.dim());
        let (rows, cols) = (geom.nrows(), geom.ncols());
        for (i, j) in (0..rows).flat_map(|i| (0..cols).map(move |j| (i, j))) {
            let gap = (geom[[i, j]] - expected_geom[[i, j]]).abs();
            assert!(
                gap < 1e-12,
                "geometry mismatch at ({i}, {j}): got {}, expected {}",
                geom[[i, j]],
                expected_geom[[i, j]]
            );
        }

        // --- Check 2: generative mean agrees with the core's mu. ---
        let generated = family.generativespec(&states).expect("generative spec");
        let core = binomial_location_scale_core(
            &y,
            &weights,
            &eta_t,
            &eta_ls,
            Some(&etaw),
            &family.link_kind,
        )
        .expect("core with wiggle");
        for i in 0..n {
            let gap = (generated.mean[i] - core.mu[i]).abs();
            assert!(
                gap < 1e-7,
                "wiggle mean mismatch at {i}: got {}, expected {}",
                generated.mean[i],
                core.mu[i]
            );
        }
    }

    /// Evaluates `PoissonLogFamily` at a linear predictor containing an extreme
    /// entry (709.0). If evaluation succeeds, the working response, working
    /// weights and log-likelihood must all be finite.
    ///
    /// An `Err` from `evaluate` is not treated as a failure here: the test
    /// only constrains the successful path.
    #[test]
    fn poisson_extreme_eta_stays_finite_with_safe_exp() {
        use crate::families::custom_family::{CustomFamily, ParameterBlockState};

        let poisson = PoissonLogFamily {
            y: array![1.0, 2.0, 3.0],
            weights: array![1.0, 1.0, 1.0],
        };
        let state = ParameterBlockState {
            beta: Array1::zeros(0),
            eta: array![0.5, 709.0, -0.3],
        };

        // A rejected evaluation is tolerated; only successful output is checked.
        let Ok(eval) = poisson.evaluate(&[state]) else {
            return;
        };

        let crate::families::custom_family::BlockWorkingSet::Diagonal {
            working_response,
            working_weights,
        } = &eval.blockworking_sets[0]
        else {
            panic!("expected Diagonal block")
        };

        let response_finite = working_response.iter().all(|v| v.is_finite());
        let weights_finite = working_weights.iter().all(|v| v.is_finite());
        let likelihood_finite = eval.log_likelihood.is_finite();
        assert!(
            response_finite && weights_finite && likelihood_finite,
            "Poisson evaluate should produce finite outputs for all eta, \
             but got non-finite values: ll={}, z={:?}, w={:?}",
            eval.log_likelihood, working_response, working_weights
        );
    }

    /// Checks the outer gradient reported for `BinomialLocationScaleFamily`
    /// against a central finite difference of the outer cost from the same
    /// evaluator: `g_k ≈ (V(ρ + h e_k) − V(ρ − h e_k)) / (2h)`.
    ///
    /// The property does not depend on whether the generic per-coordinate
    /// path is reachable in this build. Because the unified evaluator routes
    /// `ValueAndGradient` calls through the batched override (custom_family.rs
    /// at the `batched_outer_gradient_terms` call site), the check also pins
    /// the wiring: a regression that detaches the override from the dispatcher
    /// would surface as stale (zero) gradients and trip the comparison.
    #[test]
    fn binomial_location_scale_batched_gradient_matches_finite_difference() {
        use crate::families::custom_family::BlockwiseFitOptions;

        // Small two-block intercept-only problem with a unit-Identity penalty
        // per block (7 rows per the original fixture note). Larger n risks
        // PIRLS taking many iterations and amplifying FD round-off; small p
        // keeps the leverage-block sizes (p_t = 1, p_ls = 1) tiny so the
        // manual reference stays easy to sanity-check.
        let base = binomial_location_scale_base_fixture();
        let family = BinomialLocationScaleFamily {
            y: base.y,
            weights: base.weights,
            link_kind: InverseLink::Standard(StandardLink::Probit),
            threshold_design: Some(base.threshold_design),
            log_sigma_design: Some(base.log_sigma_design),
            policy: crate::resource::ResourcePolicy::default_library(),
        };

        let specs = vec![base.threshold_spec, base.log_sigma_spec];
        let options = BlockwiseFitOptions {
            use_remlobjective: true,
            ridge_floor: 1e-10,
            outer_max_iter: 1,
            ..BlockwiseFitOptions::default()
        };
        let rho = array![0.05, -0.15];

        // Outer objective and gradient at a given log-smoothing vector, with
        // no ψ-derivative payload on any block.
        let eval_outer = |rho_at: &Array1<f64>| {
            let derivative_blocks: Vec<Vec<CustomFamilyBlockPsiDerivative>> =
                (0..specs.len()).map(|_| Vec::new()).collect();
            let outcome = evaluate_custom_family_joint_hyper(
                &family,
                &specs,
                &options,
                rho_at,
                &derivative_blocks,
                None,
                crate::solver::estimate::reml::unified::EvalMode::ValueAndGradient,
            )
            .expect("objective+gradient at rho");
            (outcome.objective, outcome.gradient)
        };

        let (f0, g0) = eval_outer(&rho);
        assert!(f0.is_finite(), "outer cost must be finite at rho");
        assert_eq!(g0.len(), rho.len());

        let h = 1e-5;
        // Same noise-floor convention as the existing wiggle-family FD test
        // (custom_family.rs `outer_lamlgradient_matches_finite_differencewhen_joint_exact_path_is_active`):
        // below the floor `EPS·|cost|/h` the FD estimator cannot resolve the
        // true gradient.
        let cost_magnitude = f0.abs().max(1.0);
        let noise_floor = (10.0 * f64::EPSILON * cost_magnitude / h).max(1e-9);

        // Copy of `rho` with coordinate `k` shifted by `delta`.
        let shifted = |k: usize, delta: f64| {
            let mut moved = rho.clone();
            moved[k] += delta;
            moved
        };

        for k in 0..rho.len() {
            let (fp, _) = eval_outer(&shifted(k, h));
            let (fm, _) = eval_outer(&shifted(k, -h));
            let gfd = (fp - fm) / (2.0 * h);

            // Skip coordinates where both estimates sit below the FD noise floor.
            if g0[k].abs() < noise_floor && gfd.abs() < noise_floor {
                continue;
            }

            let abs_err = (g0[k] - gfd).abs();
            let scale = gfd.abs().max(g0[k].abs()).max(1e-12);
            let rel_err = abs_err / scale;
            assert!(
                rel_err < 1e-3 || abs_err < 1e-6,
                "batched gradient mismatch at coord {k}: \
                 batched={:.6e}, fd={:.6e}, abs_err={:.3e}, rel_err={:.3e}",
                g0[k],
                gfd,
                abs_err,
                rel_err,
            );
        }
    }

    fn binomial_mean_wiggle_operator_fixture() -> (
        BinomialMeanWiggleFamily,
        Vec<ParameterBlockState>,
        Vec<ParameterBlockSpec>,
        Array2<f64>,
    ) {
        let x_eta = array![
            [1.0, -0.9],
            [1.0, -0.45],
            [1.0, -0.1],
            [1.0, 0.2],
            [1.0, 0.55],
            [1.0, 0.9],
        ];
        let beta_eta = array![-0.15, 0.7];
        let eta = x_eta.dot(&beta_eta);
        let degree = 3usize;
        let knots =
            initializewiggle_knots_from_seed(eta.view(), degree, 4).expect("mean-wiggle knots");
        let family = BinomialMeanWiggleFamily {
            y: array![0.0, 1.0, 0.0, 1.0, 1.0, 0.0],
            weights: array![1.0, 0.8, 1.2, 1.0, 0.7, 1.1],
            link_kind: InverseLink::Standard(StandardLink::Logit),
            wiggle_knots: knots,
            wiggle_degree: degree,
            policy: crate::resource::ResourcePolicy::default_library(),
        };
        let basis = family.wiggle_design(eta.view()).expect("wiggle basis");
        let beta_w = Array1::from_iter((0..basis.ncols()).map(|j| 0.015 * (j as f64 + 1.0)));
        let etaw = basis.dot(&beta_w);
        let states = vec![
            ParameterBlockState {
                beta: beta_eta,
                eta: eta.clone(),
            },
            ParameterBlockState {
                beta: beta_w,
                eta: etaw,
            },
        ];
        let specs = vec![
            ParameterBlockSpec {
                name: "eta".to_string(),
                design: DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(x_eta.clone())),
                offset: Array1::zeros(eta.len()),
                penalties: vec![],
                nullspace_dims: vec![],
                initial_log_lambdas: Array1::zeros(0),
                initial_beta: None,
                gauge_priority: 100,
                jacobian_callback: None,
                stacked_design: None,
                stacked_offset: None,
            },
            ParameterBlockSpec {
                name: "wiggle".to_string(),
                design: DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(basis)),
                offset: Array1::zeros(eta.len()),
                penalties: vec![],
                nullspace_dims: vec![],
                initial_log_lambdas: Array1::zeros(0),
                initial_beta: None,
                gauge_priority: 100,
                jacobian_callback: None,
                stacked_design: None,
                stacked_offset: None,
            },
        ];
        (family, states, specs, x_eta)
    }

    /// Asserts that two matrices have the same shape and that their largest
    /// entrywise absolute difference is strictly below `tol`.
    ///
    /// `label` prefixes both failure messages.
    ///
    /// # Panics
    ///
    /// Panics on a shape mismatch, when the maximum absolute difference is
    /// `>= tol`, or when any entrywise difference is NaN.
    fn assert_close_matrix(a: &Array2<f64>, b: &Array2<f64>, tol: f64, label: &str) {
        assert_eq!(a.dim(), b.dim(), "{label} shape mismatch");
        // `f64::max` returns the non-NaN operand, so a plain max-fold would
        // silently drop NaN differences and let a NaN-contaminated matrix
        // pass. Keep NaN sticky instead: once seen, the running maximum stays
        // NaN and the `<` comparison below fails.
        let max_err = a
            .iter()
            .zip(b.iter())
            .map(|(&x, &y)| (x - y).abs())
            .fold(0.0_f64, |worst, err| {
                if worst.is_nan() || err.is_nan() {
                    f64::NAN
                } else {
                    worst.max(err)
                }
            });
        assert!(
            max_err < tol,
            "{label} max error {max_err:.3e} >= {tol:.3e}"
        );
    }

    /// For the `BinomialMeanWiggleFamily` fixture, the operator-backed
    /// representations must reproduce the dense joint-Hessian quantities to
    /// 1e-10:
    ///
    /// * the static Hessian operator applied to the identity vs. the dense H,
    /// * the workspace Hessian-vector product vs. `H · u`,
    /// * the first directional-derivative operator vs. the dense dH[u],
    /// * the second directional-derivative operator vs. the dense d²H[u, v].
    #[test]
    fn binomial_mean_wiggle_hessian_operators_match_dense_derivatives() {
        let (family, states, specs, x_eta) = binomial_mean_wiggle_operator_fixture();
        let total = x_eta.ncols() + states[BinomialMeanWiggleFamily::BLOCK_WIGGLE].beta.len();

        // Two fixed, non-trivial directions in the joint coefficient space.
        let dir_u = Array1::from_shape_fn(total, |j| 0.03 * (j as f64 + 1.0).sin());
        let dir_v = Array1::from_shape_fn(total, |j| -0.02 * (j as f64 + 0.5).cos());

        let dense_h = family
            .exact_newton_joint_hessian_with_specs(&states, &specs)
            .expect("dense H")
            .expect("dense H available");
        let workspace = family
            .exact_newton_joint_hessian_workspace(&states, &specs)
            .expect("workspace")
            .expect("workspace available");

        // Static operator: multiplying by the identity materializes it.
        let identity = Array2::<f64>::eye(total);
        let static_op = family
            .bmw_static_hessian_operator(&states, Arc::new(x_eta.clone()))
            .expect("static op");
        let op_h = crate::solver::estimate::reml::unified::HyperOperator::mul_mat(
            static_op.as_ref(),
            &identity,
        );
        assert_close_matrix(&op_h, &dense_h, 1e-10, "static H operator");

        // Workspace Hessian-vector product, compared in the 1-norm.
        let hv = workspace
            .hessian_matvec(&dir_u)
            .expect("workspace HVP")
            .expect("workspace HVP available");
        let hv_dense = dense_h.dot(&dir_u);
        let hv_err = (&hv - &hv_dense).mapv(f64::abs).sum();
        assert!(hv_err < 1e-10, "workspace HVP mismatch {hv_err:.3e}");

        // First directional derivative dH[u].
        let dense_dh = family
            .exact_newton_joint_hessian_directional_derivative_with_specs(&states, &specs, &dir_u)
            .expect("dense dH")
            .expect("dense dH available");
        let op_dh = workspace
            .directional_derivative_operator(&dir_u)
            .expect("dH operator")
            .expect("dH operator available")
            .to_dense();
        assert_close_matrix(&op_dh, &dense_dh, 1e-10, "directional dH operator");

        // Second directional derivative d²H[u, v].
        let dense_d2h = family
            .exact_newton_joint_hessian_second_directional_derivative_with_specs(
                &states, &specs, &dir_u, &dir_v,
            )
            .expect("dense d2H")
            .expect("dense d2H available");
        let op_d2h = workspace
            .second_directional_derivative_operator(&dir_u, &dir_v)
            .expect("d2H operator")
            .expect("d2H operator available")
            .to_dense();
        assert_close_matrix(
            &op_d2h,
            &dense_d2h,
            1e-10,
            "second directional d2H operator",
        );
    }

    /// With 50,001 observations and block widths 2 and 34, the family must
    /// still report that the inner coefficient Hessian-vector product is
    /// available and that the exact outer derivative order is `Second`.
    #[test]
    fn binomial_mean_wiggle_planner_keeps_second_order_at_large_n() {
        let n = 50_001usize;

        let knot_seed = Array1::linspace(-1.0, 1.0, 9);
        let wiggle_knots =
            initializewiggle_knots_from_seed(knot_seed.view(), 3, 4).expect("large-n knots");
        let family = BinomialMeanWiggleFamily {
            y: Array1::zeros(n),
            weights: Array1::ones(n),
            link_kind: InverseLink::Standard(StandardLink::Logit),
            wiggle_knots,
            wiggle_degree: 3,
            policy: crate::resource::ResourcePolicy::default_library(),
        };

        // Unpenalized block spec over an all-zero dense design of the given width.
        let zero_design_spec = |name: &str, width: usize| ParameterBlockSpec {
            name: name.to_string(),
            design: DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(Array2::zeros(
                (n, width),
            ))),
            offset: Array1::zeros(n),
            penalties: vec![],
            nullspace_dims: vec![],
            initial_log_lambdas: Array1::zeros(0),
            initial_beta: None,
            gauge_priority: 100,
            jacobian_callback: None,
            stacked_design: None,
            stacked_offset: None,
        };
        let specs = vec![zero_design_spec("eta", 2), zero_design_spec("wiggle", 34)];

        assert!(family.inner_coefficient_hessian_hvp_available(&specs));
        let planned_order =
            family.exact_outer_derivative_order(&specs, &BlockwiseFitOptions::default());
        assert_eq!(
            planned_order,
            crate::custom_family::ExactOuterDerivativeOrder::Second
        );
    }

    /// Regression guard for #684 on the ψ / influence-Jacobian (IFT) joint
    /// Hessian. The Newton/REML dense↔workspace path is pinned by
    /// `gaussian_location_scale_workspace_matvec_matches_dense`, but nothing
    /// pinned the *separate* representation used by the three
    /// `exact_newton_joint_psi*` builders — which is exactly where the observed
    /// `2κm` Fisher-cross drift slipped in uncaught. The Gaussian mean⊥scale
    /// Fisher cross E[H_{μ,ls}] = 2κ·E[m] = 0 (m = r·weight/σ², E[r] = 0) must
    /// be exactly 0 on the ψ joint Hessian and on ALL of its ψ-directional
    /// derivatives (1st, 2nd, and mixed β·ψ), because a function identically 0
    /// has identically-0 derivatives. The fixtures carry NONZERO residuals
    /// (y ≠ η_μ), so the old buggy `2κm` cross is genuinely nonzero — this test
    /// FAILS against the pre-fix code.
    ///
    /// A NaN anywhere in an asserted cross block also fails the test: the
    /// block maximum propagates NaN rather than skipping it.
    #[test]
    fn gaussian_location_scale_psi_joint_hessian_pins_fisher_cross_zero() {
        use crate::solver::estimate::reml::unified::HyperOperator;

        // Materialize an `ExactNewtonJointPsiTerms` joint Hessian regardless of
        // whether the family returns it dense or operator-backed.
        fn materialize(
            dense: &Array2<f64>,
            operator: Option<&dyn HyperOperator>,
            total: usize,
        ) -> Array2<f64> {
            match operator {
                Some(op) => op.to_dense(),
                None => {
                    assert_eq!(dense.dim(), (total, total));
                    dense.clone()
                }
            }
        }

        // Max |entry| over the rectangular block H[r0..r1, c0..c1].
        //
        // Returns NaN as soon as a NaN entry is met. `f64::max` would discard
        // it, making a NaN-filled cross block look like an exact zero and pass
        // the `<= CROSS_TOL` guards below.
        fn block_max_abs(h: &Array2<f64>, r0: usize, r1: usize, c0: usize, c1: usize) -> f64 {
            let mut m = 0.0_f64;
            for r in r0..r1 {
                for c in c0..c1 {
                    let v = h[[r, c]].abs();
                    if v.is_nan() {
                        return f64::NAN;
                    }
                    m = m.max(v);
                }
            }
            m
        }

        // ψ-derivative payload for the mean block: a design Jacobian `x_psi`
        // and its second-order companion `x_psi_psi`, with zero penalty
        // derivatives and no implicit operator.
        fn mean_block_psi_payload(
            x_psi: Array2<f64>,
            x_psi_psi: Array2<f64>,
        ) -> CustomFamilyBlockPsiDerivative {
            let p = x_psi.ncols();
            CustomFamilyBlockPsiDerivative {
                penalty_index: None,
                x_psi,
                s_psi: Array2::zeros((p, p)),
                s_psi_components: None,
                s_psi_penalty_components: None,
                x_psi_psi: Some(vec![x_psi_psi]),
                s_psi_psi: Some(vec![Array2::zeros((p, p))]),
                s_psi_psi_components: None,
                s_psi_psi_penalty_components: None,
                implicit_operator: None,
                implicit_axis: 0,
                implicit_group_id: None,
            }
        }

        const CROSS_TOL: f64 = 1e-12;

        // ---- Non-wiggle GaussianLocationScaleFamily ----------------------
        {
            let (family, states, specs) = gls_workspace_fixture();
            let p_mu = states[GaussianLocationScaleFamily::BLOCK_MU].beta.len();
            let p_ls = states[GaussianLocationScaleFamily::BLOCK_LOG_SIGMA]
                .beta
                .len();
            let total = p_mu + p_ls;

            // Nonzero ψ design-Jacobian on the MEAN (μ) block so psi_index 0
            // resolves a nonzero z_primary_psi: the observed `2κmD` cross would
            // then leak into H_{μ,ls} on the old code. A second-order payload
            // (x_psi_psi) feeds the 2nd-order builder too.
            let x_mu_psi = Array2::from_shape_fn((family.y.len(), p_mu), |(i, j)| {
                0.2 + 0.11 * ((i as f64) * 0.37 + (j as f64) * 0.53).sin()
            });
            let x_mu_psi_psi = Array2::from_shape_fn((family.y.len(), p_mu), |(i, j)| {
                0.07 * ((i as f64) * 0.19 + (j as f64) * 0.23).cos()
            });
            let derivative_blocks = vec![
                vec![mean_block_psi_payload(x_mu_psi, x_mu_psi_psi)],
                Vec::new(),
            ];

            // The dense Fisher joint Hessian itself must have a zero μ↔logσ
            // cross (cross=0 Fisher; #684) — sanity that the dense path agrees
            // with the ψ-path's zero, since the ψ-Hessian is the ψ-derivative
            // of exactly this curvature object.
            let dense_h = family
                .exact_newton_joint_hessian(&states)
                .expect("dense joint Hessian build")
                .expect("dense joint Hessian present");
            let dense_cross = block_max_abs(&dense_h, 0, p_mu, p_mu, total);
            assert!(
                dense_cross <= CROSS_TOL,
                "#684: dense Fisher joint Hessian μ↔logσ cross block must be 0, got max |.|={:.3e}",
                dense_cross
            );

            // 1st-order ψ joint Hessian.
            let psi = family
                .exact_newton_joint_psi_terms(&states, &specs, &derivative_blocks, 0)
                .expect("psi terms call")
                .expect("gaussian psi terms present");
            let h_psi = materialize(&psi.hessian_psi, psi.hessian_psi_operator.as_deref(), total);
            let cross = block_max_abs(&h_psi, 0, p_mu, p_mu, total);
            assert!(
                cross <= CROSS_TOL,
                "#684: ψ joint Hessian μ↔logσ cross block must be Fisher-0 (observed 2κm \
                 drift), got max |.|={cross:.3e}"
            );

            // 2nd-order ψ joint Hessian.
            let psi2 = family
                .exact_newton_joint_psisecond_order_terms(&states, &specs, &derivative_blocks, 0, 0)
                .expect("psi 2nd-order call")
                .expect("gaussian psi 2nd-order present");
            let h_psi2 = materialize(
                &psi2.hessian_psi_psi,
                psi2.hessian_psi_psi_operator.as_deref(),
                total,
            );
            let cross2 = block_max_abs(&h_psi2, 0, p_mu, p_mu, total);
            assert!(
                cross2 <= CROSS_TOL,
                "#684: 2nd-order ψ joint Hessian μ↔logσ cross block must be Fisher-0, \
                 got max |.|={cross2:.3e}"
            );

            // Mixed β·ψ directional derivative of the ψ joint Hessian.
            let d_beta = Array1::from_shape_fn(total, |i| 0.05 + 0.13 * ((i + 1) as f64).sin());
            let mixed = family
                .exact_newton_joint_psihessian_directional_derivative(
                    &states,
                    &specs,
                    &derivative_blocks,
                    0,
                    &d_beta,
                )
                .expect("psi mixed-drift call")
                .expect("gaussian psi mixed-drift present");
            assert_eq!(mixed.dim(), (total, total));
            let crossm = block_max_abs(&mixed, 0, p_mu, p_mu, total);
            assert!(
                crossm <= CROSS_TOL,
                "#684: mixed β·ψ ψ-Hessian μ↔logσ cross block must be Fisher-0, \
                 got max |.|={crossm:.3e}"
            );
        }

        // ---- Wiggle GaussianLocationScaleWiggleFamily --------------------
        {
            let (family, states, specs, ..) = gls_wiggle_workspace_fixture();
            let p_mu = states[GaussianLocationScaleWiggleFamily::BLOCK_MU]
                .beta
                .len();
            let p_ls = states[GaussianLocationScaleWiggleFamily::BLOCK_LOG_SIGMA]
                .beta
                .len();
            let p_w = states[GaussianLocationScaleWiggleFamily::BLOCK_WIGGLE]
                .beta
                .len();
            let total = p_mu + p_ls + p_w;
            // Block column offsets in the flattened joint coefficient space.
            let mu0 = 0usize;
            let ls0 = p_mu;
            let ls1 = p_mu + p_ls;
            let w0 = p_mu + p_ls;
            let w1 = total;

            // ψ design-Jacobian on the MEAN (μ) block (psi_index 0). The wiggle
            // block does not carry an independent ψ axis here; a nonzero mean ψ
            // is enough to exercise BOTH mean⊥scale crosses (coeff_ml = 2κmD and
            // l = 2κm) and their derivatives on the old code.
            let x_mu_psi = Array2::from_shape_fn((family.y.len(), p_mu), |(i, j)| {
                0.18 + 0.09 * ((i as f64) * 0.41 + (j as f64) * 0.29).sin()
            });
            let x_mu_psi_psi = Array2::from_shape_fn((family.y.len(), p_mu), |(i, j)| {
                0.06 * ((i as f64) * 0.17 + (j as f64) * 0.31).cos()
            });
            let derivative_blocks = vec![
                vec![mean_block_psi_payload(x_mu_psi, x_mu_psi_psi)],
                Vec::new(),
                Vec::new(),
            ];

            // Assert BOTH mean⊥scale cross blocks are Fisher-0 on the ψ joint
            // Hessian: μ↔logσ AND wiggle↔logσ. Leave the within-mean (μ↔wiggle)
            // and within-scale (logσ↔logσ) blocks unasserted (genuinely
            // nonzero).
            let assert_wiggle_crosses_zero = |h: &Array2<f64>, label: &str| {
                let c_ml = block_max_abs(h, mu0, ls0, ls0, ls1);
                let c_wl = block_max_abs(h, w0, w1, ls0, ls1);
                assert!(
                    c_ml <= CROSS_TOL,
                    "#684 (wiggle {label}): μ↔logσ cross block must be Fisher-0 \
                     (observed 2κmD drift), got max |.|={c_ml:.3e}"
                );
                assert!(
                    c_wl <= CROSS_TOL,
                    "#684 (wiggle {label}): wiggle↔logσ cross block must be Fisher-0 \
                     (observed 2κm drift; the wiggle is mean-side), got max |.|={c_wl:.3e}"
                );
            };

            // Dense Fisher joint Hessian sanity: both mean⊥scale crosses zero.
            let dense_h = family
                .exact_newton_joint_hessian(&states)
                .expect("wiggle dense joint Hessian build")
                .expect("wiggle dense joint Hessian present");
            assert_eq!(dense_h.dim(), (total, total));
            assert_wiggle_crosses_zero(&dense_h, "dense Fisher");

            // 1st-order ψ.
            let psi = family
                .exact_newton_joint_psi_terms(&states, &specs, &derivative_blocks, 0)
                .expect("wiggle psi terms call")
                .expect("wiggle psi terms present");
            let h_psi = materialize(&psi.hessian_psi, psi.hessian_psi_operator.as_deref(), total);
            assert_wiggle_crosses_zero(&h_psi, "1st-order ψ");

            // 2nd-order ψ.
            let psi2 = family
                .exact_newton_joint_psisecond_order_terms(&states, &specs, &derivative_blocks, 0, 0)
                .expect("wiggle psi 2nd-order call")
                .expect("wiggle psi 2nd-order present");
            let h_psi2 = materialize(
                &psi2.hessian_psi_psi,
                psi2.hessian_psi_psi_operator.as_deref(),
                total,
            );
            assert_wiggle_crosses_zero(&h_psi2, "2nd-order ψ");

            // Mixed β·ψ.
            let d_beta = Array1::from_shape_fn(total, |i| 0.04 + 0.1 * ((i + 1) as f64).cos());
            let mixed = family
                .exact_newton_joint_psihessian_directional_derivative(
                    &states,
                    &specs,
                    &derivative_blocks,
                    0,
                    &d_beta,
                )
                .expect("wiggle psi mixed-drift call")
                .expect("wiggle psi mixed-drift present");
            assert_eq!(mixed.dim(), (total, total));
            assert_wiggle_crosses_zero(&mixed, "mixed β·ψ");
        }
    }
}