// gam 0.3.121 - Docs.rs

use super::*;

/// Rho-dimension cap associated with second-order REML handling.
///
/// NOTE(review): no use of this constant is visible in this part of the file;
/// presumably second-order REML steps are only attempted when the number of
/// smoothing parameters is at most this value — confirm at the call site.
pub(crate) const REML_SECOND_ORDER_RHO_CAP: usize = 4;
/// Continuation prewarm is a seed-polishing pass, not part of the REML
/// objective. It can be useful for tiny rho spaces where one or two warm
/// solves amortize, but it scales with the number of starts and runs full
/// inner solves before the real optimizer even begins. Moderate/high-rho
/// smooths (measure-jet spectral candidates are the motivating profile) start
/// directly from the seed lattice; the optimizer's own line search owns
/// globalization.
pub(crate) const REML_CONTINUATION_PREWARM_RHO_CAP: usize = 4;
/// Above this rho dimension, startup work must be linear in "one real solve",
/// not "rank a seed lattice with capped PIRLS solves". The heuristic seed is
/// deterministic and already centered on the current penalty scale; BFGS/ARC
/// globalizes from there. Low-dimensional classic smooths keep screening
/// because the extra probes are cheap and sometimes useful.
pub(crate) const REML_SEED_SCREENING_RHO_CAP: usize = 4;

/// Element-count threshold used by `faer_frob_inner`: inputs with fewer than
/// this many entries are accumulated with a plain running `f64` sum, while
/// inputs at or above it are accumulated through a `KahanSum`.
const KAHAN_SWITCH_ELEMS: usize = 10_000;

/// Frobenius inner product `⟨A, B⟩ = Σ_ij A[i, j] · B[i, j]` of two matrix views.
///
/// The iteration extent is taken from `a`, and entries are visited column by
/// column (outer loop over columns, inner loop over rows). With fewer than
/// `KAHAN_SWITCH_ELEMS` entries the products are added into a plain `f64`
/// accumulator; at or above that count they are fed to a `KahanSum`.
///
/// `a` and `b` must have the same shape. Because only `a`'s dimensions drive
/// the loops, a `b` that is larger than `a` would otherwise be read on its
/// leading `m × n` block only and yield a plausible-looking but wrong value;
/// the shape agreement is therefore asserted in debug builds.
pub(crate) fn faer_frob_inner(a: MatRef<'_, f64>, b: MatRef<'_, f64>) -> f64 {
    let (m, n) = (a.nrows(), a.ncols());
    debug_assert_eq!(
        (b.nrows(), b.ncols()),
        (m, n),
        "faer_frob_inner: operands must have identical shapes"
    );
    // Saturating multiply: an overflowing element count simply selects the
    // Kahan branch instead of wrapping around to a small number.
    let elem_count = m.saturating_mul(n);
    if elem_count < KAHAN_SWITCH_ELEMS {
        let mut sum = 0.0_f64;
        for j in 0..n {
            for i in 0..m {
                sum += a[(i, j)] * b[(i, j)];
            }
        }
        sum
    } else {
        let mut sum = KahanSum::default();
        for j in 0..n {
            for i in 0..m {
                sum.add(a[(i, j)] * b[(i, j)]);
            }
        }
        sum.sum()
    }
}

/// Sum a stream of `f64` values through a `KahanSum` accumulator.
///
/// Values are added in iteration order. An empty iterator returns whatever a
/// default-constructed `KahanSum` reports as its sum.
pub(crate) fn kahan_sum<I>(iter: I) -> f64
where
    I: IntoIterator<Item = f64>,
{
    let mut total = KahanSum::default();
    iter.into_iter().for_each(|term| {
        total.add(term);
    });
    total.sum()
}

/// Per-column centering/scaling recipe for a set of design-matrix columns.
///
/// Built by [`Self::from_column_indices`]; the stored means and scales drive
/// every forward and back-transform implemented on this type.
#[derive(Clone, Debug)]
pub(crate) struct ParametricColumnConditioning {
    /// Index of the first inspected column that is constant and equal to one
    /// (within tolerance), or `None` when no such column was found.
    pub(crate) intercept_idx: Option<usize>,
    /// One `(column index, mean, scale)` triple per conditioned column. The
    /// scale is the column's population standard deviation; the mean is reset
    /// to `0.0` when no intercept column was found.
    pub(crate) columns: Vec<(usize, f64, f64)>,
}

impl ParametricColumnConditioning {
    /// Build the conditioning recipe for an explicit set of unpenalized columns.
    ///
    /// The requested columns are pulled out of `x` with one batched
    /// `extract_columns` call and then classified one by one:
    ///
    /// * constant column (every entry within `SCALE_EPS` of the first entry):
    ///   never conditioned; the first such column whose value is within
    ///   `SCALE_EPS` of `1.0` is remembered as the intercept;
    /// * non-constant column whose population variance is finite and larger
    ///   than `SCALE_EPS²`: recorded as `(index, mean, standard deviation)`;
    /// * anything else: skipped.
    ///
    /// When no intercept column was found, every recorded mean is reset to
    /// `0.0`, so those columns are only rescaled and never centered. A design
    /// with zero rows yields an inactive (empty) conditioning.
    pub(crate) fn from_column_indices(x: &DesignMatrix, unpenalized_cols: &[usize]) -> Self {
        const SCALE_EPS: f64 = 1e-12;
        let n = x.nrows();
        if n == 0 {
            return Self {
                intercept_idx: None,
                columns: Vec::new(),
            };
        }
        let n_obs = n as f64;
        // One batched extract instead of one request per column: when `x` is a
        // lazy operator (e.g. ReparamOperator) this is a single GEMM rather
        // than `unpenalized_cols.len()` separate matvecs.
        let extracted = x.extract_columns(unpenalized_cols);
        let mut intercept_idx: Option<usize> = None;
        let mut columns: Vec<(usize, f64, f64)> = Vec::new();
        for (pos, &col_idx) in unpenalized_cols.iter().enumerate() {
            let values = extracted.column(pos);
            let head = values[0];
            let is_flat = values.iter().all(|&v| (v - head).abs() <= SCALE_EPS);
            if is_flat {
                // Only the first constant-one column is treated as the intercept.
                if intercept_idx.is_none() && (head - 1.0).abs() <= SCALE_EPS {
                    intercept_idx = Some(col_idx);
                }
                continue;
            }
            let mean = values.iter().copied().sum::<f64>() / n_obs;
            let var = values
                .iter()
                .map(|&v| {
                    let dev = v - mean;
                    dev * dev
                })
                .sum::<f64>()
                / n_obs;
            // Non-finite or numerically vanishing spread: leave the column alone.
            if var.is_finite() && var > SCALE_EPS * SCALE_EPS {
                columns.push((col_idx, mean, var.sqrt()));
            }
        }
        if intercept_idx.is_none() {
            // No intercept to absorb the shift, so centering is disabled.
            columns.iter_mut().for_each(|entry| entry.1 = 0.0);
        }
        Self {
            intercept_idx,
            columns,
        }
    }

    /// Derive the unpenalized column set from the penalty layout and build the
    /// conditioning for it.
    ///
    /// Every index yielded by some `spec.col_range(p)` (with `p = x.ncols()`)
    /// counts as penalized; the remaining indices of `0..p`, in ascending
    /// order, are handed to [`Self::from_column_indices`].
    pub(crate) fn infer_from_penalty_specs(x: &DesignMatrix, specs: &[PenaltySpec]) -> Self {
        let p = x.ncols();
        let mut is_penalized = vec![false; p];
        for j in specs.iter().flat_map(|spec| spec.col_range(p)) {
            is_penalized[j] = true;
        }
        let free_cols: Vec<usize> = is_penalized
            .iter()
            .enumerate()
            .filter(|&(_, &covered)| !covered)
            .map(|(j, _)| j)
            .collect();
        Self::from_column_indices(x, &free_cols)
    }

    /// `true` when at least one column is recorded in `self.columns`, i.e.
    /// there is something to center or scale. The transforms on this type that
    /// consult it hand back their input unchanged when it is `false`.
    pub(crate) fn is_active(&self) -> bool {
        !self.columns.is_empty()
    }

    /// Produce the conditioned view of a design matrix.
    ///
    /// With no conditioned columns this is just a clone of `x`. Otherwise `x`
    /// is wrapped in a `ConditionedDesign` built from a clone of `x` and of the
    /// recorded `(index, mean, scale)` triples, and returned as
    /// `DesignMatrix::Dense`. Per the wrapper's contract the centering and
    /// scaling are applied lazily through matvec algebra rather than by
    /// materializing a transformed matrix.
    pub(crate) fn apply_to_design(&self, x: &DesignMatrix) -> DesignMatrix {
        if self.is_active() {
            let conditioned =
                crate::matrix::ConditionedDesign::new(x.clone(), self.columns.clone());
            let wrapped = crate::matrix::DenseDesignMatrix::from(Arc::new(conditioned));
            DesignMatrix::Dense(wrapped)
        } else {
            x.clone()
        }
    }

    /// Re-express a constraint matrix, written against the original (user-scale)
    /// coefficients, in the conditioned coordinates the solver optimizes.
    ///
    /// Constraints are authored as `A_orig · β_orig {≥,≤} b` (for example a
    /// `linear(x, min, max)` box contributes rows `β_col ≥ min` and
    /// `β_col ≤ max`). The inner solve works with `β_int`, related to the
    /// original coefficients by the back-transform `β_orig = M · β_int` that
    /// [`Self::backtransform_beta`] implements:
    ///
    /// ```text
    ///   β_orig[j]         = β_int[j] / scale_j                         (conditioned col j)
    ///   β_orig[intercept] = β_int[intercept] − Σ_j (mean_j / scale_j) · β_int[j]
    /// ```
    ///
    /// Hence `M[j][j] = 1/scale_j`, `M[intercept][j] = −mean_j/scale_j`, and `M`
    /// is the identity elsewhere. Substituting gives the equivalent internal
    /// constraint `A_int · β_int {≥,≤} b` with `A_int = A_orig · M`, which only
    /// changes the conditioned columns:
    ///
    /// ```text
    ///   A_int[:, j] = (A_orig[:, j] − mean_j · A_orig[:, intercept]) / scale_j
    /// ```
    ///
    /// The right-hand side `b` is untouched, which is why
    /// [`Self::transform_linear_constraints_to_internal`] forwards it verbatim.
    /// The per-column algebra lives in exactly one place,
    /// [`Self::transform_matrix_columnswith_a_inplace`]; this method applies it
    /// to a copy of `a_original`.
    pub(crate) fn transform_constraint_matrix_to_internal(
        &self,
        a_original: &Array2<f64>,
    ) -> Array2<f64> {
        let mut a_internal = a_original.clone();
        self.transform_matrix_columnswith_a_inplace(&mut a_internal);
        a_internal
    }

    /// Map an optional set of linear inequality constraints into the
    /// conditioned coordinates.
    ///
    /// `None` stays `None`. Otherwise the matrix `a` goes through
    /// [`Self::transform_constraint_matrix_to_internal`] and the bound vector
    /// `b` is moved across unchanged.
    pub(crate) fn transform_linear_constraints_to_internal(
        &self,
        constraints: Option<crate::pirls::LinearInequalityConstraints>,
    ) -> Option<crate::pirls::LinearInequalityConstraints> {
        let crate::pirls::LinearInequalityConstraints { a, b } = constraints?;
        Some(crate::pirls::LinearInequalityConstraints {
            a: self.transform_constraint_matrix_to_internal(&a),
            b,
        })
    }

    /// Convert internal (conditioned) coefficients back to the original scale.
    ///
    /// For each conditioned column `j`: `β_orig[j] = β_int[j] / scale_j`, and,
    /// when an intercept is recorded, `β_int[j] · mean_j / scale_j` is
    /// subtracted from the intercept coefficient. All reads come from
    /// `beta_internal`, never from the partially updated output.
    pub(crate) fn backtransform_beta(&self, beta_internal: &Array1<f64>) -> Array1<f64> {
        let mut beta_original = beta_internal.clone();
        for &(col, center, spread) in self.columns.iter() {
            let coef = beta_internal[col];
            if let Some(icpt) = self.intercept_idx {
                beta_original[icpt] -= coef * center / spread;
            }
            beta_original[col] = coef / spread;
        }
        beta_original
    }

    /// Copying variant of [`Self::transform_matrix_columnswith_a_inplace`]:
    /// clones `mat`, conditions the clone's columns, and returns it. The input
    /// is left untouched.
    pub(crate) fn transform_matrix_columnswith_a(&self, mat: &Array2<f64>) -> Array2<f64> {
        let mut conditioned = mat.clone();
        self.transform_matrix_columnswith_a_inplace(&mut conditioned);
        conditioned
    }

    pub(crate) fn transform_matrix_columnswith_a_inplace(&self, mat: &mut Array2<f64>) {
        if !self.is_active() {
            return;
        }
        let intercept_col = self.intercept_idx.map(|idx| mat.column(idx).to_owned());
        for &(j, mean, scale) in &self.columns {
            let mut target = mat.column_mut(j);
            if mean != 0.0
                && let Some(intercept_col) = intercept_col.as_ref()
            {
                target -= &(intercept_col * mean);
            }
            if scale != 1.0 {
                target.mapv_inplace(|v| v / scale);
            }
        }
    }

    /// Compute `M · mat_internal`, where `M` is the coefficient back-transform
    /// `β_orig = M · β_int` (the map [`Self::backtransform_beta`] applies to a
    /// single vector).
    ///
    /// Non-identity entries of `M`:
    /// ```text
    ///   M[intercept, j] = −mean_j / scale_j     (conditioned column j)
    ///   M[j, j]         = 1 / scale_j           (conditioned column j)
    /// ```
    /// so each column of `mat_internal` is treated exactly like a coefficient
    /// vector passed to `backtransform_beta`.
    pub(crate) fn left_multiply_by_m(&self, mat_internal: &Array2<f64>) -> Array2<f64> {
        let mut out = mat_internal.clone();
        if !self.is_active() {
            return out;
        }
        if let Some(icpt) = self.intercept_idx {
            // (M·X)[intercept, :] = X[intercept, :] − Σ_j (mean_j/scale_j) · X[j, :]
            // Row j is read from the untouched input, not from `out`, so the
            // contributions accumulate independently of one another.
            for &(j, mean, scale) in &self.columns {
                if mean == 0.0 {
                    continue;
                }
                let factor = mean / scale;
                let contribution = mat_internal.row(j).mapv(|v| v * factor);
                let mut intercept_row = out.row_mut(icpt);
                intercept_row -= &contribution;
            }
        }
        // (M·X)[j, :] = X[j, :] / scale_j
        for &(j, _, scale) in &self.columns {
            if scale != 1.0 {
                out.row_mut(j).mapv_inplace(|v| v / scale);
            }
        }
        out
    }

    /// Compute `mat_internal · Mᵀ`, the column-wise mirror of
    /// [`Self::left_multiply_by_m`]: the intercept column loses
    /// `(mean_j / scale_j) · column j` for every conditioned `j`, and each
    /// conditioned column is divided by its scale.
    pub(crate) fn right_multiply_by_m_transpose(&self, mat_internal: &Array2<f64>) -> Array2<f64> {
        let mut out = mat_internal.clone();
        if !self.is_active() {
            return out;
        }
        if let Some(icpt) = self.intercept_idx {
            // (X·Mᵀ)[:, intercept] = X[:, intercept] − Σ_j (mean_j/scale_j) · X[:, j]
            // Column j comes from the untouched input, never from `out`.
            for &(j, mean, scale) in &self.columns {
                if mean == 0.0 {
                    continue;
                }
                let factor = mean / scale;
                let contribution = mat_internal.column(j).mapv(|v| v * factor);
                let mut intercept_col = out.column_mut(icpt);
                intercept_col -= &contribution;
            }
        }
        // (X·Mᵀ)[:, j] = X[:, j] / scale_j
        for &(j, _, scale) in &self.columns {
            if scale != 1.0 {
                out.column_mut(j).mapv_inplace(|v| v / scale);
            }
        }
        out
    }

    /// Compute `M⁻ᵀ · mat_internal`. The inverse basis map has the non-identity
    /// entries
    /// ```text
    ///   M⁻¹[intercept, j] = mean_j     (conditioned column j)
    ///   M⁻¹[j, j]         = scale_j    (conditioned column j)
    /// ```
    /// hence `(M⁻ᵀ · X)[j, :] = scale_j · X[j, :] + mean_j · X[intercept, :]`
    /// while the intercept row is left as it is. Without a recorded intercept
    /// only the row scaling is applied.
    pub(crate) fn left_multiply_by_m_inv_transpose(
        &self,
        mat_internal: &Array2<f64>,
    ) -> Array2<f64> {
        let mut out = mat_internal.clone();
        if !self.is_active() {
            return out;
        }
        // View into the untouched input: every row update sees the original
        // intercept row regardless of what has already been written to `out`.
        let intercept_row = self.intercept_idx.map(|idx| mat_internal.row(idx));
        for &(j, mean, scale) in &self.columns {
            let mut row_j = out.row_mut(j);
            if scale != 1.0 {
                row_j.mapv_inplace(|v| v * scale);
            }
            if let Some(base) = intercept_row.as_ref() {
                if mean != 0.0 {
                    row_j += &base.mapv(|v| v * mean);
                }
            }
        }
        out
    }

    /// Compute `mat_internal · M⁻¹`, the column-wise mirror of
    /// [`Self::left_multiply_by_m_inv_transpose`]: each conditioned column `j`
    /// becomes `scale_j · X[:, j] + mean_j · X[:, intercept]`; without a
    /// recorded intercept only the scaling is applied.
    pub(crate) fn right_multiply_by_m_inv(&self, mat_internal: &Array2<f64>) -> Array2<f64> {
        let mut out = mat_internal.clone();
        if !self.is_active() {
            return out;
        }
        // View into the untouched input so every column update uses the
        // original intercept column.
        let intercept_col = self.intercept_idx.map(|idx| mat_internal.column(idx));
        for &(j, mean, scale) in &self.columns {
            let mut col_j = out.column_mut(j);
            if scale != 1.0 {
                col_j.mapv_inplace(|v| v * scale);
            }
            if let Some(base) = intercept_col.as_ref() {
                if mean != 0.0 {
                    col_j += &base.mapv(|v| v * mean);
                }
            }
        }
        out
    }

    /// Back-transform a coefficient covariance: `Cov(β_orig) = M · Cov(β_int) · Mᵀ`.
    ///
    /// Because `β_orig = M · β_int`, the correct congruence is `M · Σ · Mᵀ` and
    /// not `Mᵀ · Σ · M`. The latter (the prior implementation) silently swapped
    /// the variance of every conditioned parametric column with the variance of
    /// the intercept, off by exactly the basis change the intercept absorbs when
    /// columns are centered.
    pub(crate) fn backtransform_covariance(&self, cov_internal: &Array2<f64>) -> Array2<f64> {
        // Inner call forms Σ · Mᵀ, outer call applies M from the left.
        self.left_multiply_by_m(&self.right_multiply_by_m_transpose(cov_internal))
    }

    /// Back-transform a penalized Hessian: `H_orig = M⁻ᵀ · H_int · M⁻¹`.
    ///
    /// From `L_int(β_int) = L_orig(M · β_int)` the chain rule gives
    /// `H_int = Mᵀ · H_orig · M`, and solving for `H_orig` yields the formula
    /// above. The prior implementation multiplied the intercept entry of `M⁻¹`
    /// by `scale_j`, silently scaling the Hessian by `scale_j²` along every
    /// conditioned column whenever scaling (not just centering) was active.
    pub(crate) fn backtransform_penalized_hessian(&self, h_internal: &Array2<f64>) -> Array2<f64> {
        // Inner call forms H · M⁻¹, outer call applies M⁻ᵀ from the left.
        self.left_multiply_by_m_inv_transpose(&self.right_multiply_by_m_inv(h_internal))
    }

    /// Move an optimizer result from the conditioned coefficient basis back to
    /// the original one.
    ///
    /// Returns `result` untouched when no column is conditioned. Otherwise:
    ///
    /// * `beta` and `bias_correction_beta` go through [`Self::backtransform_beta`];
    /// * `penalized_hessian` goes through [`Self::backtransform_penalized_hessian`];
    /// * `beta_covariance`, `beta_covariance_corrected`,
    ///   `beta_covariance_frequentist` and `smoothing_correction` go through
    ///   [`Self::backtransform_covariance`];
    /// * both standard-error vectors are recomputed with `se_from_covariance`
    ///   from the covariances that were just back-transformed;
    /// * `coefficient_influence`, `weighted_gram`, `reparam_qs` and
    ///   `constraint_kkt` are cleared;
    /// * `result.artifacts.pirls` is kept as is.
    ///
    /// The inference fields are only touched when `result.inference` is `Some`.
    pub(crate) fn backtransform_external_result(
        &self,
        mut result: ExternalOptimResult,
    ) -> ExternalOptimResult {
        if !self.is_active() {
            return result;
        }
        result.beta = self.backtransform_beta(&result.beta);
        if let Some(inf) = result.inference.as_mut() {
            inf.penalized_hessian = self
                .backtransform_penalized_hessian(inf.penalized_hessian.as_array())
                .into();
            inf.beta_covariance = inf
                .beta_covariance
                .take()
                .map(|cov| self.backtransform_covariance(cov.as_array()).into());
            // Order matters: the standard errors are derived from the covariance
            // that was back-transformed on the statement above.
            inf.beta_standard_errors = inf
                .beta_covariance
                .as_ref()
                .map(|c| se_from_covariance(c.as_array()));
            inf.beta_covariance_corrected = inf
                .beta_covariance_corrected
                .take()
                .map(|cov| self.backtransform_covariance(&cov));
            // Same ordering requirement for the corrected covariance.
            inf.beta_standard_errors_corrected = inf
                .beta_covariance_corrected
                .as_ref()
                .map(se_from_covariance);
            inf.beta_covariance_frequentist = inf
                .beta_covariance_frequentist
                .take()
                .map(|cov| self.backtransform_covariance(&cov));
            // The influence matrix is a mixed linear operator, not a covariance
            // or Hessian. Drop it across column-conditioning transforms rather
            // than applying the wrong congruence map.
            inf.coefficient_influence = None;
            // X'WX is a congruence object under column-conditioning transforms;
            // its companion `F` is dropped here, so drop the stored Gram too and
            // let the WPS correction fall back to the conditional EDF rather than
            // applying a mismatched congruence map.
            inf.weighted_gram = None;
            inf.bias_correction_beta = inf
                .bias_correction_beta
                .take()
                .map(|b| self.backtransform_beta(&b));
            inf.smoothing_correction = inf
                .smoothing_correction
                .take()
                .map(|cov| self.backtransform_covariance(&cov));
            // NOTE(review): presumably cleared because the stored matrix refers
            // to the internal basis — confirm against the consumers of `reparam_qs`.
            inf.reparam_qs = None;
        }
        // NOTE(review): cleared without a transform; presumably the KKT record is
        // only meaningful in the internal coordinates — confirm.
        result.constraint_kkt = None;
        // `result.artifacts.pirls` is a self-consistent geometric bundle in the
        // PIRLS internal basis (`x_transformed`, `beta_transformed`,
        // `penalized_hessian_transformed`, and the per-observation
        // `final_eta`/`finalmu`/`solveworking_response`/weights, all paired in
        // that one frame). Observation-space quantities derived from it
        // — η̂_i, leverages a_ii, sandwich SEs — are invariant under the
        // invertible coefficient-space reparameterization that conditioning
        // introduces, so the bundle stays correct in its own coordinates and
        // we keep it instead of wiping `pirls: None`.
        result
    }
}

/// Express the PIRLS penalized Hessian in the original coefficient basis.
///
/// The intended result is `Qs · H_transformed · Qsᵀ`, with `Qs` taken from
/// `pirls.reparam_result.qs`. The first product is delegated to
/// `left_dot_matrix` (which avoids densifying sparse Hessians); the second is a
/// dense `dot` with `Qsᵀ`. The result is symmetrized in place before it is
/// returned. As written this function always returns `Ok`.
pub(crate) fn map_hessian_to_original_basis(
    pirls: &crate::pirls::PirlsResult,
) -> Result<Array2<f64>, EstimationError> {
    let qs = &pirls.reparam_result.qs;
    let mut h_original = pirls
        .penalized_hessian_transformed
        .left_dot_matrix(qs)
        .dot(&qs.t());
    // Two non-self-adjoint matmuls accumulate ~p · ε rounding noise, so the
    // product is not bitwise symmetric even when `H_transformed` is. Averaging
    // opposite entries keeps downstream `validate_dense_hessian_export` from
    // rejecting otherwise-valid fits over rounding-noise asymmetry.
    crate::families::custom_family::symmetrize_dense_in_place(&mut h_original);
    Ok(h_original)
}


/// Multiply a posterior covariance `H^{-1}` by the coefficient-covariance
/// scale: `Vb = H^{-1} * phi`.
///
/// The multiplier comes from `GlmLikelihoodSpec::coefficient_covariance_scale`.
/// It is the profiled residual variance `sigma^2` for the scale-free profiled
/// Gaussian and `1.0` for every family whose IRLS working weight already
/// carries the dispersion / full Fisher information (Gamma, Tweedie, Beta,
/// Negative-Binomial, and the fixed-scale Poisson/Binomial). For those
/// families the stored `H = X'WX + S_λ` is already the true penalized Hessian,
/// so multiplying again would double-count the dispersion (#679).
///
/// When `phi` is within `f64::EPSILON` of `1.0` the input is returned as is,
/// without touching its entries. Keeping the scaling in this one function
/// makes the contract visible at every covariance construction site instead
/// of a bare `cov * scale`.
#[inline]
pub(crate) fn scaled_covariance(cov: Array2<f64>, phi: f64) -> Array2<f64> {
    let is_unit_scale = (phi - 1.0).abs() <= f64::EPSILON;
    if is_unit_scale {
        return cov;
    }
    cov * phi
}