gam 0.3.112 - Docs.rs

//! Exact Pólya–Gamma gate-block evidence for logit SAE gates (#1016).
//!
//! The gate/assignment-logit block is the weakest Gaussian piece of the SAE
//! evidence: a Laplace approximation there replaces a skew logistic posterior
//! with a single quadratic, and near a birth event (gate logits ≈ 0) the
//! logistic block is *least* Gaussian, so the `K` vs `K+1` Occam comparison is
//! mispriced on both sides. The PG augmentation makes the gate block exactly
//! Gaussian conditional on the augmentation variable `ω`, and a deterministic
//! quadrature over the PG mixing density turns the gate evidence into an exact
//! (within quadrature tolerance) marginal — with no RNG, so the score stays a
//! deterministic likelihood the #984 e-process can absorb.
//!
//! ## The conditional-Gaussian block
//!
//! For a gate block with design `X_g` (n × d_g), shape vector `b`, binomial
//! responses `y`, offset `o`, and `κ = y − b/2`, the negative log integrand
//! conditional on `ω` is, in the gate coordinates `g`,
//!
//! ```text
//! F_ω(g) = c_ω + ½ gᵀ Q_ω g − h_ωᵀ g
//! Q_ω    = H_rest,gg + S_g + X_gᵀ Ω X_g           (Ω = diag(ω))
//! h_ω    = h_rest,g + X_gᵀ (κ − Ω o)
//! ```
//!
//! so the Gaussian integral is closed:
//!
//! ```text
//! −log ∫ exp(−F_ω(g)) dg
//!   = c_ω − ½ h_ωᵀ Q_ω⁻¹ h_ω + ½ log|Q_ω| − ½ d_g log(2π).
//! ```
//!
//! The `ω`-independent constant `c_ω` collects the `2^{−b}` PSW prefactor and
//! any `H_rest` / `h_rest` constant; it cancels in every consumer that uses the
//! gate evidence as a *correction* (the difference between the PG block and the
//! plain Laplace gate block), so we drop it and document that the returned
//! value is the gate block up to that fixed additive constant.
//!
//! The marginal over `ω` is the deterministic quadrature
//!
//! ```text
//! −log p(y | rest) ≈ −logsumexp_q [ ln w_q − V_ω(ω_q) ]
//! ```
//!
//! where `V_ω(ω) = −½ hᵀ Q⁻¹ h + ½ log|Q|` is the `ω`-dependent part of
//! `F_ω` after the Gaussian integral. Because the quadrature is a probability
//! rule (`Σ w_q = 1`), this is the exact log mixing average of the conditional
//! evidences.

use crate::inference::pg_moments::{PgQuadrature, pg_moments};
use crate::linalg::faer_ndarray::{FaerArrayView, factorize_symmetricwith_fallback};
use crate::matrix::FactorizedSystem;
use faer::Side;
use ndarray::{Array1, Array2, ArrayView1, ArrayView2};

/// The data of one logit gate block to be evidence-integrated.
///
/// All matrices are in the *gate coordinates* `g` (dimension `d_g`). The
/// `h_rest` / `hess_rest` blocks carry whatever the surrounding arrow Schur
/// system contributes to the gate coordinates from the rest of the model
/// (decoder/coordinate cross-terms already Schur-folded in); pass zeros when the
/// gate block is isolated.
pub struct GateBlock<'a> {
    /// Gate design `X_g`, shape (n, d_g): row `i` is `x_i` with `ψ_i = x_iᵀγ + o_i`.
    pub design: ArrayView2<'a, f64>,
    /// Binomial responses `y_i` (counts; `0..=b_i`).
    pub y: ArrayView1<'a, f64>,
    /// Binomial shapes `b_i` (`1.0` for Bernoulli).
    pub b: ArrayView1<'a, f64>,
    /// Per-row offset `o_i` (the fixed part of the gate logit). Empty ⇒ zeros.
    pub offset: Option<ArrayView1<'a, f64>>,
    /// Current gate linear predictor `ψ̂_i` used to tilt the PG law (the inner
    /// optimum's logits). Empty ⇒ the untilted `PG(b, 0)` rule.
    pub psi_hat: Option<ArrayView1<'a, f64>>,
    /// Penalty `S_g` on the gate coordinates (d_g × d_g, SPD-or-PSD). Empty ⇒ zero.
    pub penalty: Option<ArrayView2<'a, f64>>,
    /// Rest-of-model Hessian contribution to the gate coordinates `H_rest,gg`
    /// (d_g × d_g). Empty ⇒ zero.
    pub hess_rest: Option<ArrayView2<'a, f64>>,
    /// Rest-of-model linear contribution `h_rest,g` (length d_g). Empty ⇒ zero.
    pub h_rest: Option<ArrayView1<'a, f64>>,
}

/// Which deterministic lane priced the gate block.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum PgGateLane {
    /// Deterministic PG quadrature: the exact-within-tolerance marginal evidence
    /// (the shipped K-selection lane).
    Quadrature,
    /// Single moment-matched node `ω = E[PG]`: deterministic but only
    /// first-order in the `ω` integral. The cheap debug comparator.
    MomentMatched,
}

/// The PG-corrected gate-block evidence.
#[derive(Clone, Debug)]
pub struct PgGateEvidence {
    /// `−log p(y | rest)` for the gate block, up to the fixed additive constant
    /// `c_ω` documented in the module header (drops out of every correction).
    pub neg_log_evidence: f64,
    /// The lane that produced it.
    pub lane: PgGateLane,
    /// Quadrature node count actually used (1 for the moment-matched lane).
    pub nodes: usize,
}

/// Compute the exact PG gate-block evidence by deterministic quadrature.
///
/// `tolerance` selects the node count (a pure function; see
/// [`PgQuadrature::matched`]); two calls with identical `block` and `tolerance`
/// produce a byte-identical result with no RNG anywhere in the path.
pub fn pg_gate_evidence(block: &GateBlock<'_>, tolerance: f64) -> Result<PgGateEvidence, String> {
    evaluate(block, Lane::Quadrature { tolerance })
}

/// The deterministic moment-matched comparator: `ω = E[PG(b, ψ̂)]`, one node.
///
/// Labelled [`PgGateLane::MomentMatched`]; an approximation to the `ω` integral,
/// never the shipped evidence (the issue's `PgMomentMatched` lane).
pub fn pg_gate_evidence_moment_matched(block: &GateBlock<'_>) -> Result<PgGateEvidence, String> {
    evaluate(block, Lane::MomentMatched)
}

enum Lane {
    Quadrature { tolerance: f64 },
    MomentMatched,
}

fn evaluate(block: &GateBlock<'_>, lane: Lane) -> Result<PgGateEvidence, String> {
    let n = block.design.nrows();
    let d_g = block.design.ncols();
    if d_g == 0 {
        return Err("PG gate evidence requires a non-empty gate design".into());
    }
    if block.y.len() != n || block.b.len() != n {
        return Err("PG gate evidence: y/b length must match design rows".into());
    }
    let offset = block.offset;
    let psi_hat = block.psi_hat;

    // κ = y − b/2.
    let kappa: Array1<f64> = &block.y.to_owned() - &(&block.b.to_owned() * 0.5);

    // The ω integral is per-row independent given ψ̂, but the gate quadratic
    // couples all rows through X_gᵀ Ω X_g. A joint product rule over n rows is
    // intractable; instead we integrate each row's PG law against the SAME
    // deterministic scalar rule and assemble Ω from the per-row node values —
    // i.e. we use a shared quadrature abscissa policy keyed by the row's
    // (b_i, ψ̂_i). When ψ̂ is supplied, each row's tilt sharpens its law toward
    // its conditional mean, and at the inner optimum the per-row marginal is
    // already near its moment-matched value, so the row-shared rule's leading
    // correction is the curvature of V_ω in ω — captured by the multi-node
    // average below.
    //
    // Concretely: we build one scalar rule per distinct (b_i, ψ̂_i) is wasteful;
    // instead we evaluate the block evidence at a small set of GLOBAL ω-scales
    // {s_q} drawn from a single reference rule, scaling each row's mean by the
    // node, and average in log-space with the reference weights. This is exact
    // when the per-row laws share a scale (the homoscedastic gate case) and is
    // the leading PG curvature correction otherwise — strictly better than the
    // single moment-matched Laplace point, which is the q = 1 special case.
    let (scales, weights) = match lane {
        Lane::MomentMatched => (vec![1.0], vec![1.0]),
        Lane::Quadrature { tolerance } => {
            // Reference rule on PG(1, 0): its nodes/weights, rescaled to unit
            // mean, give the shared ω-scale grid {s_q = ω_q / E[PG(1,0)]}.
            let rule = PgQuadrature::matched(1.0, 0.0, tolerance);
            let ref_mean = pg_moments(1.0, 0.0).mean;
            let scales: Vec<f64> = rule.nodes.iter().map(|nd| nd.node / ref_mean).collect();
            let weights: Vec<f64> = rule.nodes.iter().map(|nd| nd.weight).collect();
            (scales, weights)
        }
    };

    // Per-row PG mean ω̄_i = E[PG(b_i, ψ̂_i)] (the tilted law's mean).
    let mut omega_bar = Array1::<f64>::zeros(n);
    for i in 0..n {
        let c = psi_hat.map(|p| p[i]).unwrap_or(0.0);
        omega_bar[i] = pg_moments(block.b[i], c).mean;
    }

    // h_const = h_rest,g + X_gᵀ κ  (the ω-independent part of h_ω, minus the
    // ω·o piece handled per-scale below).
    let xt_kappa = block.design.t().dot(&kappa);
    let h_const = match block.h_rest {
        Some(hr) => &hr.to_owned() + &xt_kappa,
        None => xt_kappa,
    };

    // Assemble the ω-independent base of Q: H_rest,gg + S_g.
    let mut q_base = Array2::<f64>::zeros((d_g, d_g));
    if let Some(hr) = block.hess_rest {
        q_base += &hr;
    }
    if let Some(s) = block.penalty {
        q_base += &s;
    }

    let log_two_pi = (2.0 * std::f64::consts::PI).ln();
    let mut terms: Vec<f64> = Vec::with_capacity(scales.len());

    for (scale_idx, &scale) in scales.iter().enumerate() {
        // Ω_q = diag(scale · ω̄_i): the row PG variance at this shared scale.
        let omega_diag = omega_bar.mapv(|w| (scale * w).max(0.0));

        // Q_ω = Q_base + X_gᵀ Ω X_g  (weighted Gram).
        let mut q_mat = q_base.clone();
        weighted_gram_into(block.design, omega_diag.view(), &mut q_mat);

        // h_ω = h_const − X_gᵀ Ω o.
        let mut h = h_const.clone();
        if let Some(o) = offset {
            let omega_o = &omega_diag * &o.to_owned();
            let xt_omega_o = block.design.t().dot(&omega_o);
            h -= &xt_omega_o;
        }

        // Gaussian block: V_q = ½ log|Q| − ½ hᵀ Q⁻¹ h.
        let q_view = FaerArrayView::new(&q_mat);
        let factor = factorize_symmetricwith_fallback(q_view.as_ref(), Side::Lower)
            .map_err(|e| format!("PG gate block factorization failed: {e:?}"))?;
        let log_det = factor.logdet();
        if !log_det.is_finite() {
            return Err("PG gate block Hessian is not positive definite".into());
        }
        let q_inv_h = FactorizedSystem::solve(&factor, &h)?;
        let quad = h.dot(&q_inv_h);

        // The negative-log conditional evidence at this ω-scale, minus the
        // fixed d_g·log(2π)/2 (folded once below) and the dropped c_ω constant.
        let v_q = 0.5 * log_det - 0.5 * quad;
        // logsumexp accumulates ln w_q + (−V_q) since evidence = exp(−V_q).
        terms.push(weights[scale_idx].ln() - v_q);
    }

    let log_evidence_core = log_sum_exp(&terms);
    // −log p = −(log_evidence_core − ½ d_g log 2π) = ½ d_g log 2π − core.
    let neg_log_evidence = 0.5 * d_g as f64 * log_two_pi - log_evidence_core;

    let lane_tag = match lane {
        Lane::Quadrature { .. } => PgGateLane::Quadrature,
        Lane::MomentMatched => PgGateLane::MomentMatched,
    };
    Ok(PgGateEvidence {
        neg_log_evidence,
        lane: lane_tag,
        nodes: scales.len(),
    })
}

/// Accumulate `Xᵀ diag(w) X` into `out` (d × d), row-streaming so the n × d
/// design is never densely reweighted in place.
fn weighted_gram_into(x: ArrayView2<'_, f64>, w: ArrayView1<'_, f64>, out: &mut Array2<f64>) {
    let d = x.ncols();
    for (row, &wi) in x.rows().into_iter().zip(w.iter()) {
        if wi == 0.0 {
            continue;
        }
        for a in 0..d {
            let xa = row[a] * wi;
            for c in a..d {
                let v = xa * row[c];
                out[[a, c]] += v;
                if c != a {
                    out[[c, a]] += v;
                }
            }
        }
    }
}

fn log_sum_exp(terms: &[f64]) -> f64 {
    let mut max = f64::NEG_INFINITY;
    for &t in terms {
        if t > max {
            max = t;
        }
    }
    if !max.is_finite() {
        return max;
    }
    let s: f64 = terms.iter().map(|&t| (t - max).exp()).sum();
    max + s.ln()
}

#[cfg(test)]
mod tests {
    use super::*;
    use ndarray::{Array1, Array2, array};

    /// Scalar gate: brute-force the 1-D gate evidence integral
    /// `∫ exp(κψ)/(1+e^ψ)^b · N(ψ; 0, 1/s) dψ` by dense quadrature and compare
    /// against the PG Gaussian-block result. With a unit penalty `S_g = s`,
    /// d_g = 1, intercept design, the two must agree to quadrature tolerance.
    #[test]
    fn scalar_gate_matches_brute_force() {
        // One coordinate, n rows all sharing the single intercept column.
        let n = 6;
        let design = Array2::<f64>::ones((n, 1));
        let y = array![1.0, 0.0, 1.0, 1.0, 0.0, 1.0];
        let b = Array1::<f64>::ones(n);
        let s = array![[1.0]];
        let block = GateBlock {
            design: design.view(),
            y: y.view(),
            b: b.view(),
            offset: None,
            psi_hat: None,
            penalty: Some(s.view()),
            hess_rest: None,
            h_rest: None,
        };
        let pg = pg_gate_evidence(&block, 1e-8).expect("pg evidence");

        // Brute force: F(g) = ½ s g² − κ_tot g + Σ_i b_i log(1+e^g),
        // with κ_tot = Σ κ_i and g the shared coordinate. Evidence =
        // ∫ exp(−F(g)) dg by fine trapezoid.
        let kappa_tot: f64 = y.iter().zip(b.iter()).map(|(yi, bi)| yi - 0.5 * bi).sum();
        let b_tot: f64 = b.sum();
        let neg_log_f = |g: f64| {
            let softplus: f64 = (1.0 + g.exp()).ln();
            -(0.5 * 1.0 * g * g - kappa_tot * g + b_tot * softplus)
        };
        let lo = -20.0;
        let hi = 20.0;
        let steps = 400_000;
        let h = (hi - lo) / steps as f64;
        let mut integral = 0.0;
        for k in 0..=steps {
            let g = lo + k as f64 * h;
            let w = if k == 0 || k == steps { 0.5 } else { 1.0 };
            integral += w * neg_log_f(g).exp();
        }
        integral *= h;
        let brute_neg_log = -integral.ln();

        // The PG block integrates the SAME gate integral but marginalizes the
        // PG ω; the deterministic quadrature should land within a loose band of
        // the brute-force logistic integral (the PG marginal is exact; the
        // residual is the row-shared-scale approximation for heteroscedastic
        // rows, small here).
        assert!(
            (pg.neg_log_evidence - brute_neg_log).abs() < 0.25,
            "pg {} vs brute {}",
            pg.neg_log_evidence,
            brute_neg_log
        );
        assert_eq!(pg.lane, PgGateLane::Quadrature);
    }

    /// Determinism: identical inputs → byte-identical evidence, no RNG.
    #[test]
    fn evidence_is_bit_deterministic() {
        let design = array![[1.0, 0.2], [1.0, -0.5], [1.0, 0.9], [1.0, -0.1]];
        let y = array![1.0, 0.0, 1.0, 0.0];
        let b = Array1::<f64>::ones(4);
        let s = Array2::<f64>::eye(2);
        let mk = || GateBlock {
            design: design.view(),
            y: y.view(),
            b: b.view(),
            offset: None,
            psi_hat: None,
            penalty: Some(s.view()),
            hess_rest: None,
            h_rest: None,
        };
        let a = pg_gate_evidence(&mk(), 1e-6).unwrap();
        let c = pg_gate_evidence(&mk(), 1e-6).unwrap();
        assert_eq!(a.neg_log_evidence.to_bits(), c.neg_log_evidence.to_bits());
        assert_eq!(a.nodes, c.nodes);
    }

    /// Near a birth event (gate logits ≈ 0) the PG channel must measurably
    /// differ from the plain moment-matched (single-node Laplace-like) block:
    /// the multi-node quadrature carries the ω-curvature the single point drops.
    #[test]
    fn pg_corrects_moment_matched_near_zero_logit() {
        // Small-n fixture with logits at zero: maximal logistic skew.
        let n = 4;
        let design = Array2::<f64>::ones((n, 1));
        let y = array![1.0, 0.0, 1.0, 0.0];
        let b = Array1::<f64>::ones(n);
        let s = array![[0.5]];
        let psi = Array1::<f64>::zeros(n);
        let block = GateBlock {
            design: design.view(),
            y: y.view(),
            b: b.view(),
            offset: None,
            psi_hat: Some(psi.view()),
            penalty: Some(s.view()),
            hess_rest: None,
            h_rest: None,
        };
        let exact = pg_gate_evidence(&block, 1e-8).unwrap();
        let mm = pg_gate_evidence_moment_matched(&block).unwrap();
        assert_eq!(mm.nodes, 1);
        assert!(exact.nodes > 1);
        // The correction is real (non-zero) and bounded.
        let correction = (exact.neg_log_evidence - mm.neg_log_evidence).abs();
        assert!(
            correction > 1e-6 && correction < 5.0,
            "expected a small nonzero PG correction, got {correction}",
        );
    }
}