ndarray-glm 0.1.0

//! Integration tests for Exponential, Gamma, and InvGaussian regression against R's glm(). This
//! source file was mostly generated by Claude Code.
//!
//! All families use the Log link (non-canonical) to avoid the sign-convention difference
//! between our NegRec/NegRecSq canonical links and R's conventions.
//!
//! Quantities compared against R:
//!   - Coefficients, deviance, null deviance
//!   - Response, working, Pearson, deviance residuals
//!   - Partial residuals (skipped when var weights + intercept)
//!   - Leverage (skipped for frequency-weighted scenarios)
//!   - LR omnibus p-value (chi-squared, stats feature only)
//!   - Prediction on held-out data
//!
//! Quantities NOT compared (differ between R and this library):
//!   - AIC/BIC: log_like_sat normalization differs from R's convention
//!   - Dispersion: R uses Pearson estimator; we use deviance-based estimator
//!   - Covariance, Wald z, standardized residuals, Cook's: all depend on dispersion

use anyhow::Result;
use approx::assert_abs_diff_eq;
use ndarray::{Array, Array1, Array2, array};
use ndarray_glm::{
    Exponential, Gamma, InvGaussian, ModelBuilder, exp_link, gamma_link, inv_gauss_link,
};
mod common;
use common::{array_from_csv, load_responses_data};

/// Fixed held-out test observations for prediction (3 rows, not in training set).
/// Matches x_test in responses.R.
fn pred_test_x() -> Array2<f64> {
    array![[0.5, 0.3], [-1.0, -0.5], [2.0, 1.2]]
}

/// Check a single scenario against R's reference CSVs.
///
/// `eps` is the base tolerance for coefficients, deviance, and residuals.
/// A relaxed tolerance of 10×eps is used for working residuals, leverage, and partial residuals,
/// which involve more floating-point operations.
///
/// Gating rules:
/// - `resid_pear`, `resid_dev`, `leverage`: pass `None` for frequency-weighted scenarios (R
///   expands rows; we work with the original n observations).
/// - `resid_part`: pass `None` when variance weights and an intercept are both present (our
///   centering uses fully-weighted column means; R uses only frequency weights).
/// - `pvalue_lr`: pass `None` to skip (used when stats feature is disabled).
#[allow(clippy::too_many_arguments)]
fn check_responses_scenario(
    dir: &str,
    eps: f64,
    result: &Array1<f64>,
    deviance: f64,
    lr_stat: f64, // null_deviance - deviance
    predict_resp: Array1<f64>,
    resid_resp: Array1<f64>,
    resid_work: Array1<f64>,
    resid_pear: Option<&Array1<f64>>,
    resid_dev: Option<Array1<f64>>,
    leverage: Option<Array1<f64>>,
    resid_part: Option<Array2<f64>>,
    pvalue_lr: Option<f64>,
) -> Result<()> {
    let r =
        |name: &str| array_from_csv::<f64>(&format!("tests/R/responses_results/{dir}/{name}.csv"));
    let cov_eps = 10. * eps;

    // --- Coefficients ---
    let r_coef = r("coefficients")?;
    assert_abs_diff_eq!(result, &r_coef, epsilon = eps);

    // --- Deviance ---
    let r_dev = r("deviance")?[0];
    assert_abs_diff_eq!(deviance, r_dev, epsilon = eps);

    // --- Null deviance ---
    let r_null_dev = r("null_deviance")?[0];
    assert_abs_diff_eq!(lr_stat + deviance, r_null_dev, epsilon = cov_eps);

    // --- Prediction ---
    let r_pred = r("predict_resp")?;
    assert_abs_diff_eq!(&predict_resp, &r_pred, epsilon = eps);

    // --- Response residuals ---
    let r_resid_resp = r("resid_resp")?;
    assert_abs_diff_eq!(&resid_resp, &r_resid_resp, epsilon = eps);

    // --- Working residuals ---
    let r_resid_work = r("resid_work")?;
    assert_abs_diff_eq!(&resid_work, &r_resid_work, epsilon = cov_eps);

    // --- Pearson residuals (not freq-weighted) ---
    if let Some(pear) = resid_pear {
        let r_resid_pear = r("resid_pear")?;
        assert_abs_diff_eq!(pear, &r_resid_pear, epsilon = eps);
    }

    // --- Deviance residuals (not freq-weighted) ---
    if let Some(dev) = resid_dev {
        let r_resid_dev = r("resid_dev")?;
        assert_abs_diff_eq!(&dev, &r_resid_dev, epsilon = eps);
    }

    // --- Leverage (not freq-weighted) ---
    if let Some(lev) = leverage {
        let r_lev = r("leverage")?;
        assert_abs_diff_eq!(&lev, &r_lev, epsilon = cov_eps);
    }

    // --- Partial residuals (not var-weighted with intercept) ---
    if let Some(part) = resid_part {
        let r_part_flat = r("resid_partial")?;
        let n_feat = part.ncols();
        let r_part = Array::from_shape_vec(
            (part.nrows(), n_feat),
            r_part_flat.into_raw_vec_and_offset().0,
        )?;
        assert_abs_diff_eq!(&part, &r_part, epsilon = cov_eps);
    }

    // --- LR omnibus p-value (requires stats feature) ---
    if let Some(p) = pvalue_lr {
        let r_lr_p = r("pvalue_lr_test")?[0];
        assert_abs_diff_eq!(p, r_lr_p, epsilon = eps);
    }

    Ok(())
}

// ============================================================================
// Exponential scenarios
// ============================================================================

/// Exponential regression (Log link), no weights.
/// R reference: Gamma(link="log") on the same data; coefficients and deviance are identical
/// since Exponential and Gamma share the same log-partition and variance function.
#[test]
fn responses_exp_none() -> Result<()> {
    let (y_exp, _, _, x, _, _) = load_responses_data()?;
    let model = ModelBuilder::<Exponential<exp_link::Log>>::data(&y_exp, &x).build()?;
    let fit = model.fit()?;
    let pred_x = pred_test_x();
    let _loo = fit.loo_exact()?;
    let _infl = fit.infl_coef()?;
    #[cfg(feature = "stats")]
    let pvalue_lr = Some(fit.pvalue_lr_test());
    #[cfg(not(feature = "stats"))]
    let pvalue_lr: Option<f64> = None;
    check_responses_scenario(
        "exp_none",
        1e-6,
        &fit.result,
        fit.deviance(),
        fit.lr_test(),
        fit.predict(&pred_x, None),
        fit.resid_resp(),
        fit.resid_work(),
        Some(fit.resid_pear()),
        Some(fit.resid_dev()),
        Some(fit.leverage()?),
        Some(fit.resid_part()),
        pvalue_lr,
    )
}

// ============================================================================
// Gamma scenarios
// ============================================================================

/// Gamma regression (Log link), no weights.
#[test]
fn responses_gamma_none() -> Result<()> {
    let (_, y_gamma, _, x, _, _) = load_responses_data()?;
    let model = ModelBuilder::<Gamma<gamma_link::Log>>::data(&y_gamma, &x).build()?;
    let fit = model.fit()?;
    let pred_x = pred_test_x();
    let _loo = fit.loo_exact()?;
    let _infl = fit.infl_coef()?;
    #[cfg(feature = "stats")]
    let pvalue_lr = Some(fit.pvalue_lr_test());
    #[cfg(not(feature = "stats"))]
    let pvalue_lr: Option<f64> = None;
    check_responses_scenario(
        "gamma_none",
        1e-7,
        &fit.result,
        fit.deviance(),
        fit.lr_test(),
        fit.predict(&pred_x, None),
        fit.resid_resp(),
        fit.resid_work(),
        Some(fit.resid_pear()),
        Some(fit.resid_dev()),
        Some(fit.leverage()?),
        Some(fit.resid_part()),
        pvalue_lr,
    )
}

/// Gamma regression (Log link), variance weights.
/// Partial residuals are skipped: our centering uses fully-weighted means; R uses only freq weights.
#[test]
fn responses_gamma_var_weights() -> Result<()> {
    let (_, y_gamma, _, x, var_wt, _) = load_responses_data()?;
    let model = ModelBuilder::<Gamma<gamma_link::Log>>::data(&y_gamma, &x)
        .var_weights(var_wt)
        .build()?;
    let fit = model.fit()?;
    let pred_x = pred_test_x();
    let _loo = fit.loo_exact()?;
    #[cfg(feature = "stats")]
    let pvalue_lr = Some(fit.pvalue_lr_test());
    #[cfg(not(feature = "stats"))]
    let pvalue_lr: Option<f64> = None;
    check_responses_scenario(
        "gamma_var",
        1e-7,
        &fit.result,
        fit.deviance(),
        fit.lr_test(),
        fit.predict(&pred_x, None),
        fit.resid_resp(),
        fit.resid_work(),
        Some(fit.resid_pear()),
        Some(fit.resid_dev()),
        Some(fit.leverage()?),
        None, // skip partial: var weights + intercept
        pvalue_lr,
    )
}

/// Gamma regression (Log link), frequency weights.
/// Per-observation quantities (Pearson, deviance residuals, leverage) are skipped: R expands
/// rows while we work with the original n observations.
#[test]
fn responses_gamma_freq_weights() -> Result<()> {
    let (_, y_gamma, _, x, _, freq_wt) = load_responses_data()?;
    let model = ModelBuilder::<Gamma<gamma_link::Log>>::data(&y_gamma, &x)
        .freq_weights(freq_wt)
        .build()?;
    let fit = model.fit()?;
    let pred_x = pred_test_x();
    #[cfg(feature = "stats")]
    let pvalue_lr = Some(fit.pvalue_lr_test());
    #[cfg(not(feature = "stats"))]
    let pvalue_lr: Option<f64> = None;
    check_responses_scenario(
        "gamma_freq",
        1e-7,
        &fit.result,
        fit.deviance(),
        fit.lr_test(),
        fit.predict(&pred_x, None),
        fit.resid_resp(),
        fit.resid_work(),
        None, // skip: freq-weighted
        None, // skip: freq-weighted
        None, // skip: freq-weighted
        Some(fit.resid_part()),
        pvalue_lr,
    )
}

// ============================================================================
// InvGaussian scenarios
// ============================================================================

/// Inverse Gaussian regression (Log link), no weights.
#[test]
fn responses_ig_none() -> Result<()> {
    let (_, _, y_ig, x, _, _) = load_responses_data()?;
    let model = ModelBuilder::<InvGaussian<inv_gauss_link::Log>>::data(&y_ig, &x).build()?;
    let fit = model.fit()?;
    let pred_x = pred_test_x();
    let _loo = fit.loo_exact()?;
    let _infl = fit.infl_coef()?;
    #[cfg(feature = "stats")]
    let pvalue_lr = Some(fit.pvalue_lr_test());
    #[cfg(not(feature = "stats"))]
    let pvalue_lr: Option<f64> = None;
    check_responses_scenario(
        "ig_none",
        1e-6,
        &fit.result,
        fit.deviance(),
        fit.lr_test(),
        fit.predict(&pred_x, None),
        fit.resid_resp(),
        fit.resid_work(),
        Some(fit.resid_pear()),
        Some(fit.resid_dev()),
        Some(fit.leverage()?),
        Some(fit.resid_part()),
        pvalue_lr,
    )
}

/// Inverse Gaussian regression (Log link), variance weights.
/// Partial residuals are skipped: our centering uses fully-weighted means; R uses only freq weights.
#[test]
fn responses_ig_var_weights() -> Result<()> {
    let (_, _, y_ig, x, var_wt, _) = load_responses_data()?;
    let model = ModelBuilder::<InvGaussian<inv_gauss_link::Log>>::data(&y_ig, &x)
        .var_weights(var_wt)
        .build()?;
    let fit = model.fit()?;
    let pred_x = pred_test_x();
    let _loo = fit.loo_exact()?;
    #[cfg(feature = "stats")]
    let pvalue_lr = Some(fit.pvalue_lr_test());
    #[cfg(not(feature = "stats"))]
    let pvalue_lr: Option<f64> = None;
    check_responses_scenario(
        "ig_var",
        1e-6,
        &fit.result,
        fit.deviance(),
        fit.lr_test(),
        fit.predict(&pred_x, None),
        fit.resid_resp(),
        fit.resid_work(),
        Some(fit.resid_pear()),
        Some(fit.resid_dev()),
        Some(fit.leverage()?),
        None, // skip partial: var weights + intercept
        pvalue_lr,
    )
}