ndarray-glm 0.1.0

#!/usr/bin/env Rscript
# Generate reference data for Exponential, Gamma, and InvGaussian regression tests.
# All families use the Log link to avoid sign-convention issues with canonical links.
# Run from tests/R/ directory: Rscript responses.R
# Mostly generated by Claude Code.

options(digits = 17)

# Use near-machine-precision convergence to match our IRLS as closely as possible.
ctrl <- glm.control(epsilon = 1e-14, maxit = 10000)

set.seed(7)
n        <- 20
x1       <- rnorm(n)
x2       <- rnorm(n)
eta_true <- 0.3 + 0.8 * x1 - 0.5 * x2
mu_true  <- exp(eta_true)

var_wt  <- runif(n, 0.5, 2.0)
freq_wt <- sample(1:3, n, replace = TRUE)

# ---- Response generation ----

# Exponential: Gamma(shape=1, rate=1/mu)
y_exp <- rgamma(n, shape = 1, rate = 1 / mu_true)

# Gamma: shape=2 gives phi=0.5
y_gamma <- rgamma(n, shape = 2, rate = 2 / mu_true)

# Inverse Gaussian via Michael et al. (1976) algorithm.
# For InvGauss(mu, lambda): Var(y) = mu^3 / lambda.
# With phi=0.3, lambda=1/phi=1/0.3.
rinvgauss_sim <- function(n, mu, phi) {
  lambda <- 1 / phi
  v      <- rnorm(n)^2
  x      <- mu + mu^2 * v / (2 * lambda) -
             mu / (2 * lambda) * sqrt(4 * mu * lambda * v + mu^2 * v^2)
  z <- runif(n)
  ifelse(z <= mu / (mu + x), x, mu^2 / x)
}
y_ig <- rinvgauss_sim(n, mu_true, phi = 0.3)

write.csv(
  data.frame(y_exp, y_gamma, y_ig, x1, x2, var_wt, freq_wt),
  file = "../data/responses.csv", row.names = FALSE
)

# ---- Frequency-expanded dataset ----
expand_by_freq <- function(df, freq_col) {
  idx <- rep(seq_len(nrow(df)), df[[freq_col]])
  list(data = df[idx, ], orig_idx = cumsum(df[[freq_col]]) - df[[freq_col]] + 1)
}
df       <- data.frame(y_exp, y_gamma, y_ig, x1, x2, var_wt, freq_wt)
exp_res  <- expand_by_freq(df, "freq_wt")
df_exp   <- exp_res$data
orig_idx <- exp_res$orig_idx

# ---- Fixed held-out test observations for prediction (hard-coded in responses.rs) ----
x_test <- data.frame(x1 = c(0.5, -1.0, 2.0), x2 = c(0.3, -0.5, 1.2))

# ---- Export helper ----
#
# Writes only quantities that can be directly compared between R and our library.
# Skipped: AIC/BIC, dispersion, covariance, Cook's, standardized residuals, Wald/exact p-values.
# (These depend on the dispersion estimator, which differs between R (Pearson) and our library
# (deviance-based). Coefficients and deviance are unaffected by phi estimation and are compared.)
#
# When orig_idx is non-NULL (frequency-weighted scenario), per-observation quantities are
# extracted at the indices of the first occurrence of each original observation in the
# expanded data. Per-expanded-row quantities (Pearson, deviance residuals, leverage) are
# skipped entirely.
#
# export_partial: set FALSE when variance weights + intercept are both present, because our
# centering uses fully-weighted column means while R uses only frequency weights.
export_responses_scenario <- function(model, dir_name, orig_idx = NULL,
                                      export_partial = TRUE, pred_data = NULL) {
  dir.create(dir_name, showWarnings = FALSE, recursive = TRUE)

  sub     <- function(x) if (!is.null(orig_idx)) x[orig_idx] else x
  sub_mat <- function(x) if (!is.null(orig_idx)) x[orig_idx, , drop = FALSE] else x

  write(model$coefficients,  file.path(dir_name, "coefficients.csv"),  ncolumns = 1)
  write(model$deviance,      file.path(dir_name, "deviance.csv"),      ncolumns = 1)
  write(model$null.deviance, file.path(dir_name, "null_deviance.csv"), ncolumns = 1)

  write(sub(residuals(model, type = "response")), file.path(dir_name, "resid_resp.csv"), ncolumns = 1)
  write(sub(residuals(model, type = "working")),  file.path(dir_name, "resid_work.csv"), ncolumns = 1)

  if (is.null(orig_idx)) {
    # Not frequency-weighted: export per-observation quantities.
    write(residuals(model, type = "pearson"),  file.path(dir_name, "resid_pear.csv"), ncolumns = 1)
    write(residuals(model, type = "deviance"), file.path(dir_name, "resid_dev.csv"),  ncolumns = 1)
    infl <- influence(model, do.coef = FALSE)
    write(infl$hat, file.path(dir_name, "leverage.csv"), ncolumns = 1)
  }

  if (export_partial) {
    write(t(sub_mat(residuals(model, type = "partial"))),
          file.path(dir_name, "resid_partial.csv"), ncolumns = 1)
  }

  # LR p-value: chi-squared omnibus test; does not depend on phi.
  lr_df     <- model$df.null - model$df.residual
  pvalue_lr <- pchisq(model$null.deviance - model$deviance, df = lr_df, lower.tail = FALSE)
  write(pvalue_lr, file.path(dir_name, "pvalue_lr_test.csv"), ncolumns = 1)

  if (!is.null(pred_data)) {
    new_pred <- predict(model, newdata = pred_data, type = "response")
    write(new_pred, file.path(dir_name, "predict_resp.csv"), ncolumns = 1)
  }

  cat("Exported scenario:", dir_name, "\n")
}

# ---- Exponential ----
# Exponential regression uses the same log-partition and variance function as Gamma
# (both have V(mu)=mu^2 and b(eta)=-log(-eta) with canonical eta=-1/mu).
# R's Gamma(link="log") therefore produces identical coefficient estimates and deviance.
m_exp_none <- glm(y_exp ~ x1 + x2, family = Gamma(link = "log"), control = ctrl)
export_responses_scenario(m_exp_none, "responses_results/exp_none",
                           export_partial = TRUE, pred_data = x_test)

# ---- Gamma ----
m_gamma_none <- glm(y_gamma ~ x1 + x2, family = Gamma(link = "log"), control = ctrl)
export_responses_scenario(m_gamma_none, "responses_results/gamma_none",
                           export_partial = TRUE, pred_data = x_test)

m_gamma_var <- glm(y_gamma ~ x1 + x2, family = Gamma(link = "log"),
                   weights = var_wt, control = ctrl)
export_responses_scenario(m_gamma_var, "responses_results/gamma_var",
                           export_partial = FALSE, pred_data = x_test)

m_gamma_freq <- glm(y_gamma ~ x1 + x2, data = df_exp,
                    family = Gamma(link = "log"), control = ctrl)
export_responses_scenario(m_gamma_freq, "responses_results/gamma_freq", orig_idx,
                           export_partial = TRUE, pred_data = x_test)

# ---- InvGaussian ----
m_ig_none <- glm(y_ig ~ x1 + x2, family = inverse.gaussian(link = "log"), control = ctrl)
export_responses_scenario(m_ig_none, "responses_results/ig_none",
                           export_partial = TRUE, pred_data = x_test)

m_ig_var <- glm(y_ig ~ x1 + x2, family = inverse.gaussian(link = "log"),
                weights = var_wt, control = ctrl)
export_responses_scenario(m_ig_var, "responses_results/ig_var",
                           export_partial = FALSE, pred_data = x_test)

cat("Done. All response scenarios exported.\n")