use anyhow::{bail, Result};
use ndarray::{s, Array1, Array2, ArrayView1, ArrayView2, Axis};
use crate::linalg::{eigh, inv_upper, matrix_rank, qr_econ, xtx_inv_from_r};
#[derive(Clone, Debug)]
pub struct MArrayLM {
pub coefficients: Array2<f64>, pub stdev_unscaled: Array2<f64>, pub sigma: Array1<f64>, pub df_residual: Array1<f64>, pub cov_coefficients: Array2<f64>, pub gene_names: Vec<String>,
pub coef_names: Vec<String>,
pub amean: Array1<f64>, pub design: Option<Array2<f64>>,
pub contrasts: Option<Array2<f64>>,
pub df_prior: Option<Array1<f64>>,
pub s2_prior: Option<Array1<f64>>, pub var_prior: Option<Array1<f64>>, pub proportion: Option<f64>,
pub s2_post: Option<Array1<f64>>,
pub t: Option<Array2<f64>>,
pub df_total: Option<Array1<f64>>,
pub p_value: Option<Array2<f64>>,
pub lods: Option<Array2<f64>>,
pub f_stat: Option<Array1<f64>>,
pub f_p_value: Option<Array1<f64>>,
}
impl MArrayLM {
pub fn n_genes(&self) -> usize {
self.coefficients.nrows()
}
pub fn n_coef(&self) -> usize {
self.coefficients.ncols()
}
pub fn fitted(&self) -> Result<Array2<f64>> {
if self.contrasts.is_some() {
bail!("fit contains contrasts rather than coefficients, so fitted values cannot be computed");
}
let design = match &self.design {
Some(d) => d,
None => bail!("fit has no design matrix, so fitted values cannot be computed"),
};
Ok(self.coefficients.dot(&design.t()))
}
pub fn residuals(&self, y: &Array2<f64>) -> Result<Array2<f64>> {
let fitted = self.fitted()?;
Ok(y - &fitted)
}
}
fn row_nanmean(exprs: &Array2<f64>) -> Array1<f64> {
let n = exprs.nrows();
let mut out = Array1::<f64>::zeros(n);
for i in 0..n {
let mut sum = 0.0;
let mut cnt = 0usize;
for &v in exprs.row(i) {
if v.is_finite() {
sum += v;
cnt += 1;
}
}
out[i] = if cnt > 0 { sum / cnt as f64 } else { f64::NAN };
}
out
}
pub fn non_estimable(design: &Array2<f64>) -> Option<Vec<usize>> {
let p = design.ncols();
let rank = matrix_rank(design);
if rank < p {
let mut kept: Vec<usize> = Vec::new();
let mut dependent: Vec<usize> = Vec::new();
for j in 0..p {
let mut trial: Vec<usize> = kept.clone();
trial.push(j);
let sub = design.select(Axis(1), &trial);
if matrix_rank(&sub) == trial.len() {
kept.push(j);
} else {
dependent.push(j);
}
}
Some(dependent)
} else {
None
}
}
pub fn is_fullrank(x: &Array2<f64>) -> bool {
let (evals, _) = eigh(&x.t().dot(x));
let n = evals.len();
let largest = evals[n - 1];
let smallest = evals[0];
largest > 0.0 && (smallest / largest).abs() > 1e-13
}
pub fn lmfit(
exprs: &Array2<f64>,
design: &Array2<f64>,
gene_names: Vec<String>,
coef_names: Vec<String>,
) -> Result<MArrayLM> {
let n_genes = exprs.nrows();
let n_samples = exprs.ncols();
let p = design.ncols();
if n_genes == 0 {
bail!("expression matrix has zero rows");
}
if design.nrows() != n_samples {
bail!(
"row dimension of design ({}) does not match column dimension of data ({})",
design.nrows(),
n_samples
);
}
if design.iter().any(|v| v.is_nan()) {
bail!("NAs not allowed in design matrix");
}
if matrix_rank(design) < p {
bail!(
"design matrix is not of full column rank ({} of {} columns estimable); \
reduce the design to estimable coefficients",
matrix_rank(design),
p
);
}
let amean = row_nanmean(exprs);
let any_missing = exprs.iter().any(|v| !v.is_finite());
let mut fit = if !any_missing {
lm_series_fast(exprs, design, &gene_names, &coef_names)
} else {
lm_series_genewise(exprs, design, &gene_names, &coef_names)
};
fit.amean = amean;
fit.design = Some(design.clone());
Ok(fit)
}
pub fn lmfit_weighted(
exprs: &Array2<f64>,
design: &Array2<f64>,
weights: &Array2<f64>,
gene_names: Vec<String>,
coef_names: Vec<String>,
) -> Result<MArrayLM> {
let n_genes = exprs.nrows();
let n_samples = exprs.ncols();
let p = design.ncols();
if n_genes == 0 {
bail!("expression matrix has zero rows");
}
if design.nrows() != n_samples {
bail!(
"row dimension of design ({}) does not match column dimension of data ({})",
design.nrows(),
n_samples
);
}
if weights.nrows() != n_genes || weights.ncols() != n_samples {
bail!(
"weights dimensions ({}x{}) must match expression matrix ({}x{})",
weights.nrows(),
weights.ncols(),
n_genes,
n_samples
);
}
if design.iter().any(|v| v.is_nan()) {
bail!("NAs not allowed in design matrix");
}
if matrix_rank(design) < p {
bail!(
"design matrix is not of full column rank ({} of {} columns estimable); \
reduce the design to estimable coefficients",
matrix_rank(design),
p
);
}
let amean = row_nanmean(exprs);
let mut fit = lm_series_weighted(exprs, design, weights, &gene_names, &coef_names);
fit.amean = amean;
fit.design = Some(design.clone());
Ok(fit)
}
fn lm_series_fast(
exprs: &Array2<f64>,
design: &Array2<f64>,
gene_names: &[String],
coef_names: &[String],
) -> MArrayLM {
let n = design.nrows();
let p = design.ncols();
let n_genes = exprs.nrows();
let (q, r) = qr_econ(design);
let rinv = inv_upper(&r);
let xtx_inv = xtx_inv_from_r(&r);
let yt = exprs.t().to_owned(); let qty = q.t().dot(&yt); let beta = rinv.dot(&qty); let coefficients = beta.t().to_owned();
let df_res = (n - p) as f64;
let mut sigma = Array1::<f64>::zeros(n_genes);
if df_res > 0.0 {
let mut ss_fit = vec![0.0f64; n_genes];
for j in 0..p {
for (g, &v) in qty.row(j).iter().enumerate() {
ss_fit[g] += v * v;
}
}
for g in 0..n_genes {
let ssy: f64 = exprs.row(g).iter().map(|&v| v * v).sum();
let rss = (ssy - ss_fit[g]).max(0.0);
sigma[g] = (rss / df_res).sqrt();
}
} else {
sigma.fill(f64::NAN);
}
let diag_se: Array1<f64> = (0..p).map(|j| xtx_inv[[j, j]].sqrt()).collect();
let mut stdev_unscaled = Array2::<f64>::zeros((n_genes, p));
for g in 0..n_genes {
for j in 0..p {
stdev_unscaled[[g, j]] = diag_se[j];
}
}
let df_residual = Array1::from_elem(n_genes, df_res);
new_fit(
coefficients,
stdev_unscaled,
sigma,
df_residual,
xtx_inv,
gene_names,
coef_names,
)
}
struct RowFit {
coef: Vec<f64>, stdev: Vec<f64>, sigma: f64, df: f64, }
impl RowFit {
fn skipped(p: usize) -> Self {
RowFit {
coef: vec![f64::NAN; p],
stdev: vec![f64::NAN; p],
sigma: f64::NAN,
df: 0.0,
}
}
}
struct GeneScratch {
obs: Vec<usize>,
xtil: Array2<f64>,
ytil: Array1<f64>,
}
impl GeneScratch {
fn new(n_arrays: usize, p: usize) -> Self {
GeneScratch {
obs: Vec::with_capacity(n_arrays),
xtil: Array2::zeros((n_arrays, p)),
ytil: Array1::zeros(n_arrays),
}
}
}
fn map_genes_init<S, I, F>(n: usize, init: I, f: F) -> Vec<RowFit>
where
S: Send,
I: Fn() -> S + Sync + Send,
F: Fn(&mut S, usize) -> RowFit + Sync + Send,
{
#[cfg(feature = "parallel")]
{
use rayon::prelude::*;
(0..n)
.into_par_iter()
.map_init(init, |s, g| f(s, g))
.collect()
}
#[cfg(not(feature = "parallel"))]
{
let mut s = init();
(0..n).map(|g| f(&mut s, g)).collect()
}
}
fn scatter_rows(
rows: Vec<RowFit>,
coefficients: &mut Array2<f64>,
stdev_unscaled: &mut Array2<f64>,
sigma: &mut Array1<f64>,
df_residual: &mut Array1<f64>,
) {
let p = coefficients.ncols();
for (g, r) in rows.into_iter().enumerate() {
for j in 0..p {
coefficients[[g, j]] = r.coef[j];
stdev_unscaled[[g, j]] = r.stdev[j];
}
sigma[g] = r.sigma;
df_residual[g] = r.df;
}
}
#[cfg(feature = "faer")]
fn solve_scaled(xtil: ArrayView2<f64>, ytil: ArrayView1<f64>) -> (Vec<f64>, Vec<f64>, f64) {
use faer::{prelude::*, Mat};
let n_obs = xtil.nrows();
let p = xtil.ncols();
let xf = Mat::from_fn(n_obs, p, |i, j| xtil[[i, j]]);
let yf = Mat::from_fn(n_obs, 1, |i, _| ytil[i]);
let qr = xf.qr();
let beta = qr.solve_lstsq(&yf); let coef: Vec<f64> = (0..p).map(|j| beta[(j, 0)]).collect();
let rf = qr.thin_R();
let mut r = Array2::<f64>::zeros((p, p));
for i in 0..p {
for j in i..p {
r[[i, j]] = rf[(i, j)];
}
}
let rinv = inv_upper(&r);
let stdev: Vec<f64> = (0..p)
.map(|j| {
(0..p)
.map(|k| rinv[[j, k]] * rinv[[j, k]])
.sum::<f64>()
.sqrt()
})
.collect();
let mut rss = 0.0_f64;
for i in 0..n_obs {
let mut fit = 0.0;
for (j, &b) in coef.iter().enumerate() {
fit += xtil[[i, j]] * b;
}
let e = ytil[i] - fit;
rss += e * e;
}
(coef, stdev, rss.max(0.0))
}
#[cfg(not(feature = "faer"))]
fn solve_scaled(xtil: ArrayView2<f64>, ytil: ArrayView1<f64>) -> (Vec<f64>, Vec<f64>, f64) {
let p = xtil.ncols();
let xq = xtil.to_owned();
let (q, r) = qr_econ(&xq);
let rinv = inv_upper(&r);
let qty = q.t().dot(&ytil);
let beta = rinv.dot(&qty);
let xtx_inv = xtx_inv_from_r(&r);
let coef: Vec<f64> = (0..p).map(|j| beta[j]).collect();
let stdev: Vec<f64> = (0..p).map(|j| xtx_inv[[j, j]].sqrt()).collect();
let ssy: f64 = ytil.iter().map(|&v| v * v).sum();
let ss_fit: f64 = qty.iter().map(|&v| v * v).sum();
let rss = (ssy - ss_fit).max(0.0);
(coef, stdev, rss)
}
fn ols_one_gene(
scratch: &mut GeneScratch,
row: ArrayView1<f64>,
design: &Array2<f64>,
p: usize,
) -> RowFit {
let GeneScratch { obs, xtil, ytil } = scratch;
obs.clear();
obs.extend((0..row.len()).filter(|&i| row[i].is_finite()));
if obs.is_empty() {
return RowFit::skipped(p);
}
let n_obs = obs.len();
if n_obs < p {
return RowFit::skipped(p);
}
for (r, &i) in obs.iter().enumerate() {
ytil[r] = row[i];
for j in 0..p {
xtil[[r, j]] = design[[i, j]];
}
}
let (coef, stdev, rss) = solve_scaled(xtil.slice(s![..n_obs, ..]), ytil.slice(s![..n_obs]));
let df = (n_obs - p) as f64;
let sigma = if df > 0.0 {
(rss / df).sqrt()
} else {
f64::NAN
};
RowFit {
coef,
stdev,
sigma,
df,
}
}
fn wls_one_gene(
scratch: &mut GeneScratch,
yr: ArrayView1<f64>,
wr: ArrayView1<f64>,
design: &Array2<f64>,
p: usize,
) -> RowFit {
let GeneScratch { obs, xtil, ytil } = scratch;
obs.clear();
obs.extend((0..yr.len()).filter(|&k| yr[k].is_finite() && wr[k].is_finite() && wr[k] > 0.0));
let n_obs = obs.len();
if n_obs < p {
return RowFit::skipped(p);
}
for (r, &k) in obs.iter().enumerate() {
let sw = wr[k].sqrt();
ytil[r] = sw * yr[k];
for j in 0..p {
xtil[[r, j]] = sw * design[[k, j]];
}
}
let (coef, stdev, rss) = solve_scaled(xtil.slice(s![..n_obs, ..]), ytil.slice(s![..n_obs]));
let df = (n_obs - p) as f64;
let sigma = if df > 0.0 {
(rss / df).sqrt()
} else {
f64::NAN
};
RowFit {
coef,
stdev,
sigma,
df,
}
}
fn lm_series_genewise(
exprs: &Array2<f64>,
design: &Array2<f64>,
gene_names: &[String],
coef_names: &[String],
) -> MArrayLM {
let p = design.ncols();
let n_genes = exprs.nrows();
let mut coefficients = Array2::<f64>::from_elem((n_genes, p), f64::NAN);
let mut stdev_unscaled = Array2::<f64>::from_elem((n_genes, p), f64::NAN);
let mut sigma = Array1::<f64>::from_elem(n_genes, f64::NAN);
let mut df_residual = Array1::<f64>::zeros(n_genes);
let n_arrays = design.nrows();
let rows = map_genes_init(
n_genes,
|| GeneScratch::new(n_arrays, p),
|s, g| ols_one_gene(s, exprs.row(g), design, p),
);
scatter_rows(
rows,
&mut coefficients,
&mut stdev_unscaled,
&mut sigma,
&mut df_residual,
);
let (_q, r) = qr_econ(design);
let xtx_inv = xtx_inv_from_r(&r);
new_fit(
coefficients,
stdev_unscaled,
sigma,
df_residual,
xtx_inv,
gene_names,
coef_names,
)
}
fn lm_series_weighted(
exprs: &Array2<f64>,
design: &Array2<f64>,
weights: &Array2<f64>,
gene_names: &[String],
coef_names: &[String],
) -> MArrayLM {
let p = design.ncols();
let n_genes = exprs.nrows();
let mut coefficients = Array2::<f64>::from_elem((n_genes, p), f64::NAN);
let mut stdev_unscaled = Array2::<f64>::from_elem((n_genes, p), f64::NAN);
let mut sigma = Array1::<f64>::from_elem(n_genes, f64::NAN);
let mut df_residual = Array1::<f64>::zeros(n_genes);
let n_arrays = design.nrows();
let rows = map_genes_init(
n_genes,
|| GeneScratch::new(n_arrays, p),
|s, g| wls_one_gene(s, exprs.row(g), weights.row(g), design, p),
);
scatter_rows(
rows,
&mut coefficients,
&mut stdev_unscaled,
&mut sigma,
&mut df_residual,
);
let (_q, r) = qr_econ(design);
let xtx_inv = xtx_inv_from_r(&r);
new_fit(
coefficients,
stdev_unscaled,
sigma,
df_residual,
xtx_inv,
gene_names,
coef_names,
)
}
#[allow(clippy::too_many_arguments)]
pub(crate) fn new_fit(
coefficients: Array2<f64>,
stdev_unscaled: Array2<f64>,
sigma: Array1<f64>,
df_residual: Array1<f64>,
cov_coefficients: Array2<f64>,
gene_names: &[String],
coef_names: &[String],
) -> MArrayLM {
let n_genes = coefficients.nrows();
MArrayLM {
coefficients,
stdev_unscaled,
sigma,
df_residual,
cov_coefficients,
gene_names: gene_names.to_vec(),
coef_names: coef_names.to_vec(),
amean: Array1::zeros(n_genes),
design: None,
contrasts: None,
df_prior: None,
s2_prior: None,
var_prior: None,
proportion: None,
s2_post: None,
t: None,
df_total: None,
p_value: None,
lods: None,
f_stat: None,
f_p_value: None,
}
}
#[cfg(test)]
#[allow(clippy::excessive_precision)]
mod tests {
use super::*;
use ndarray::array;
#[test]
fn is_fullrank_matches_r() {
let full = array![[1.0, 0.0], [1.0, 0.0], [1.0, 1.0]];
let deficient = array![[1.0, 2.0], [2.0, 4.0], [3.0, 6.0]];
let single = array![[1.0], [2.0]];
assert!(is_fullrank(&full));
assert!(!is_fullrank(&deficient));
assert!(is_fullrank(&single));
}
fn rclose(a: f64, b: f64) -> bool {
(a - b).abs() <= 1e-7 * (1.0 + b.abs())
}
fn weighted_fixture() -> (Array2<f64>, Array2<f64>, Array2<f64>) {
let ngenes = 8usize;
let narrays = 6usize;
let group = [0.0, 0.0, 0.0, 1.0, 1.0, 1.0];
let mut y = Array2::<f64>::zeros((ngenes, narrays));
let mut w = Array2::<f64>::zeros((ngenes, narrays));
for g in 0..ngenes {
let gi = g as i64;
let base = ((gi % 5) - 2) as f64;
let eff = (((gi * 3) % 7) - 3) as f64;
let mu = 6.0 + (gi % 4) as f64 * 0.5;
for k in 0..narrays {
let ki = k as i64;
let noise = (((gi * 13 + ki * 17) % 19) - 9) as f64 * 0.05;
y[[g, k]] = mu + eff * 0.3 * group[k] + base * 0.1 + noise;
w[[g, k]] = 0.5 + (((gi * 7 + ki * 5) % 11) as f64) * 0.1;
}
}
let mut design = Array2::<f64>::zeros((narrays, 2));
for k in 0..narrays {
design[[k, 0]] = 1.0;
design[[k, 1]] = group[k];
}
(y, design, w)
}
fn names(n: usize, p: usize) -> (Vec<String>, Vec<String>) {
((0..n).map(|i| format!("g{i}")).collect(), {
let mut c = vec!["Intercept".to_string()];
for j in 1..p {
c.push(format!("x{j}"));
}
c
})
}
#[test]
fn lmfit_weighted_matches_r() {
let (y, design, w) = weighted_fixture();
let (gn, cn) = names(8, 2);
let fit = lmfit_weighted(&y, &design, &w, gn, cn).unwrap();
let coef = [
[6.0083333333333346, -1.0051075268817207],
[6.5034482758620706, -0.33678160919540306],
[6.8035714285714244, 1.1567733990147793],
[7.677631578947369, -0.22406015037593929],
[6.3538461538461544, 0.29878542510121447],
[6.1539999999999981, -0.534769230769229],
[6.8057142857142869, 0.59828571428571331],
[7.7029411764705884, -1.200084033613446],
];
let stdev = [
[0.57735026918962584, 0.80988516376991593],
[0.58722021951470349, 0.8235052638205963],
[0.59761430466719667, 0.83783676414308383],
[0.51298917604257699, 0.78759174188135006],
[0.62017367294604198, 0.80484363658553359],
[0.63245553203367566, 0.88578517972214033],
[0.5345224838248489, 0.82807867121082501],
[0.54232614454664041, 0.7614669610515673],
];
let sigma = [
0.26599314305172844,
0.099539168054648686,
0.33639780799178071,
0.38269209879641242,
0.11173095214852602,
0.31304890254497836,
0.36171318550949705,
0.10717044462761564,
];
for g in 0..8 {
for j in 0..2 {
assert!(
rclose(fit.coefficients[[g, j]], coef[g][j]),
"coef[{g}][{j}] {}",
fit.coefficients[[g, j]]
);
assert!(
rclose(fit.stdev_unscaled[[g, j]], stdev[g][j]),
"stdev[{g}][{j}] {}",
fit.stdev_unscaled[[g, j]]
);
}
assert!(
rclose(fit.sigma[g], sigma[g]),
"sigma[{g}] {}",
fit.sigma[g]
);
assert_eq!(fit.df_residual[g], 4.0);
}
assert!(rclose(fit.cov_coefficients[[0, 0]], 0.33333333333333348));
assert!(rclose(fit.cov_coefficients[[0, 1]], -0.33333333333333354));
assert!(rclose(fit.cov_coefficients[[1, 1]], 0.66666666666666685));
}
#[test]
fn lmfit_weighted_drops_zero_weight_and_missing() {
let (mut y, design, mut w) = weighted_fixture();
w[[3, 4]] = 0.0; y[[5, 2]] = f64::NAN; let (gn, cn) = names(8, 2);
let fit = lmfit_weighted(&y, &design, &w, gn, cn).unwrap();
let df = [4.0, 4.0, 4.0, 3.0, 4.0, 3.0, 4.0, 4.0];
for (g, &expect) in df.iter().enumerate() {
assert_eq!(fit.df_residual[g], expect, "df[{g}]");
}
assert!(rclose(fit.coefficients[[3, 0]], 7.6776315789473708));
assert!(rclose(fit.coefficients[[3, 1]], -0.22096491228070209));
assert!(rclose(fit.coefficients[[5, 0]], 6.1868421052631613));
assert!(rclose(fit.coefficients[[5, 1]], -0.56761133603239));
assert!(
rclose(fit.sigma[3], 0.44188309824502131),
"sigma3 {}",
fit.sigma[3]
);
assert!(
rclose(fit.sigma[5], 0.35751900376998208),
"sigma5 {}",
fit.sigma[5]
);
assert!(rclose(fit.stdev_unscaled[[3, 1]], 0.96427411113412587));
assert!(rclose(fit.stdev_unscaled[[5, 0]], 0.72547625011001193));
assert!(rclose(fit.coefficients[[0, 1]], -1.0051075268817207));
assert!(rclose(fit.sigma[7], 0.10717044462761564));
}
#[test]
fn fitted_and_residuals_match_r() {
let group = [0.0, 0.0, 0.0, 1.0, 1.0, 1.0];
let mut y = Array2::<f64>::zeros((5, 6));
for g0 in 0..5 {
let lvl = 5.0 + (g0 % 4) as f64 * 0.3;
let eff = (((g0 as i64 * 3) % 7) - 3) as f64 * 0.2;
for k0 in 0..6 {
let noise = (((g0 as i64 * 7 + k0 as i64 * 13) % 11) - 5) as f64 * 0.05;
y[[g0, k0]] = lvl + eff * group[k0] + noise;
}
}
let mut design = Array2::<f64>::zeros((6, 2));
for k0 in 0..6 {
design[[k0, 0]] = 1.0;
design[[k0, 1]] = group[k0];
}
let (gn, cn) = (
(0..5).map(|i| format!("g{i}")).collect::<Vec<_>>(),
vec!["Intercept".into(), "group".into()],
);
let fit = lmfit(&y, &design, gn, cn).unwrap();
let fitted = fit.fitted().unwrap();
assert_eq!(fitted.dim(), (5, 6));
assert!(rclose(fitted[[0, 0]], 4.8500000000000014));
assert!(rclose(fitted[[0, 3]], 4.5500000000000007));
assert!(rclose(fitted[[4, 0]], 5.1499999999999995));
assert!(rclose(fitted[[4, 3]], 5.2999999999999998));
assert!(rclose(fitted[[0, 1]], fitted[[0, 0]]));
assert!(rclose(fitted[[0, 5]], fitted[[0, 3]]));
let resid = fit.residuals(&y).unwrap();
assert_eq!(resid.dim(), (5, 6));
assert!(rclose(resid[[0, 0]], -0.10000000000000142));
assert!(rclose(resid[[0, 2]], 0.099999999999998757));
assert!(rclose(resid[[4, 2]], 0.10000000000000053));
for g in 0..5 {
for k in 0..6 {
assert!(rclose(y[[g, k]], fitted[[g, k]] + resid[[g, k]]));
}
}
}
}