use anyhow::{anyhow, bail, Result};
use ndarray::{Array1, Array2};
use statrs::distribution::{ContinuousCDF, Normal};
use crate::classifytestsf::classify_tests_fstat;
use crate::ebayes::ebayes;
use crate::fit::MArrayLM;
use crate::fitgamma::fit_gamma_intercept;
use crate::linalg::cov2cor;
use crate::optim::nelder_mead;
use crate::predfcm::pred_fcm;
use crate::proptruenull::{prop_true_null, PropTrueNullMethod};
use crate::special::{chi2_sf, f_sf};
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum GenasSubset {
All,
Fpval,
PUnion,
PInt,
LogFC,
PredFC,
}
#[derive(Debug, Clone)]
pub struct Genas {
pub technical_correlation: f64,
pub covariance_matrix: [[f64; 2]; 2],
pub biological_correlation: f64,
pub deviance: f64,
pub p_value: f64,
pub n: usize,
}
pub fn genas(fit: &MArrayLM, coef: (usize, usize), subset: GenasSubset) -> Result<Genas> {
let ngenes = fit.coefficients.nrows();
if ngenes < 1 {
return Ok(null_genas());
}
let (c1, c2) = coef;
let ncoef = fit.coefficients.ncols();
if c1 >= ncoef || c2 >= ncoef {
bail!("coef out of range for a {ncoef}-coefficient fit");
}
let s2post = fit
.s2_post
.as_ref()
.ok_or_else(|| anyhow!("genas requires an eBayes fit (s2_post missing)"))?;
let dft = fit
.df_total
.as_ref()
.ok_or_else(|| anyhow!("genas requires an eBayes fit (df_total missing)"))?;
let b0_full = fit.coefficients.column(c1).to_vec();
let b1_full = fit.coefficients.column(c2).to_vec();
let v = cov_block(fit, c1, c2);
let s2post_slice = s2post.as_slice().expect("contiguous s2_post");
let x_start = gamma_start(&b0_full, &b1_full, v, s2post_slice);
if subset == GenasSubset::All {
return Ok(genas_core(
&b0_full,
&b1_full,
v,
s2post_slice,
dft.as_slice().expect("contiguous df_total"),
x_start,
));
}
let mask = which_genes(fit, subset, c1, c2)?;
let selected: Vec<usize> = (0..ngenes).filter(|&g| mask[g]).collect();
if selected.is_empty() {
return Ok(null_genas());
}
let trend = is_varying(fit.s2_prior.as_ref());
let robust = is_varying(fit.df_prior.as_ref());
let mut sub = build_subfit(fit, &selected, c1, c2);
ebayes(&mut sub, 0.01, (0.1, 4.0), trend, robust)?;
let s2post2 = sub.s2_post.as_ref().expect("eBayes fills s2_post");
let dft2 = sub.df_total.as_ref().expect("eBayes fills df_total");
let b0 = sub.coefficients.column(0).to_vec();
let b1 = sub.coefficients.column(1).to_vec();
Ok(genas_core(
&b0,
&b1,
v,
s2post2.as_slice().expect("contiguous s2_post"),
dft2.as_slice().expect("contiguous df_total"),
x_start,
))
}
fn gamma_start(b0: &[f64], b1: &[f64], v: [[f64; 2]; 2], s2post: &[f64]) -> [f64; 2] {
let ngenes = b0.len();
let ya: Vec<f64> = (0..ngenes).map(|g| b0[g] * b0[g] / s2post[g]).collect();
let yb: Vec<f64> = (0..ngenes).map(|g| b1[g] * b1[g] / s2post[g]).collect();
let x1 = fit_gamma_intercept(&ya, &[v[0][0]], 1000);
let x2 = fit_gamma_intercept(&yb, &[v[1][1]], 1000);
if x1 > 0.0 && x2 > 0.0 {
[0.5 * x1.ln(), 0.5 * x2.ln()]
} else {
[0.0, 0.0]
}
}
fn genas_core(
b0: &[f64],
b1: &[f64],
v: [[f64; 2]; 2],
s2post: &[f64],
dft: &[f64],
x_start: [f64; 2],
) -> Genas {
let ngenes = b0.len();
let x_start = x_start.to_vec();
let m = 2.0;
let q2 = nelder_mead(&x_start, |x| loglik_null(x, v, b0, b1, s2post, dft, m));
let q1_start = [q2.par[0], q2.par[1], 0.0];
let q1 = nelder_mead(&q1_start, |x| loglik_full(x, v, b0, b1, s2post, dft, m));
let d1 = q1.par[0].exp();
let d2 = q1.par[1].exp();
let bb = q1.par[2];
let v0 = [[d1, bb * d1], [bb * d1, bb * bb * d1 + d2]];
let rhobiol = v0[1][0] / (v0[0][0] * v0[1][1]).sqrt();
let rhotech = v[1][0] / (v[0][0] * v[1][1]).sqrt();
let deviance = (2.0 * (q2.value - q1.value)).abs();
let normal = Normal::new(0.0, 1.0).expect("standard normal");
let p_value = 2.0 * normal.cdf(-deviance.sqrt());
Genas {
technical_correlation: rhotech,
covariance_matrix: v0,
biological_correlation: rhobiol,
deviance,
p_value,
n: ngenes,
}
}
fn cov_block(fit: &MArrayLM, c1: usize, c2: usize) -> [[f64; 2]; 2] {
[
[
fit.cov_coefficients[[c1, c1]],
fit.cov_coefficients[[c1, c2]],
],
[
fit.cov_coefficients[[c2, c1]],
fit.cov_coefficients[[c2, c2]],
],
]
}
fn fpval_two_coef(fit: &MArrayLM, c1: usize, c2: usize) -> Result<Vec<f64>> {
let t = fit
.t
.as_ref()
.ok_or_else(|| anyhow!("genas subset=\"Fpval\" needs moderated t (run eBayes)"))?;
let dfp = fit
.df_prior
.as_ref()
.ok_or_else(|| anyhow!("genas subset=\"Fpval\" needs df.prior (run eBayes)"))?;
let ng = t.nrows();
let mut t2 = Array2::<f64>::zeros((ng, 2));
for g in 0..ng {
t2[[g, 0]] = t[[g, c1]];
t2[[g, 1]] = t[[g, c2]];
}
let blk = cov_block(fit, c1, c2);
let mut cov = Array2::<f64>::zeros((2, 2));
for i in 0..2 {
for j in 0..2 {
cov[[i, j]] = blk[i][j];
}
if cov[[i, i]] == 0.0 {
cov[[i, i]] = 1.0;
}
}
let cor = cov2cor(&cov);
let (fstat, r) = classify_tests_fstat(&t2, Some(&cor));
let df1 = r as f64;
let mut fp = vec![0.0; ng];
for g in 0..ng {
let df2 = dfp[g] + fit.df_residual[g];
fp[g] = if df2.is_finite() {
f_sf(fstat[g], df1, df2)
} else {
chi2_sf(df1 * fstat[g], df1)
};
}
Ok(fp)
}
fn which_genes(fit: &MArrayLM, subset: GenasSubset, c1: usize, c2: usize) -> Result<Vec<bool>> {
let ng = fit.coefficients.nrows();
let mask = match subset {
GenasSubset::All => vec![true; ng],
GenasSubset::Fpval => {
let fp = fpval_two_coef(fit, c1, c2)?;
let p = 1.0 - prop_true_null(&fp, PropTrueNullMethod::Lfdr, 20);
let r = rank_average(&fp);
let cut = p * ng as f64;
r.iter().map(|&ri| ri <= cut).collect()
}
GenasSubset::PUnion | GenasSubset::PInt => {
let pv = fit
.p_value
.as_ref()
.ok_or_else(|| anyhow!("genas subset needs p.value (run eBayes)"))?;
let p1col = pv.column(c1).to_vec();
let p2col = pv.column(c2).to_vec();
let p1 = 1.0 - prop_true_null(&p1col, PropTrueNullMethod::Lfdr, 20);
let p2 = 1.0 - prop_true_null(&p2col, PropTrueNullMethod::Lfdr, 20);
let cut1 = p1 * ng as f64;
let cut2 = p2 * ng as f64;
if subset == GenasSubset::PUnion && p1 == 0.0 && p2 == 0.0 {
vec![false; ng]
} else {
let r1 = rank_average(&p1col);
let r2 = rank_average(&p2col);
(0..ng)
.map(|g| {
let a = r1[g] <= cut1;
let b = r2[g] <= cut2;
if subset == GenasSubset::PInt {
a && b
} else {
a || b
}
})
.collect()
}
}
GenasSubset::LogFC => {
let a1: Vec<f64> = fit
.coefficients
.column(c1)
.iter()
.map(|v| v.abs())
.collect();
let a2: Vec<f64> = fit
.coefficients
.column(c2)
.iter()
.map(|v| v.abs())
.collect();
let q1 = quantile_type7(&a1, 0.9);
let q2 = quantile_type7(&a2, 0.9);
(0..ng).map(|g| a1[g] > q1 || a2[g] > q2).collect()
}
GenasSubset::PredFC => {
let pfc1 = pred_fcm(fit, c1, true, true, PropTrueNullMethod::Lfdr)?;
let pfc2 = pred_fcm(fit, c2, true, true, PropTrueNullMethod::Lfdr)?;
let a1: Vec<f64> = pfc1.iter().map(|v| v.abs()).collect();
let a2: Vec<f64> = pfc2.iter().map(|v| v.abs()).collect();
let q1 = quantile_type7(&a1, 0.9);
let q2 = quantile_type7(&a2, 0.9);
(0..ng).map(|g| a1[g] > q1 || a2[g] > q2).collect()
}
};
Ok(mask)
}
fn build_subfit(fit: &MArrayLM, sel: &[usize], c1: usize, c2: usize) -> MArrayLM {
let m = sel.len();
let mut coefficients = Array2::<f64>::zeros((m, 2));
let mut stdev_unscaled = Array2::<f64>::zeros((m, 2));
let mut sigma = Array1::<f64>::zeros(m);
let mut df_residual = Array1::<f64>::zeros(m);
let mut amean = Array1::<f64>::zeros(m);
let mut gene_names = Vec::with_capacity(m);
for (i, &g) in sel.iter().enumerate() {
coefficients[[i, 0]] = fit.coefficients[[g, c1]];
coefficients[[i, 1]] = fit.coefficients[[g, c2]];
stdev_unscaled[[i, 0]] = fit.stdev_unscaled[[g, c1]];
stdev_unscaled[[i, 1]] = fit.stdev_unscaled[[g, c2]];
sigma[i] = fit.sigma[g];
df_residual[i] = fit.df_residual[g];
amean[i] = if g < fit.amean.len() {
fit.amean[g]
} else {
0.0
};
gene_names.push(fit.gene_names.get(g).cloned().unwrap_or_default());
}
let b = cov_block(fit, c1, c2);
let mut cov_coefficients = Array2::<f64>::zeros((2, 2));
cov_coefficients[[0, 0]] = b[0][0];
cov_coefficients[[0, 1]] = b[0][1];
cov_coefficients[[1, 0]] = b[1][0];
cov_coefficients[[1, 1]] = b[1][1];
let coef_names = vec![
fit.coef_names.get(c1).cloned().unwrap_or_default(),
fit.coef_names.get(c2).cloned().unwrap_or_default(),
];
MArrayLM {
coefficients,
stdev_unscaled,
sigma,
df_residual,
cov_coefficients,
gene_names,
coef_names,
amean,
design: None,
contrasts: None,
df_prior: None,
s2_prior: None,
var_prior: None,
proportion: None,
s2_post: None,
t: None,
df_total: None,
p_value: None,
lods: None,
f_stat: None,
f_p_value: None,
}
}
fn null_genas() -> Genas {
Genas {
technical_correlation: f64::NAN,
covariance_matrix: [[f64::NAN; 2]; 2],
biological_correlation: f64::NAN,
deviance: 0.0,
p_value: 1.0,
n: 0,
}
}
fn is_varying(v: Option<&Array1<f64>>) -> bool {
match v {
Some(a) => a.len() > 1 && a.iter().any(|&x| x != a[0]),
None => false,
}
}
fn quantile_type7(x: &[f64], prob: f64) -> f64 {
let n = x.len();
if n == 0 {
return f64::NAN;
}
let mut s = x.to_vec();
s.sort_by(|a, b| a.partial_cmp(b).unwrap());
if n == 1 {
return s[0];
}
let h = (n as f64 - 1.0) * prob;
let lo = h.floor() as usize;
let frac = h - lo as f64;
if lo + 1 < n {
s[lo] + frac * (s[lo + 1] - s[lo])
} else {
s[lo]
}
}
fn rank_average(x: &[f64]) -> Vec<f64> {
let n = x.len();
let mut idx: Vec<usize> = (0..n).collect();
idx.sort_by(|&a, &b| x[a].partial_cmp(&x[b]).unwrap());
let mut ranks = vec![0.0; n];
let mut i = 0;
while i < n {
let mut j = i;
while j + 1 < n && x[idx[j + 1]] == x[idx[i]] {
j += 1;
}
let avg = ((i + 1 + j + 1) as f64) / 2.0;
for &k in &idx[i..=j] {
ranks[k] = avg;
}
i = j + 1;
}
ranks
}
fn loglik_core(
v0: [[f64; 2]; 2],
v: [[f64; 2]; 2],
b0: &[f64],
b1: &[f64],
s2post: &[f64],
dft: &[f64],
m: f64,
) -> f64 {
let m00 = v0[0][0] + v[0][0];
let m01 = v0[0][1] + v[0][1];
let m11 = v0[1][1] + v[1][1];
let r11 = m00.sqrt();
let r12 = m01 / r11;
let r22 = (m11 - r12 * r12).sqrt();
let second = r11.ln() + r22.ln();
let mut total = 0.0;
for g in 0..b0.len() {
let w0 = b0[g] / r11;
let w1 = (b1[g] - r12 * w0) / r22;
let q = w0 * w0 + w1 * w1;
let third = 0.5 * (m + dft[g]) * (1.0 + q / (s2post[g] * dft[g])).ln();
total += second + third;
}
total
}
fn loglik_null(
x: &[f64],
v: [[f64; 2]; 2],
b0: &[f64],
b1: &[f64],
s2post: &[f64],
dft: &[f64],
m: f64,
) -> f64 {
let v0 = [[(2.0 * x[0]).exp(), 0.0], [0.0, (2.0 * x[1]).exp()]];
loglik_core(v0, v, b0, b1, s2post, dft, m)
}
fn loglik_full(
x: &[f64],
v: [[f64; 2]; 2],
b0: &[f64],
b1: &[f64],
s2post: &[f64],
dft: &[f64],
m: f64,
) -> f64 {
let d1 = x[0].exp();
let d2 = x[1].exp();
let bb = x[2];
let v0 = [[d1, bb * d1], [bb * d1, bb * bb * d1 + d2]];
loglik_core(v0, v, b0, b1, s2post, dft, m)
}
#[cfg(test)]
#[allow(clippy::excessive_precision, clippy::approx_constant)]
mod tests {
use super::*;
use crate::fit::lmfit;
use ndarray::Array2;
fn rclose(a: f64, b: f64) -> bool {
(a - b).abs() <= 1e-6 * (1.0 + b.abs())
}
fn fixture() -> (Array2<f64>, Array2<f64>) {
let ngenes = 60usize;
let narrays = 9usize;
let g_a = [0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0];
let g_b = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0];
let mut y = Array2::<f64>::zeros((ngenes, narrays));
for g in 0..ngenes {
let gi = g as i64;
let base = ((gi % 7) - 3) as f64;
let extra = (((gi * 5) % 11) - 5) as f64;
let own_b = (((gi * 3) % 13) - 6) as f64;
let eff_a = base * 0.4 + extra * 0.1;
let eff_b = base * 0.32 + own_b * 0.1;
let mu = 8.0 + (gi % 5) as f64 * 0.25;
for k in 0..narrays {
let ki = k as i64;
let noise = (((gi * 13 + ki * 17) % 19) - 9) as f64 * 0.05;
y[[g, k]] = mu + eff_a * g_a[k] + eff_b * g_b[k] + noise;
}
}
let mut design = Array2::<f64>::zeros((narrays, 3));
for k in 0..narrays {
design[[k, 0]] = 1.0;
design[[k, 1]] = g_a[k];
design[[k, 2]] = g_b[k];
}
(y, design)
}
fn ebayes_fit() -> MArrayLM {
let (y, design) = fixture();
let names: Vec<String> = (1..=60).map(|i| format!("g{i}")).collect();
let coefs = vec!["Intercept".to_string(), "gA".to_string(), "gB".to_string()];
let mut fit = lmfit(&y, &design, names, coefs).unwrap();
ebayes(&mut fit, 0.01, (0.1, 4.0), false, false).unwrap();
fit
}
#[test]
fn genas_matches_r() {
let fit = ebayes_fit();
let g = genas(&fit, (1, 2), GenasSubset::All).unwrap();
assert_eq!(g.n, 60);
assert!(
rclose(g.technical_correlation, 0.49999999999999994),
"tech {}",
g.technical_correlation
);
assert!(
rclose(g.biological_correlation, 0.72110133367492135),
"biol {}",
g.biological_correlation
);
assert!(
rclose(g.covariance_matrix[0][0], 20.311293729600376),
"v00 {}",
g.covariance_matrix[0][0]
);
assert!(
rclose(g.covariance_matrix[0][1], 12.393027523641022),
"v01 {}",
g.covariance_matrix[0][1]
);
assert!(
rclose(g.covariance_matrix[1][0], 12.393027523641022),
"v10 {}",
g.covariance_matrix[1][0]
);
assert!(
rclose(g.covariance_matrix[1][1], 14.542016870927945),
"v11 {}",
g.covariance_matrix[1][1]
);
assert!(rclose(g.deviance, 35.500819277411154), "dev {}", g.deviance);
assert!(rclose(g.p_value, 2.5494331700086711e-09), "p {}", g.p_value);
}
#[test]
fn genas_subset_matches_r() {
let fit = ebayes_fit();
let check = |subset: GenasSubset,
n: usize,
biol: f64,
cov: [f64; 4],
dev: f64,
pval: f64,
tag: &str| {
let g = genas(&fit, (1, 2), subset).unwrap();
assert_eq!(g.n, n, "{tag} n");
assert!(rclose(g.technical_correlation, 0.5), "{tag} tech");
assert!(rclose(g.biological_correlation, biol), "{tag} biol");
assert!(rclose(g.covariance_matrix[0][0], cov[0]), "{tag} v00");
assert!(rclose(g.covariance_matrix[0][1], cov[1]), "{tag} v01");
assert!(rclose(g.covariance_matrix[1][0], cov[2]), "{tag} v10");
assert!(rclose(g.covariance_matrix[1][1], cov[3]), "{tag} v11");
assert!(rclose(g.deviance, dev), "{tag} dev");
assert!(rclose(g.p_value, pval), "{tag} pval");
};
check(
GenasSubset::Fpval,
55,
0.71848111283183225,
[
23.231392431559968,
14.120763825027172,
14.120763825027172,
16.626867102316595,
],
32.698047599684912,
1.0764529339097856e-08,
"Fpval",
);
check(
GenasSubset::PUnion,
55,
0.72141688090435385,
[
23.20355805632785,
14.181934574327077,
14.181934574327077,
16.654966711355684,
],
33.078191943303352,
8.8526093918565966e-09,
"p.union",
);
check(
GenasSubset::PInt,
38,
0.83237776620740411,
[
31.358733693853345,
22.383537180965163,
22.383537180965163,
23.059929516708259,
],
37.494823730220958,
9.1655906123216993e-10,
"p.int",
);
check(
GenasSubset::LogFC,
10,
0.9036479641020686,
[
74.055616589393026,
62.941872383594607,
62.941872383594607,
65.512287646621971,
],
15.829494073613475,
6.9313604519058013e-05,
"logFC",
);
check(
GenasSubset::PredFC,
10,
0.9036479641020686,
[
74.055616589393026,
62.941872383594607,
62.941872383594607,
65.512287646621971,
],
15.829494073613475,
6.9313604519058013e-05,
"predFC",
);
}
}