use anyhow::{bail, Result};
use ndarray::{Array1, Array2};
use crate::dups::{kron_rows, unwrapdups};
use crate::fit::{new_fit, MArrayLM};
use crate::linalg::{cholesky_upper, solve_upper, solve_upper_transpose, xtx_inv_from_r};
const HUBER_K: f64 = 1.345;
const MAD_DENOM: f64 = 0.6745;
const ACC: f64 = 1e-4;
const MAXIT: usize = 20;
fn nanmean<'a, I: IntoIterator<Item = &'a f64>>(it: I) -> f64 {
let mut sum = 0.0;
let mut cnt = 0usize;
for &v in it {
if v.is_finite() {
sum += v;
cnt += 1;
}
}
if cnt > 0 {
sum / cnt as f64
} else {
f64::NAN
}
}
fn mrlm_amean(exprs: &Array2<f64>, ndups: usize, spacing: usize) -> Array1<f64> {
let nspots = exprs.nrows();
let spotmean: Vec<f64> = (0..nspots).map(|i| nanmean(exprs.row(i))).collect();
if ndups <= 1 {
return Array1::from(spotmean);
}
let col = Array2::from_shape_vec((nspots, 1), spotmean).expect("column shape");
let uw = unwrapdups(&col, ndups, spacing.max(1));
(0..uw.nrows()).map(|g| nanmean(uw.row(g))).collect()
}
fn xtwx(x: &Array2<f64>, w: &Array1<f64>) -> Array2<f64> {
let (n, p) = x.dim();
let mut a = Array2::<f64>::zeros((p, p));
for i in 0..n {
let wi = w[i];
for j in 0..p {
let xij = x[[i, j]] * wi;
for l in 0..p {
a[[j, l]] += xij * x[[i, l]];
}
}
}
a
}
fn wls(x: &Array2<f64>, y: &Array1<f64>, w: &Array1<f64>) -> (Array1<f64>, Array1<f64>) {
let (n, p) = x.dim();
let a = xtwx(x, w);
let mut b = Array1::<f64>::zeros(p);
for i in 0..n {
let wy = w[i] * y[i];
for j in 0..p {
b[j] += x[[i, j]] * wy;
}
}
let u = cholesky_upper(&a);
let z = solve_upper_transpose(&u, &b);
let coef = solve_upper(&u, &z);
let resid = y - &x.dot(&coef);
(coef, resid)
}
fn irls_delta(old: &Array1<f64>, new: &Array1<f64>) -> f64 {
let num: f64 = old
.iter()
.zip(new.iter())
.map(|(&o, &n)| (o - n) * (o - n))
.sum();
let den: f64 = old.iter().map(|&o| o * o).sum::<f64>().max(1e-20);
(num / den).sqrt()
}
fn median_abs(r: &Array1<f64>) -> f64 {
let mut v: Vec<f64> = r.iter().map(|x| x.abs()).collect();
v.sort_by(|a, b| a.partial_cmp(b).expect("finite residuals"));
let n = v.len();
if n % 2 == 1 {
v[n / 2]
} else {
(v[n / 2 - 1] + v[n / 2]) * 0.5
}
}
fn rlm_fit(
x: &Array2<f64>,
y: &Array1<f64>,
wprior: &Array1<f64>,
) -> (Array1<f64>, Array1<f64>, f64) {
let (n, p) = x.dim();
let mut xf = x.clone();
let mut yf = y.clone();
for i in 0..n {
let fac = wprior[i].sqrt();
for j in 0..p {
xf[[i, j]] *= fac;
}
yf[i] *= fac;
}
let ones = Array1::<f64>::ones(n);
let (mut coef, mut resid) = wls(&xf, &yf, &ones);
let mut scale = 0.0;
let mut wpsi = ones.clone();
for _ in 0..MAXIT {
let testpv = resid.clone();
scale = median_abs(&resid) / MAD_DENOM;
if scale == 0.0 {
break;
}
for i in 0..n {
let u = resid[i] / scale;
wpsi[i] = (HUBER_K / u.abs()).min(1.0);
}
let (c, r) = wls(&xf, &yf, &wpsi);
coef = c;
resid = r;
if irls_delta(&testpv, &resid) <= ACC {
break;
}
}
let xtwx_final = xtwx(&xf, &wpsi);
let u = cholesky_upper(&xtwx_final);
let inv = xtx_inv_from_r(&u);
let stdev: Array1<f64> = (0..p).map(|j| inv[[j, j]].sqrt()).collect();
(coef, stdev, scale)
}
#[allow(clippy::too_many_arguments)]
pub fn mrlm(
exprs: &Array2<f64>,
design: &Array2<f64>,
ndups: usize,
spacing: usize,
weights: Option<&Array2<f64>>,
gene_names: Vec<String>,
coef_names: Vec<String>,
) -> Result<MArrayLM> {
let narrays0 = exprs.ncols();
if design.nrows() != narrays0 {
bail!(
"design rows ({}) must match number of arrays ({})",
design.nrows(),
narrays0
);
}
let nbeta = design.ncols();
let amean = mrlm_amean(exprs, ndups, spacing);
let mut m = exprs.clone();
let mut wmat: Option<Array2<f64>> = None;
if let Some(w) = weights {
if w.dim() != exprs.dim() {
bail!("weights dimensions must match exprs");
}
let mut wc = w.clone();
for i in 0..wc.nrows() {
for j in 0..wc.ncols() {
if wc[[i, j]] <= 0.0 {
wc[[i, j]] = f64::NAN;
}
if !wc[[i, j]].is_finite() {
m[[i, j]] = f64::NAN;
}
}
}
wmat = Some(wc);
}
let (mwork, wwork, dwork) = if ndups > 1 {
let sp = spacing.max(1);
let mu = unwrapdups(&m, ndups, sp);
let wu = wmat.as_ref().map(|w| unwrapdups(w, ndups, sp));
let du = kron_rows(design, ndups);
(mu, wu, du)
} else {
(m, wmat, design.clone())
};
let ngenes = mwork.nrows();
let n = mwork.ncols();
let mut coefficients = Array2::<f64>::from_elem((ngenes, nbeta), f64::NAN);
let mut stdev_unscaled = Array2::<f64>::from_elem((ngenes, nbeta), f64::NAN);
let mut sigma = Array1::<f64>::from_elem(ngenes, f64::NAN);
let mut df_residual = Array1::<f64>::zeros(ngenes);
for g in 0..ngenes {
let yrow = mwork.row(g);
let obs: Vec<usize> = (0..n).filter(|&k| yrow[k].is_finite()).collect();
let nobs = obs.len();
if nobs <= nbeta {
continue;
}
let mut x = Array2::<f64>::zeros((nobs, nbeta));
let mut y = Array1::<f64>::zeros(nobs);
let mut wp = Array1::<f64>::ones(nobs);
for (ri, &k) in obs.iter().enumerate() {
for j in 0..nbeta {
x[[ri, j]] = dwork[[k, j]];
}
y[ri] = yrow[k];
if let Some(w) = &wwork {
wp[ri] = w[[g, k]];
}
}
let (coef, stdev, s) = rlm_fit(&x, &y, &wp);
for j in 0..nbeta {
coefficients[[g, j]] = coef[j];
stdev_unscaled[[g, j]] = stdev[j];
}
let df = (nobs - nbeta) as f64;
df_residual[g] = df;
if df > 0.0 {
sigma[g] = s;
}
}
let xtx = xtwx(&dwork, &Array1::<f64>::ones(dwork.nrows()));
let u = cholesky_upper(&xtx);
let cov_coefficients = xtx_inv_from_r(&u);
let gnames: Vec<String> = if gene_names.len() == ngenes {
gene_names
} else {
(0..ngenes).map(|i| i.to_string()).collect()
};
let cnames: Vec<String> = if coef_names.len() == nbeta {
coef_names
} else {
(0..nbeta).map(|j| format!("coef{j}")).collect()
};
let mut fit = new_fit(
coefficients,
stdev_unscaled,
sigma,
df_residual,
cov_coefficients,
&gnames,
&cnames,
);
fit.amean = amean;
fit.design = Some(design.clone());
Ok(fit)
}
#[cfg(test)]
#[allow(clippy::excessive_precision)]
mod tests {
use super::*;
fn rclose(a: f64, b: f64) -> bool {
(a - b).abs() <= 1e-6 * (1.0 + b.abs())
}
fn names(n: usize, p: usize) -> (Vec<String>, Vec<String>) {
(
(0..n).map(|i| format!("g{i}")).collect(),
(0..p).map(|j| format!("c{j}")).collect(),
)
}
const GROUP: [f64; 8] = [0., 0., 0., 0., 1., 1., 1., 1.];
fn mrlm_fixture() -> (Array2<f64>, Array2<f64>) {
let mut m = Array2::<f64>::zeros((6, 8));
let mut pw = Array2::<f64>::zeros((6, 8));
for g0 in 0..6i64 {
let lvl = 5.0 + (g0 % 5) as f64 * 0.25;
let eff = (((g0 * 3) % 7) - 3) as f64 * 0.3;
for k0 in 0..8i64 {
let noise = (((g0 * 13 + k0 * 7) % 11) - 5) as f64 * 0.04;
let mut val = lvl + eff * GROUP[k0 as usize] + noise;
if g0 == 1 && k0 == 2 {
val += 4.5;
}
if g0 == 4 && k0 == 5 {
val -= 3.0;
}
m[[g0 as usize, k0 as usize]] = val;
pw[[g0 as usize, k0 as usize]] = 0.5 + ((g0 * 3 + k0 * 2) % 7) as f64 * 0.1;
}
}
(m, pw)
}
fn mrlm_design() -> Array2<f64> {
let mut d = Array2::<f64>::zeros((8, 2));
for k0 in 0..8 {
d[[k0, 0]] = 1.0;
d[[k0, 1]] = GROUP[k0];
}
d
}
#[test]
fn mrlm_no_weights_outliers_matches_r() {
let (m, _) = mrlm_fixture();
let (gn, cn) = names(6, 2);
let fit = mrlm(&m, &mrlm_design(), 1, 1, None, gn, cn).unwrap();
let coef = [
[4.9999999999999991, -0.87999999999999978],
[5.3146141629547916, -0.074614162954790883],
[5.4399999999999995, 1.0300000000000002],
[5.7700000000000014, -0.39000000000000001],
[5.9900000000000002, 0.48355048621571056],
[5.0700000000000003, -0.68999999999999984],
];
let stdev = [
[0.49999999999999989, 0.70710678118654746],
[0.57065962792466218, 0.75871760948531697],
[0.49999999999999989, 0.70710678118654746],
[0.49999999999999989, 0.70710678118654746],
[0.49999999999999994, 0.75793140221465416],
[0.49999999999999989, 0.70710678118654746],
];
let sig = [
0.16308376575240946,
0.23333219796492946,
0.16308376575240935,
0.16308376575240935,
0.16308376575240946,
0.1630837657524094,
];
let amean = [4.56, 5.7925, 5.955, 5.575, 5.925, 4.725];
for g in 0..6 {
assert!(rclose(fit.coefficients[[g, 0]], coef[g][0]), "coef0 g{g}");
assert!(rclose(fit.coefficients[[g, 1]], coef[g][1]), "coef1 g{g}");
assert!(rclose(fit.stdev_unscaled[[g, 0]], stdev[g][0]), "sd0 g{g}");
assert!(rclose(fit.stdev_unscaled[[g, 1]], stdev[g][1]), "sd1 g{g}");
assert!(rclose(fit.sigma[g], sig[g]), "sigma g{g}");
assert!(rclose(fit.amean[g], amean[g]), "amean g{g}");
assert_eq!(fit.df_residual[g], 6.0);
}
assert!(rclose(fit.cov_coefficients[[0, 0]], 0.25));
assert!(rclose(fit.cov_coefficients[[0, 1]], -0.25));
assert!(rclose(fit.cov_coefficients[[1, 1]], 0.5));
}
#[test]
fn mrlm_prior_weights_matches_r() {
let (m, pw) = mrlm_fixture();
let (gn, cn) = names(6, 2);
let fit = mrlm(&m, &mrlm_design(), 1, 1, Some(&pw), gn, cn).unwrap();
let coef = [
[5.0325000000000006, -0.90215517241379395],
[5.3064763689357903, -0.054123427759319734],
[5.4485714285714275, 1.0126785714285726],
[5.7790909090909093, -0.38509090909090937],
[6.0108476244986209, 0.48188034709807304],
[5.0386206896551746, -0.64952978056426403],
];
let stdev = [
[0.55901699437494745, 0.81075741514148147],
[0.62713777647068802, 0.82910761529214172],
[0.53452248382484879, 0.77344313670384701],
[0.55048188256318031, 0.79772403521746571],
[0.56867031498498866, 0.81384867745376954],
[0.5872202195147036, 0.80489619780267296],
];
let sig = [
0.1752276081871578,
0.19892640030923783,
0.14950403235466014,
0.15826499012964992,
0.12966195875949166,
0.15628762766357865,
];
for g in 0..6 {
assert!(rclose(fit.coefficients[[g, 0]], coef[g][0]), "coef0 g{g}");
assert!(rclose(fit.coefficients[[g, 1]], coef[g][1]), "coef1 g{g}");
assert!(rclose(fit.stdev_unscaled[[g, 0]], stdev[g][0]), "sd0 g{g}");
assert!(rclose(fit.stdev_unscaled[[g, 1]], stdev[g][1]), "sd1 g{g}");
assert!(rclose(fit.sigma[g], sig[g]), "sigma g{g}");
assert_eq!(fit.df_residual[g], 6.0);
}
assert!(rclose(fit.cov_coefficients[[0, 0]], 0.25));
assert!(rclose(fit.cov_coefficients[[1, 1]], 0.5));
}
#[test]
fn mrlm_one_na_drops_observation_matches_r() {
let (mut m, _) = mrlm_fixture();
m[[2, 5]] = f64::NAN; let (gn, cn) = names(6, 2);
let fit = mrlm(&m, &mrlm_design(), 1, 1, None, gn, cn).unwrap();
assert_eq!(fit.df_residual[2], 5.0);
assert!(rclose(fit.coefficients[[2, 0]], 5.4399999999999986));
assert!(rclose(fit.coefficients[[2, 1]], 1.0400000000000007));
assert!(rclose(fit.stdev_unscaled[[2, 0]], 0.49999999999999989));
assert!(rclose(fit.stdev_unscaled[[2, 1]], 0.76376261582597327));
assert!(rclose(fit.sigma[2], 0.17790956263899199));
assert!(rclose(fit.amean[2], 5.8857142857142861));
assert_eq!(fit.df_residual[0], 6.0);
assert!(rclose(fit.coefficients[[1, 0]], 5.3146141629547916));
assert!(rclose(fit.coefficients[[1, 1]], -0.074614162954790883));
assert!(rclose(fit.sigma[5], 0.1630837657524094));
}
}