1use anyhow::{anyhow, bail, Result};
25use ndarray::{Array1, Array2};
26use statrs::distribution::{ContinuousCDF, Normal};
27
28use crate::classifytestsf::classify_tests_fstat;
29use crate::ebayes::ebayes;
30use crate::fit::MArrayLM;
31use crate::fitgamma::fit_gamma_intercept;
32use crate::linalg::cov2cor;
33use crate::optim::nelder_mead;
34use crate::predfcm::pred_fcm;
35use crate::proptruenull::{prop_true_null, PropTrueNullMethod};
36use crate::special::{chi2_sf, f_sf};
37
38#[derive(Clone, Copy, Debug, PartialEq, Eq)]
40pub enum GenasSubset {
41 All,
43 Fpval,
45 PUnion,
47 PInt,
49 LogFC,
52 PredFC,
54}
55
56#[derive(Debug, Clone)]
58pub struct Genas {
59 pub technical_correlation: f64,
61 pub covariance_matrix: [[f64; 2]; 2],
63 pub biological_correlation: f64,
65 pub deviance: f64,
67 pub p_value: f64,
69 pub n: usize,
71}
72
73pub fn genas(fit: &MArrayLM, coef: (usize, usize), subset: GenasSubset) -> Result<Genas> {
82 let ngenes = fit.coefficients.nrows();
83 if ngenes < 1 {
84 return Ok(null_genas());
85 }
86 let (c1, c2) = coef;
87 let ncoef = fit.coefficients.ncols();
88 if c1 >= ncoef || c2 >= ncoef {
89 bail!("coef out of range for a {ncoef}-coefficient fit");
90 }
91 let s2post = fit
92 .s2_post
93 .as_ref()
94 .ok_or_else(|| anyhow!("genas requires an eBayes fit (s2_post missing)"))?;
95 let dft = fit
96 .df_total
97 .as_ref()
98 .ok_or_else(|| anyhow!("genas requires an eBayes fit (df_total missing)"))?;
99
100 let b0_full = fit.coefficients.column(c1).to_vec();
104 let b1_full = fit.coefficients.column(c2).to_vec();
105 let v = cov_block(fit, c1, c2);
106 let s2post_slice = s2post.as_slice().expect("contiguous s2_post");
107 let x_start = gamma_start(&b0_full, &b1_full, v, s2post_slice);
108
109 if subset == GenasSubset::All {
111 return Ok(genas_core(
112 &b0_full,
113 &b1_full,
114 v,
115 s2post_slice,
116 dft.as_slice().expect("contiguous df_total"),
117 x_start,
118 ));
119 }
120
121 let mask = which_genes(fit, subset, c1, c2)?;
124 let selected: Vec<usize> = (0..ngenes).filter(|&g| mask[g]).collect();
125 if selected.is_empty() {
126 return Ok(null_genas());
127 }
128 let trend = is_varying(fit.s2_prior.as_ref());
129 let robust = is_varying(fit.df_prior.as_ref());
130 let mut sub = build_subfit(fit, &selected, c1, c2);
131 ebayes(&mut sub, 0.01, (0.1, 4.0), trend, robust)?;
132 let s2post2 = sub.s2_post.as_ref().expect("eBayes fills s2_post");
133 let dft2 = sub.df_total.as_ref().expect("eBayes fills df_total");
134 let b0 = sub.coefficients.column(0).to_vec();
135 let b1 = sub.coefficients.column(1).to_vec();
136 Ok(genas_core(
137 &b0,
138 &b1,
139 v,
140 s2post2.as_slice().expect("contiguous s2_post"),
141 dft2.as_slice().expect("contiguous df_total"),
142 x_start,
143 ))
144}
145
146fn gamma_start(b0: &[f64], b1: &[f64], v: [[f64; 2]; 2], s2post: &[f64]) -> [f64; 2] {
152 let ngenes = b0.len();
153 let ya: Vec<f64> = (0..ngenes).map(|g| b0[g] * b0[g] / s2post[g]).collect();
154 let yb: Vec<f64> = (0..ngenes).map(|g| b1[g] * b1[g] / s2post[g]).collect();
155 let x1 = fit_gamma_intercept(&ya, &[v[0][0]], 1000);
156 let x2 = fit_gamma_intercept(&yb, &[v[1][1]], 1000);
157 if x1 > 0.0 && x2 > 0.0 {
158 [0.5 * x1.ln(), 0.5 * x2.ln()]
159 } else {
160 [0.0, 0.0]
161 }
162}
163
164fn genas_core(
167 b0: &[f64],
168 b1: &[f64],
169 v: [[f64; 2]; 2],
170 s2post: &[f64],
171 dft: &[f64],
172 x_start: [f64; 2],
173) -> Genas {
174 let ngenes = b0.len();
175 let x_start = x_start.to_vec();
176 let m = 2.0;
177
178 let q2 = nelder_mead(&x_start, |x| loglik_null(x, v, b0, b1, s2post, dft, m));
180 let q1_start = [q2.par[0], q2.par[1], 0.0];
181 let q1 = nelder_mead(&q1_start, |x| loglik_full(x, v, b0, b1, s2post, dft, m));
182
183 let d1 = q1.par[0].exp();
185 let d2 = q1.par[1].exp();
186 let bb = q1.par[2];
187 let v0 = [[d1, bb * d1], [bb * d1, bb * bb * d1 + d2]];
188 let rhobiol = v0[1][0] / (v0[0][0] * v0[1][1]).sqrt();
189 let rhotech = v[1][0] / (v[0][0] * v[1][1]).sqrt();
190
191 let deviance = (2.0 * (q2.value - q1.value)).abs();
192 let normal = Normal::new(0.0, 1.0).expect("standard normal");
194 let p_value = 2.0 * normal.cdf(-deviance.sqrt());
195
196 Genas {
197 technical_correlation: rhotech,
198 covariance_matrix: v0,
199 biological_correlation: rhobiol,
200 deviance,
201 p_value,
202 n: ngenes,
203 }
204}
205
206fn cov_block(fit: &MArrayLM, c1: usize, c2: usize) -> [[f64; 2]; 2] {
208 [
209 [
210 fit.cov_coefficients[[c1, c1]],
211 fit.cov_coefficients[[c1, c2]],
212 ],
213 [
214 fit.cov_coefficients[[c2, c1]],
215 fit.cov_coefficients[[c2, c2]],
216 ],
217 ]
218}
219
220fn fpval_two_coef(fit: &MArrayLM, c1: usize, c2: usize) -> Result<Vec<f64>> {
226 let t = fit
227 .t
228 .as_ref()
229 .ok_or_else(|| anyhow!("genas subset=\"Fpval\" needs moderated t (run eBayes)"))?;
230 let dfp = fit
231 .df_prior
232 .as_ref()
233 .ok_or_else(|| anyhow!("genas subset=\"Fpval\" needs df.prior (run eBayes)"))?;
234 let ng = t.nrows();
235 let mut t2 = Array2::<f64>::zeros((ng, 2));
236 for g in 0..ng {
237 t2[[g, 0]] = t[[g, c1]];
238 t2[[g, 1]] = t[[g, c2]];
239 }
240 let blk = cov_block(fit, c1, c2);
243 let mut cov = Array2::<f64>::zeros((2, 2));
244 for i in 0..2 {
245 for j in 0..2 {
246 cov[[i, j]] = blk[i][j];
247 }
248 if cov[[i, i]] == 0.0 {
249 cov[[i, i]] = 1.0;
250 }
251 }
252 let cor = cov2cor(&cov);
253 let (fstat, r) = classify_tests_fstat(&t2, Some(&cor));
254 let df1 = r as f64;
255 let mut fp = vec![0.0; ng];
256 for g in 0..ng {
257 let df2 = dfp[g] + fit.df_residual[g];
258 fp[g] = if df2.is_finite() {
259 f_sf(fstat[g], df1, df2)
260 } else {
261 chi2_sf(df1 * fstat[g], df1)
262 };
263 }
264 Ok(fp)
265}
266
267fn which_genes(fit: &MArrayLM, subset: GenasSubset, c1: usize, c2: usize) -> Result<Vec<bool>> {
279 let ng = fit.coefficients.nrows();
280 let mask = match subset {
281 GenasSubset::All => vec![true; ng],
282 GenasSubset::Fpval => {
283 let fp = fpval_two_coef(fit, c1, c2)?;
284 let p = 1.0 - prop_true_null(&fp, PropTrueNullMethod::Lfdr, 20);
285 let r = rank_average(&fp);
286 let cut = p * ng as f64;
287 r.iter().map(|&ri| ri <= cut).collect()
288 }
289 GenasSubset::PUnion | GenasSubset::PInt => {
290 let pv = fit
291 .p_value
292 .as_ref()
293 .ok_or_else(|| anyhow!("genas subset needs p.value (run eBayes)"))?;
294 let p1col = pv.column(c1).to_vec();
295 let p2col = pv.column(c2).to_vec();
296 let p1 = 1.0 - prop_true_null(&p1col, PropTrueNullMethod::Lfdr, 20);
297 let p2 = 1.0 - prop_true_null(&p2col, PropTrueNullMethod::Lfdr, 20);
298 let cut1 = p1 * ng as f64;
299 let cut2 = p2 * ng as f64;
300 if subset == GenasSubset::PUnion && p1 == 0.0 && p2 == 0.0 {
301 vec![false; ng]
302 } else {
303 let r1 = rank_average(&p1col);
304 let r2 = rank_average(&p2col);
305 (0..ng)
306 .map(|g| {
307 let a = r1[g] <= cut1;
308 let b = r2[g] <= cut2;
309 if subset == GenasSubset::PInt {
310 a && b
311 } else {
312 a || b
313 }
314 })
315 .collect()
316 }
317 }
318 GenasSubset::LogFC => {
319 let a1: Vec<f64> = fit
320 .coefficients
321 .column(c1)
322 .iter()
323 .map(|v| v.abs())
324 .collect();
325 let a2: Vec<f64> = fit
326 .coefficients
327 .column(c2)
328 .iter()
329 .map(|v| v.abs())
330 .collect();
331 let q1 = quantile_type7(&a1, 0.9);
332 let q2 = quantile_type7(&a2, 0.9);
333 (0..ng).map(|g| a1[g] > q1 || a2[g] > q2).collect()
334 }
335 GenasSubset::PredFC => {
336 let pfc1 = pred_fcm(fit, c1, true, true, PropTrueNullMethod::Lfdr)?;
337 let pfc2 = pred_fcm(fit, c2, true, true, PropTrueNullMethod::Lfdr)?;
338 let a1: Vec<f64> = pfc1.iter().map(|v| v.abs()).collect();
339 let a2: Vec<f64> = pfc2.iter().map(|v| v.abs()).collect();
340 let q1 = quantile_type7(&a1, 0.9);
341 let q2 = quantile_type7(&a2, 0.9);
342 (0..ng).map(|g| a1[g] > q1 || a2[g] > q2).collect()
343 }
344 };
345 Ok(mask)
346}
347
348fn build_subfit(fit: &MArrayLM, sel: &[usize], c1: usize, c2: usize) -> MArrayLM {
352 let m = sel.len();
353 let mut coefficients = Array2::<f64>::zeros((m, 2));
354 let mut stdev_unscaled = Array2::<f64>::zeros((m, 2));
355 let mut sigma = Array1::<f64>::zeros(m);
356 let mut df_residual = Array1::<f64>::zeros(m);
357 let mut amean = Array1::<f64>::zeros(m);
358 let mut gene_names = Vec::with_capacity(m);
359 for (i, &g) in sel.iter().enumerate() {
360 coefficients[[i, 0]] = fit.coefficients[[g, c1]];
361 coefficients[[i, 1]] = fit.coefficients[[g, c2]];
362 stdev_unscaled[[i, 0]] = fit.stdev_unscaled[[g, c1]];
363 stdev_unscaled[[i, 1]] = fit.stdev_unscaled[[g, c2]];
364 sigma[i] = fit.sigma[g];
365 df_residual[i] = fit.df_residual[g];
366 amean[i] = if g < fit.amean.len() {
367 fit.amean[g]
368 } else {
369 0.0
370 };
371 gene_names.push(fit.gene_names.get(g).cloned().unwrap_or_default());
372 }
373 let b = cov_block(fit, c1, c2);
374 let mut cov_coefficients = Array2::<f64>::zeros((2, 2));
375 cov_coefficients[[0, 0]] = b[0][0];
376 cov_coefficients[[0, 1]] = b[0][1];
377 cov_coefficients[[1, 0]] = b[1][0];
378 cov_coefficients[[1, 1]] = b[1][1];
379 let coef_names = vec![
380 fit.coef_names.get(c1).cloned().unwrap_or_default(),
381 fit.coef_names.get(c2).cloned().unwrap_or_default(),
382 ];
383
384 MArrayLM {
385 coefficients,
386 stdev_unscaled,
387 sigma,
388 df_residual,
389 cov_coefficients,
390 gene_names,
391 coef_names,
392 amean,
393 design: None,
394 contrasts: None,
395 df_prior: None,
396 s2_prior: None,
397 var_prior: None,
398 proportion: None,
399 s2_post: None,
400 t: None,
401 df_total: None,
402 p_value: None,
403 lods: None,
404 f_stat: None,
405 f_p_value: None,
406 }
407}
408
409fn null_genas() -> Genas {
411 Genas {
412 technical_correlation: f64::NAN,
413 covariance_matrix: [[f64::NAN; 2]; 2],
414 biological_correlation: f64::NAN,
415 deviance: 0.0,
416 p_value: 1.0,
417 n: 0,
418 }
419}
420
421fn is_varying(v: Option<&Array1<f64>>) -> bool {
424 match v {
425 Some(a) => a.len() > 1 && a.iter().any(|&x| x != a[0]),
426 None => false,
427 }
428}
429
430fn quantile_type7(x: &[f64], prob: f64) -> f64 {
432 let n = x.len();
433 if n == 0 {
434 return f64::NAN;
435 }
436 let mut s = x.to_vec();
437 s.sort_by(|a, b| a.partial_cmp(b).unwrap());
438 if n == 1 {
439 return s[0];
440 }
441 let h = (n as f64 - 1.0) * prob;
442 let lo = h.floor() as usize;
443 let frac = h - lo as f64;
444 if lo + 1 < n {
445 s[lo] + frac * (s[lo + 1] - s[lo])
446 } else {
447 s[lo]
448 }
449}
450
451fn rank_average(x: &[f64]) -> Vec<f64> {
453 let n = x.len();
454 let mut idx: Vec<usize> = (0..n).collect();
455 idx.sort_by(|&a, &b| x[a].partial_cmp(&x[b]).unwrap());
456 let mut ranks = vec![0.0; n];
457 let mut i = 0;
458 while i < n {
459 let mut j = i;
460 while j + 1 < n && x[idx[j + 1]] == x[idx[i]] {
461 j += 1;
462 }
463 let avg = ((i + 1 + j + 1) as f64) / 2.0;
464 for &k in &idx[i..=j] {
465 ranks[k] = avg;
466 }
467 i = j + 1;
468 }
469 ranks
470}
471
472fn loglik_core(
476 v0: [[f64; 2]; 2],
477 v: [[f64; 2]; 2],
478 b0: &[f64],
479 b1: &[f64],
480 s2post: &[f64],
481 dft: &[f64],
482 m: f64,
483) -> f64 {
484 let m00 = v0[0][0] + v[0][0];
486 let m01 = v0[0][1] + v[0][1];
487 let m11 = v0[1][1] + v[1][1];
488 let r11 = m00.sqrt();
489 let r12 = m01 / r11;
490 let r22 = (m11 - r12 * r12).sqrt();
491 let second = r11.ln() + r22.ln();
492
493 let mut total = 0.0;
494 for g in 0..b0.len() {
495 let w0 = b0[g] / r11;
497 let w1 = (b1[g] - r12 * w0) / r22;
498 let q = w0 * w0 + w1 * w1;
499 let third = 0.5 * (m + dft[g]) * (1.0 + q / (s2post[g] * dft[g])).ln();
500 total += second + third;
501 }
502 total
503}
504
505fn loglik_null(
507 x: &[f64],
508 v: [[f64; 2]; 2],
509 b0: &[f64],
510 b1: &[f64],
511 s2post: &[f64],
512 dft: &[f64],
513 m: f64,
514) -> f64 {
515 let v0 = [[(2.0 * x[0]).exp(), 0.0], [0.0, (2.0 * x[1]).exp()]];
516 loglik_core(v0, v, b0, b1, s2post, dft, m)
517}
518
519fn loglik_full(
522 x: &[f64],
523 v: [[f64; 2]; 2],
524 b0: &[f64],
525 b1: &[f64],
526 s2post: &[f64],
527 dft: &[f64],
528 m: f64,
529) -> f64 {
530 let d1 = x[0].exp();
531 let d2 = x[1].exp();
532 let bb = x[2];
533 let v0 = [[d1, bb * d1], [bb * d1, bb * bb * d1 + d2]];
534 loglik_core(v0, v, b0, b1, s2post, dft, m)
535}
536
537#[cfg(test)]
538#[allow(clippy::excessive_precision, clippy::approx_constant)]
539mod tests {
540 use super::*;
541 use crate::fit::lmfit;
542 use ndarray::Array2;
543
544 fn rclose(a: f64, b: f64) -> bool {
545 (a - b).abs() <= 1e-6 * (1.0 + b.abs())
546 }
547
548 fn fixture() -> (Array2<f64>, Array2<f64>) {
552 let ngenes = 60usize;
553 let narrays = 9usize;
554 let g_a = [0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0];
555 let g_b = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0];
556 let mut y = Array2::<f64>::zeros((ngenes, narrays));
557 for g in 0..ngenes {
558 let gi = g as i64;
559 let base = ((gi % 7) - 3) as f64;
560 let extra = (((gi * 5) % 11) - 5) as f64;
561 let own_b = (((gi * 3) % 13) - 6) as f64;
562 let eff_a = base * 0.4 + extra * 0.1;
563 let eff_b = base * 0.32 + own_b * 0.1;
564 let mu = 8.0 + (gi % 5) as f64 * 0.25;
565 for k in 0..narrays {
566 let ki = k as i64;
567 let noise = (((gi * 13 + ki * 17) % 19) - 9) as f64 * 0.05;
568 y[[g, k]] = mu + eff_a * g_a[k] + eff_b * g_b[k] + noise;
569 }
570 }
571 let mut design = Array2::<f64>::zeros((narrays, 3));
572 for k in 0..narrays {
573 design[[k, 0]] = 1.0;
574 design[[k, 1]] = g_a[k];
575 design[[k, 2]] = g_b[k];
576 }
577 (y, design)
578 }
579
580 fn ebayes_fit() -> MArrayLM {
581 let (y, design) = fixture();
582 let names: Vec<String> = (1..=60).map(|i| format!("g{i}")).collect();
583 let coefs = vec!["Intercept".to_string(), "gA".to_string(), "gB".to_string()];
584 let mut fit = lmfit(&y, &design, names, coefs).unwrap();
585 ebayes(&mut fit, 0.01, (0.1, 4.0), false, false).unwrap();
587 fit
588 }
589
590 #[test]
591 fn genas_matches_r() {
592 let fit = ebayes_fit();
593 let g = genas(&fit, (1, 2), GenasSubset::All).unwrap();
595
596 assert_eq!(g.n, 60);
597 assert!(
598 rclose(g.technical_correlation, 0.49999999999999994),
599 "tech {}",
600 g.technical_correlation
601 );
602 assert!(
603 rclose(g.biological_correlation, 0.72110133367492135),
604 "biol {}",
605 g.biological_correlation
606 );
607 assert!(
608 rclose(g.covariance_matrix[0][0], 20.311293729600376),
609 "v00 {}",
610 g.covariance_matrix[0][0]
611 );
612 assert!(
613 rclose(g.covariance_matrix[0][1], 12.393027523641022),
614 "v01 {}",
615 g.covariance_matrix[0][1]
616 );
617 assert!(
618 rclose(g.covariance_matrix[1][0], 12.393027523641022),
619 "v10 {}",
620 g.covariance_matrix[1][0]
621 );
622 assert!(
623 rclose(g.covariance_matrix[1][1], 14.542016870927945),
624 "v11 {}",
625 g.covariance_matrix[1][1]
626 );
627 assert!(rclose(g.deviance, 35.500819277411154), "dev {}", g.deviance);
628 assert!(rclose(g.p_value, 2.5494331700086711e-09), "p {}", g.p_value);
629 }
630
631 #[test]
632 fn genas_subset_matches_r() {
633 let fit = ebayes_fit();
635 let check = |subset: GenasSubset,
636 n: usize,
637 biol: f64,
638 cov: [f64; 4],
639 dev: f64,
640 pval: f64,
641 tag: &str| {
642 let g = genas(&fit, (1, 2), subset).unwrap();
643 assert_eq!(g.n, n, "{tag} n");
644 assert!(rclose(g.technical_correlation, 0.5), "{tag} tech");
645 assert!(rclose(g.biological_correlation, biol), "{tag} biol");
646 assert!(rclose(g.covariance_matrix[0][0], cov[0]), "{tag} v00");
647 assert!(rclose(g.covariance_matrix[0][1], cov[1]), "{tag} v01");
648 assert!(rclose(g.covariance_matrix[1][0], cov[2]), "{tag} v10");
649 assert!(rclose(g.covariance_matrix[1][1], cov[3]), "{tag} v11");
650 assert!(rclose(g.deviance, dev), "{tag} dev");
651 assert!(rclose(g.p_value, pval), "{tag} pval");
652 };
653
654 check(
655 GenasSubset::Fpval,
656 55,
657 0.71848111283183225,
658 [
659 23.231392431559968,
660 14.120763825027172,
661 14.120763825027172,
662 16.626867102316595,
663 ],
664 32.698047599684912,
665 1.0764529339097856e-08,
666 "Fpval",
667 );
668 check(
669 GenasSubset::PUnion,
670 55,
671 0.72141688090435385,
672 [
673 23.20355805632785,
674 14.181934574327077,
675 14.181934574327077,
676 16.654966711355684,
677 ],
678 33.078191943303352,
679 8.8526093918565966e-09,
680 "p.union",
681 );
682 check(
683 GenasSubset::PInt,
684 38,
685 0.83237776620740411,
686 [
687 31.358733693853345,
688 22.383537180965163,
689 22.383537180965163,
690 23.059929516708259,
691 ],
692 37.494823730220958,
693 9.1655906123216993e-10,
694 "p.int",
695 );
696 check(
697 GenasSubset::LogFC,
698 10,
699 0.9036479641020686,
700 [
701 74.055616589393026,
702 62.941872383594607,
703 62.941872383594607,
704 65.512287646621971,
705 ],
706 15.829494073613475,
707 6.9313604519058013e-05,
708 "logFC",
709 );
710 check(
711 GenasSubset::PredFC,
712 10,
713 0.9036479641020686,
714 [
715 74.055616589393026,
716 62.941872383594607,
717 62.941872383594607,
718 65.512287646621971,
719 ],
720 15.829494073613475,
721 6.9313604519058013e-05,
722 "predFC",
723 );
724 }
725}