Skip to main content

rsomics_limma_ebayes/
lib.rs

1//! lmFit + eBayes + topTable(sort.by="none") for a log-expression matrix.
2//!
3//! Clean-room reimplementation of limma's moderated t-statistic pipeline:
4//! Smyth (2004), Stat Appl Genet Mol Biol 3(1):3, doi:10.2202/1544-6115.1027.
5//! No limma (GPL) source was consulted; the method follows the published
6//! paper and is validated black-box against the binary.
7
8mod ebayes;
9mod fit;
10mod matrix;
11mod special;
12
13use std::io::{BufWriter, Write};
14use std::path::Path;
15
16use rsomics_common::{Result, RsomicsError};
17
18pub use matrix::{read_contrast, read_design, read_expr};
19
20pub struct Options<'a> {
21    pub expr: &'a Path,
22    pub design: &'a Path,
23    pub contrast: Option<&'a Path>,
24    /// 1-based coefficient (or contrast) to tabulate; defaults to the last.
25    pub coef: Option<usize>,
26    pub proportion: f64,
27}
28
29/// topTable(sort.by="none") row, in input gene order.
30pub struct Row {
31    pub gene: String,
32    pub logfc: f64,
33    pub ave_expr: f64,
34    pub t: f64,
35    pub p_value: f64,
36    pub adj_p_val: f64,
37    pub b: f64,
38}
39
40pub struct Results {
41    pub coef_name: String,
42    pub rows: Vec<Row>,
43    pub df_total: f64,
44    pub df_prior: f64,
45    pub s2_prior: f64,
46}
47
48pub fn run(opts: &Options) -> Result<Results> {
49    let expr = read_expr(opts.expr)?;
50    let design = read_design(opts.design)?;
51    if design.x.len() != expr.samples.len() {
52        return Err(RsomicsError::InvalidInput(format!(
53            "design has {} rows, expression has {} samples",
54            design.x.len(),
55            expr.samples.len()
56        )));
57    }
58
59    let base = fit::lm_fit(&expr.y, &expr.genes, &design.x, &design.coef_names)?;
60    let xtx_inv = fit::Qr::new(&design.x)?.xtx_inv();
61
62    let fit = if let Some(cpath) = opts.contrast {
63        let contrast = read_contrast(cpath, &design.coef_names)?;
64        fit::contrasts_fit(&base, &contrast, &xtx_inv)
65    } else {
66        base
67    };
68
69    let m = ebayes::ebayes(&fit, &fit.stdev_unscaled, opts.proportion);
70
71    let nc = fit.coef_names.len();
72    let coef = opts.coef.unwrap_or(nc);
73    if coef < 1 || coef > nc {
74        return Err(RsomicsError::InvalidInput(format!(
75            "--coef {coef} out of range 1..={nc}"
76        )));
77    }
78    let k = coef - 1;
79
80    let pvals: Vec<f64> = (0..fit.genes.len()).map(|gi| m.p[gi][k]).collect();
81    let adj = bh_adjust(&pvals);
82
83    let mut rows = Vec::with_capacity(fit.genes.len());
84    for (gi, gene) in fit.genes.iter().enumerate() {
85        rows.push(Row {
86            gene: gene.clone(),
87            logfc: fit.coefficients[gi][k],
88            ave_expr: fit.amean[gi],
89            t: m.t[gi][k],
90            p_value: m.p[gi][k],
91            adj_p_val: adj[gi],
92            b: m.lods[gi][k],
93        });
94    }
95
96    Ok(Results {
97        coef_name: fit.coef_names[k].clone(),
98        rows,
99        df_total: m.df_total,
100        df_prior: m.df_prior,
101        s2_prior: m.s2_prior,
102    })
103}
104
105/// Benjamini-Hochberg adjusted p-values, returned in input order.
106fn bh_adjust(p: &[f64]) -> Vec<f64> {
107    let n = p.len();
108    let mut idx: Vec<usize> = (0..n).collect();
109    idx.sort_by(|&a, &b| p[b].partial_cmp(&p[a]).unwrap());
110    let mut adj = vec![0.0; n];
111    let mut cummin = f64::INFINITY;
112    for (rank, &i) in idx.iter().enumerate() {
113        let m = (n - rank) as f64;
114        let v = (n as f64 / m * p[i]).min(1.0);
115        cummin = cummin.min(v);
116        adj[i] = cummin;
117    }
118    adj
119}
120
121pub fn write_results(res: &Results, out: &mut dyn Write) -> Result<()> {
122    let mut w = BufWriter::with_capacity(1 << 20, out);
123    writeln!(w, "gene\tlogFC\tAveExpr\tt\tP.Value\tadj.P.Val\tB").map_err(RsomicsError::Io)?;
124    let mut fmt = ryu::Buffer::new();
125    let mut line = String::with_capacity(128);
126    for row in &res.rows {
127        line.clear();
128        line.push_str(&row.gene);
129        for v in [
130            row.logfc,
131            row.ave_expr,
132            row.t,
133            row.p_value,
134            row.adj_p_val,
135            row.b,
136        ] {
137            line.push('\t');
138            line.push_str(fmt.format(v));
139        }
140        line.push('\n');
141        w.write_all(line.as_bytes()).map_err(RsomicsError::Io)?;
142    }
143    w.flush().map_err(RsomicsError::Io)?;
144    Ok(())
145}