Skip to main content

rsomics_avelogcpm/
lib.rs

1use std::fs::File;
2use std::io::{BufRead, BufReader, BufWriter, Write};
3use std::path::Path;
4
5use rsomics_common::{Result, RsomicsError};
6
7pub struct Matrix {
8    pub header: String,
9    pub genes: Vec<String>,
10    pub counts: Vec<f64>,
11    pub n_samples: usize,
12}
13
14impl Matrix {
15    pub fn load(path: &Path) -> Result<Self> {
16        let file = File::open(path)
17            .map_err(|e| RsomicsError::InvalidInput(format!("{}: {e}", path.display())))?;
18        let reader = BufReader::new(file);
19        let mut lines = reader.lines();
20
21        let header = lines
22            .next()
23            .ok_or_else(|| RsomicsError::InvalidInput("empty count matrix".into()))?
24            .map_err(RsomicsError::Io)?;
25        let n_samples = header.split('\t').count() - 1;
26
27        let mut genes = Vec::new();
28        let mut counts = Vec::new();
29        for line in lines {
30            let line = line.map_err(RsomicsError::Io)?;
31            if line.is_empty() {
32                continue;
33            }
34            let mut fields = line.split('\t');
35            let gene = fields
36                .next()
37                .ok_or_else(|| RsomicsError::InvalidInput("row without a gene id".into()))?;
38            genes.push(gene.to_string());
39            let before = counts.len();
40            for f in fields {
41                counts.push(f.parse::<f64>().map_err(|_| {
42                    RsomicsError::InvalidInput(format!("non-numeric count '{f}' for gene {gene}"))
43                })?);
44            }
45            if counts.len() - before != n_samples {
46                return Err(RsomicsError::InvalidInput(format!(
47                    "gene {gene}: {} values, header has {n_samples} samples",
48                    counts.len() - before
49                )));
50            }
51        }
52        Ok(Self {
53            header,
54            genes,
55            counts,
56            n_samples,
57        })
58    }
59
60    pub fn n_genes(&self) -> usize {
61        self.genes.len()
62    }
63}
64
65fn load_norm_factors(path: &Path, n_samples: usize) -> Result<Vec<f64>> {
66    let file = File::open(path)
67        .map_err(|e| RsomicsError::InvalidInput(format!("{}: {e}", path.display())))?;
68    let mut factors = Vec::with_capacity(n_samples);
69    for line in BufReader::new(file).lines() {
70        let line = line.map_err(RsomicsError::Io)?;
71        let line = line.trim();
72        if line.is_empty() || line.starts_with('#') {
73            continue;
74        }
75        let val = line.rsplit('\t').next().unwrap_or(line);
76        factors.push(
77            val.parse::<f64>().map_err(|_| {
78                RsomicsError::InvalidInput(format!("non-numeric norm factor '{val}'"))
79            })?,
80        );
81    }
82    if factors.len() != n_samples {
83        return Err(RsomicsError::InvalidInput(format!(
84            "{} norm factors for {n_samples} samples",
85            factors.len()
86        )));
87    }
88    Ok(factors)
89}
90
91fn library_sizes(m: &Matrix, norm_factors: Option<&[f64]>) -> Vec<f64> {
92    let mut sizes = vec![0.0f64; m.n_samples];
93    for row in m.counts.chunks_exact(m.n_samples) {
94        for (s, &c) in sizes.iter_mut().zip(row) {
95            *s += c;
96        }
97    }
98    if let Some(nf) = norm_factors {
99        for (s, &f) in sizes.iter_mut().zip(nf) {
100            *s *= f;
101        }
102    }
103    sizes
104}
105
106pub struct AveLogCpmOpts {
107    pub prior_count: f64,
108    pub dispersion: f64,
109}
110
111const MAXIT: usize = 50;
112const TOL: f64 = 1e-10;
113const LOG2: f64 = std::f64::consts::LN_2;
114
115// One-group negative-binomial fit (edgeR mglmOneGroup / glm_one_group): Fisher
116// scoring for the single coefficient beta where mu[j] = exp(beta) * adj_lib[j].
117// Returns beta on the natural-log scale; -inf when every count is zero.
118fn fit_one_group(row: &[f64], adj_lib: &[f64], total_lib: f64, dispersion: f64) -> f64 {
119    let total_count: f64 = row.iter().sum();
120    if total_count == 0.0 {
121        return f64::NEG_INFINITY;
122    }
123    let mut beta = (total_count / total_lib).ln();
124    for _ in 0..MAXIT {
125        let mut dl = 0.0;
126        let mut info = 0.0;
127        for (&y, &lib) in row.iter().zip(adj_lib) {
128            let mu = beta.exp() * lib;
129            let denom = 1.0 + mu * dispersion;
130            dl += (y - mu) / denom;
131            info += mu / denom;
132        }
133        let step = dl / info;
134        beta += step;
135        if step.abs() < TOL {
136            break;
137        }
138    }
139    beta
140}
141
142pub fn ave_log_cpm(
143    counts_path: &Path,
144    norm_factors_path: Option<&Path>,
145    opts: &AveLogCpmOpts,
146    output: &mut dyn Write,
147) -> Result<u64> {
148    let m = Matrix::load(counts_path)?;
149    let norm_factors = match norm_factors_path {
150        Some(p) => Some(load_norm_factors(p, m.n_samples)?),
151        None => None,
152    };
153    let lib = library_sizes(&m, norm_factors.as_deref());
154
155    let gene_col = m.header.split('\t').next().unwrap_or("gene");
156
157    // addPriorCount: the prior is scaled to each library's share of the mean
158    // size, added to every count, and twice the scaled prior is added to the
159    // library size before the log offset.
160    let mean_lib = lib.iter().sum::<f64>() / lib.len() as f64;
161    let scaled_prior: Vec<f64> = lib
162        .iter()
163        .map(|&l| opts.prior_count * l / mean_lib)
164        .collect();
165    let adj_lib: Vec<f64> = lib
166        .iter()
167        .zip(&scaled_prior)
168        .map(|(&l, &p)| l + 2.0 * p)
169        .collect();
170    let total_adj_lib: f64 = adj_lib.iter().sum();
171    let cpm_shift = 1e6f64.ln();
172
173    let mut out = BufWriter::new(output);
174    writeln!(out, "{gene_col}\tAveLogCPM").map_err(RsomicsError::Io)?;
175
176    let mut row_aug = vec![0.0f64; m.n_samples];
177    for (gene, row) in m.genes.iter().zip(m.counts.chunks_exact(m.n_samples)) {
178        for ((dst, &c), &p) in row_aug.iter_mut().zip(row).zip(&scaled_prior) {
179            *dst = c + p;
180        }
181        let beta = fit_one_group(&row_aug, &adj_lib, total_adj_lib, opts.dispersion);
182        let ave = (beta + cpm_shift) / LOG2;
183        writeln!(out, "{gene}\t{}", fmt(ave)).map_err(RsomicsError::Io)?;
184    }
185    out.flush().map_err(RsomicsError::Io)?;
186    Ok(m.n_genes() as u64)
187}
188
189fn fmt(v: f64) -> String {
190    format!("{v:.6}")
191}
192
193#[cfg(test)]
194mod tests {
195    use super::*;
196
197    fn mat() -> Matrix {
198        Matrix {
199            header: "gene\ts1\ts2\ts3".into(),
200            genes: vec!["g1".into(), "g2".into(), "g3".into(), "g4".into()],
201            counts: vec![
202                0.0, 10.0, 20.0, 5.0, 0.0, 100.0, 50.0, 1.0, 3.0, 0.0, 0.0, 7.0,
203            ],
204            n_samples: 3,
205        }
206    }
207
208    #[test]
209    fn lib_sizes_are_column_sums() {
210        let m = mat();
211        assert_eq!(library_sizes(&m, None), vec![55.0, 11.0, 130.0]);
212    }
213
214    #[test]
215    fn all_zero_row_is_handled() {
216        let row = [0.0, 0.0, 0.0];
217        let lib = [100.0, 100.0, 100.0];
218        let beta = fit_one_group(&row, &lib, 300.0, 0.05);
219        assert!(beta.is_infinite() && beta < 0.0);
220    }
221
222    // Poisson limit (dispersion -> 0): beta is the closed-form log(sum y / sum lib).
223    #[test]
224    fn poisson_limit_is_closed_form() {
225        let row = [4.0, 8.0, 12.0];
226        let lib = [100.0, 200.0, 300.0];
227        let beta = fit_one_group(&row, &lib, 600.0, 0.0);
228        let expect = (24.0f64 / 600.0).ln();
229        assert!((beta - expect).abs() < 1e-9, "{beta} vs {expect}");
230    }
231
232    // edgeR aveLogCPM(y, prior.count=2, dispersion=0.05) probed directly in R.
233    #[test]
234    fn ave_log_cpm_matches_edger() {
235        let m = mat();
236        let lib = library_sizes(&m, None);
237        let mean_lib = lib.iter().sum::<f64>() / lib.len() as f64;
238        let scaled: Vec<f64> = lib.iter().map(|&l| 2.0 * l / mean_lib).collect();
239        let adj: Vec<f64> = lib
240            .iter()
241            .zip(&scaled)
242            .map(|(&l, &p)| l + 2.0 * p)
243            .collect();
244        let total: f64 = adj.iter().sum();
245        let row: Vec<f64> = m.counts[0..3]
246            .iter()
247            .zip(&scaled)
248            .map(|(&c, &p)| c + p)
249            .collect();
250        let beta = fit_one_group(&row, &adj, total, 0.05);
251        let got = (beta + 1e6f64.ln()) / std::f64::consts::LN_2;
252        let expect = 17.558323;
253        assert!((got - expect).abs() < 1e-4, "{got} vs {expect}");
254    }
255}