1use std::fs::File;
2use std::io::{BufRead, BufReader, BufWriter, Write};
3use std::path::Path;
4
5use rsomics_common::{Result, RsomicsError};
6
7pub struct Matrix {
8 pub header: String,
9 pub genes: Vec<String>,
10 pub counts: Vec<f64>,
11 pub n_samples: usize,
12}
13
14impl Matrix {
15 pub fn load(path: &Path) -> Result<Self> {
16 let file = File::open(path)
17 .map_err(|e| RsomicsError::InvalidInput(format!("{}: {e}", path.display())))?;
18 let reader = BufReader::new(file);
19 let mut lines = reader.lines();
20
21 let header = lines
22 .next()
23 .ok_or_else(|| RsomicsError::InvalidInput("empty count matrix".into()))?
24 .map_err(RsomicsError::Io)?;
25 let n_samples = header.split('\t').count() - 1;
26
27 let mut genes = Vec::new();
28 let mut counts = Vec::new();
29 for line in lines {
30 let line = line.map_err(RsomicsError::Io)?;
31 if line.is_empty() {
32 continue;
33 }
34 let mut fields = line.split('\t');
35 let gene = fields
36 .next()
37 .ok_or_else(|| RsomicsError::InvalidInput("row without a gene id".into()))?;
38 genes.push(gene.to_string());
39 let before = counts.len();
40 for f in fields {
41 counts.push(f.parse::<f64>().map_err(|_| {
42 RsomicsError::InvalidInput(format!("non-numeric count '{f}' for gene {gene}"))
43 })?);
44 }
45 if counts.len() - before != n_samples {
46 return Err(RsomicsError::InvalidInput(format!(
47 "gene {gene}: {} values, header has {n_samples} samples",
48 counts.len() - before
49 )));
50 }
51 }
52 Ok(Self {
53 header,
54 genes,
55 counts,
56 n_samples,
57 })
58 }
59
60 pub fn n_genes(&self) -> usize {
61 self.genes.len()
62 }
63}
64
65fn load_norm_factors(path: &Path, n_samples: usize) -> Result<Vec<f64>> {
66 let file = File::open(path)
67 .map_err(|e| RsomicsError::InvalidInput(format!("{}: {e}", path.display())))?;
68 let mut factors = Vec::with_capacity(n_samples);
69 for line in BufReader::new(file).lines() {
70 let line = line.map_err(RsomicsError::Io)?;
71 let line = line.trim();
72 if line.is_empty() || line.starts_with('#') {
73 continue;
74 }
75 let val = line.rsplit('\t').next().unwrap_or(line);
76 factors.push(
77 val.parse::<f64>().map_err(|_| {
78 RsomicsError::InvalidInput(format!("non-numeric norm factor '{val}'"))
79 })?,
80 );
81 }
82 if factors.len() != n_samples {
83 return Err(RsomicsError::InvalidInput(format!(
84 "{} norm factors for {n_samples} samples",
85 factors.len()
86 )));
87 }
88 Ok(factors)
89}
90
91fn library_sizes(m: &Matrix, norm_factors: Option<&[f64]>) -> Vec<f64> {
92 let mut sizes = vec![0.0f64; m.n_samples];
93 for row in m.counts.chunks_exact(m.n_samples) {
94 for (s, &c) in sizes.iter_mut().zip(row) {
95 *s += c;
96 }
97 }
98 if let Some(nf) = norm_factors {
99 for (s, &f) in sizes.iter_mut().zip(nf) {
100 *s *= f;
101 }
102 }
103 sizes
104}
105
106pub struct AveLogCpmOpts {
107 pub prior_count: f64,
108 pub dispersion: f64,
109}
110
111const MAXIT: usize = 50;
112const TOL: f64 = 1e-10;
113const LOG2: f64 = std::f64::consts::LN_2;
114
115fn fit_one_group(row: &[f64], adj_lib: &[f64], total_lib: f64, dispersion: f64) -> f64 {
119 let total_count: f64 = row.iter().sum();
120 if total_count == 0.0 {
121 return f64::NEG_INFINITY;
122 }
123 let mut beta = (total_count / total_lib).ln();
124 for _ in 0..MAXIT {
125 let mut dl = 0.0;
126 let mut info = 0.0;
127 for (&y, &lib) in row.iter().zip(adj_lib) {
128 let mu = beta.exp() * lib;
129 let denom = 1.0 + mu * dispersion;
130 dl += (y - mu) / denom;
131 info += mu / denom;
132 }
133 let step = dl / info;
134 beta += step;
135 if step.abs() < TOL {
136 break;
137 }
138 }
139 beta
140}
141
142pub fn ave_log_cpm(
143 counts_path: &Path,
144 norm_factors_path: Option<&Path>,
145 opts: &AveLogCpmOpts,
146 output: &mut dyn Write,
147) -> Result<u64> {
148 let m = Matrix::load(counts_path)?;
149 let norm_factors = match norm_factors_path {
150 Some(p) => Some(load_norm_factors(p, m.n_samples)?),
151 None => None,
152 };
153 let lib = library_sizes(&m, norm_factors.as_deref());
154
155 let gene_col = m.header.split('\t').next().unwrap_or("gene");
156
157 let mean_lib = lib.iter().sum::<f64>() / lib.len() as f64;
161 let scaled_prior: Vec<f64> = lib
162 .iter()
163 .map(|&l| opts.prior_count * l / mean_lib)
164 .collect();
165 let adj_lib: Vec<f64> = lib
166 .iter()
167 .zip(&scaled_prior)
168 .map(|(&l, &p)| l + 2.0 * p)
169 .collect();
170 let total_adj_lib: f64 = adj_lib.iter().sum();
171 let cpm_shift = 1e6f64.ln();
172
173 let mut out = BufWriter::new(output);
174 writeln!(out, "{gene_col}\tAveLogCPM").map_err(RsomicsError::Io)?;
175
176 let mut row_aug = vec![0.0f64; m.n_samples];
177 for (gene, row) in m.genes.iter().zip(m.counts.chunks_exact(m.n_samples)) {
178 for ((dst, &c), &p) in row_aug.iter_mut().zip(row).zip(&scaled_prior) {
179 *dst = c + p;
180 }
181 let beta = fit_one_group(&row_aug, &adj_lib, total_adj_lib, opts.dispersion);
182 let ave = (beta + cpm_shift) / LOG2;
183 writeln!(out, "{gene}\t{}", fmt(ave)).map_err(RsomicsError::Io)?;
184 }
185 out.flush().map_err(RsomicsError::Io)?;
186 Ok(m.n_genes() as u64)
187}
188
189fn fmt(v: f64) -> String {
190 format!("{v:.6}")
191}
192
193#[cfg(test)]
194mod tests {
195 use super::*;
196
197 fn mat() -> Matrix {
198 Matrix {
199 header: "gene\ts1\ts2\ts3".into(),
200 genes: vec!["g1".into(), "g2".into(), "g3".into(), "g4".into()],
201 counts: vec![
202 0.0, 10.0, 20.0, 5.0, 0.0, 100.0, 50.0, 1.0, 3.0, 0.0, 0.0, 7.0,
203 ],
204 n_samples: 3,
205 }
206 }
207
208 #[test]
209 fn lib_sizes_are_column_sums() {
210 let m = mat();
211 assert_eq!(library_sizes(&m, None), vec![55.0, 11.0, 130.0]);
212 }
213
214 #[test]
215 fn all_zero_row_is_handled() {
216 let row = [0.0, 0.0, 0.0];
217 let lib = [100.0, 100.0, 100.0];
218 let beta = fit_one_group(&row, &lib, 300.0, 0.05);
219 assert!(beta.is_infinite() && beta < 0.0);
220 }
221
222 #[test]
224 fn poisson_limit_is_closed_form() {
225 let row = [4.0, 8.0, 12.0];
226 let lib = [100.0, 200.0, 300.0];
227 let beta = fit_one_group(&row, &lib, 600.0, 0.0);
228 let expect = (24.0f64 / 600.0).ln();
229 assert!((beta - expect).abs() < 1e-9, "{beta} vs {expect}");
230 }
231
232 #[test]
234 fn ave_log_cpm_matches_edger() {
235 let m = mat();
236 let lib = library_sizes(&m, None);
237 let mean_lib = lib.iter().sum::<f64>() / lib.len() as f64;
238 let scaled: Vec<f64> = lib.iter().map(|&l| 2.0 * l / mean_lib).collect();
239 let adj: Vec<f64> = lib
240 .iter()
241 .zip(&scaled)
242 .map(|(&l, &p)| l + 2.0 * p)
243 .collect();
244 let total: f64 = adj.iter().sum();
245 let row: Vec<f64> = m.counts[0..3]
246 .iter()
247 .zip(&scaled)
248 .map(|(&c, &p)| c + p)
249 .collect();
250 let beta = fit_one_group(&row, &adj, total, 0.05);
251 let got = (beta + 1e6f64.ln()) / std::f64::consts::LN_2;
252 let expect = 17.558323;
253 assert!((got - expect).abs() < 1e-4, "{got} vs {expect}");
254 }
255}