use std::fs::File;
use std::io::{BufRead, BufReader, BufWriter, Write};
use std::path::Path;
use rsomics_common::{Result, RsomicsError};
pub struct Matrix {
pub header: String,
pub genes: Vec<String>,
pub counts: Vec<f64>,
pub n_samples: usize,
}
impl Matrix {
pub fn load(path: &Path) -> Result<Self> {
let file = File::open(path)
.map_err(|e| RsomicsError::InvalidInput(format!("{}: {e}", path.display())))?;
let reader = BufReader::new(file);
let mut lines = reader.lines();
let header = lines
.next()
.ok_or_else(|| RsomicsError::InvalidInput("empty count matrix".into()))?
.map_err(RsomicsError::Io)?;
let n_samples = header.split('\t').count() - 1;
let mut genes = Vec::new();
let mut counts = Vec::new();
for line in lines {
let line = line.map_err(RsomicsError::Io)?;
if line.is_empty() {
continue;
}
let mut fields = line.split('\t');
let gene = fields
.next()
.ok_or_else(|| RsomicsError::InvalidInput("row without a gene id".into()))?;
genes.push(gene.to_string());
let before = counts.len();
for f in fields {
counts.push(f.parse::<f64>().map_err(|_| {
RsomicsError::InvalidInput(format!("non-numeric count '{f}' for gene {gene}"))
})?);
}
if counts.len() - before != n_samples {
return Err(RsomicsError::InvalidInput(format!(
"gene {gene}: {} values, header has {n_samples} samples",
counts.len() - before
)));
}
}
Ok(Self {
header,
genes,
counts,
n_samples,
})
}
pub fn n_genes(&self) -> usize {
self.genes.len()
}
}
fn load_norm_factors(path: &Path, n_samples: usize) -> Result<Vec<f64>> {
let file = File::open(path)
.map_err(|e| RsomicsError::InvalidInput(format!("{}: {e}", path.display())))?;
let mut factors = Vec::with_capacity(n_samples);
for line in BufReader::new(file).lines() {
let line = line.map_err(RsomicsError::Io)?;
let line = line.trim();
if line.is_empty() || line.starts_with('#') {
continue;
}
let val = line.rsplit('\t').next().unwrap_or(line);
factors.push(
val.parse::<f64>().map_err(|_| {
RsomicsError::InvalidInput(format!("non-numeric norm factor '{val}'"))
})?,
);
}
if factors.len() != n_samples {
return Err(RsomicsError::InvalidInput(format!(
"{} norm factors for {n_samples} samples",
factors.len()
)));
}
Ok(factors)
}
fn library_sizes(m: &Matrix, norm_factors: Option<&[f64]>) -> Vec<f64> {
let mut sizes = vec![0.0f64; m.n_samples];
for row in m.counts.chunks_exact(m.n_samples) {
for (s, &c) in sizes.iter_mut().zip(row) {
*s += c;
}
}
if let Some(nf) = norm_factors {
for (s, &f) in sizes.iter_mut().zip(nf) {
*s *= f;
}
}
sizes
}
pub struct AveLogCpmOpts {
pub prior_count: f64,
pub dispersion: f64,
}
const MAXIT: usize = 50;
const TOL: f64 = 1e-10;
const LOG2: f64 = std::f64::consts::LN_2;
fn fit_one_group(row: &[f64], adj_lib: &[f64], total_lib: f64, dispersion: f64) -> f64 {
let total_count: f64 = row.iter().sum();
if total_count == 0.0 {
return f64::NEG_INFINITY;
}
let mut beta = (total_count / total_lib).ln();
for _ in 0..MAXIT {
let mut dl = 0.0;
let mut info = 0.0;
for (&y, &lib) in row.iter().zip(adj_lib) {
let mu = beta.exp() * lib;
let denom = 1.0 + mu * dispersion;
dl += (y - mu) / denom;
info += mu / denom;
}
let step = dl / info;
beta += step;
if step.abs() < TOL {
break;
}
}
beta
}
pub fn ave_log_cpm(
counts_path: &Path,
norm_factors_path: Option<&Path>,
opts: &AveLogCpmOpts,
output: &mut dyn Write,
) -> Result<u64> {
let m = Matrix::load(counts_path)?;
let norm_factors = match norm_factors_path {
Some(p) => Some(load_norm_factors(p, m.n_samples)?),
None => None,
};
let lib = library_sizes(&m, norm_factors.as_deref());
let gene_col = m.header.split('\t').next().unwrap_or("gene");
let mean_lib = lib.iter().sum::<f64>() / lib.len() as f64;
let scaled_prior: Vec<f64> = lib
.iter()
.map(|&l| opts.prior_count * l / mean_lib)
.collect();
let adj_lib: Vec<f64> = lib
.iter()
.zip(&scaled_prior)
.map(|(&l, &p)| l + 2.0 * p)
.collect();
let total_adj_lib: f64 = adj_lib.iter().sum();
let cpm_shift = 1e6f64.ln();
let mut out = BufWriter::new(output);
writeln!(out, "{gene_col}\tAveLogCPM").map_err(RsomicsError::Io)?;
let mut row_aug = vec![0.0f64; m.n_samples];
for (gene, row) in m.genes.iter().zip(m.counts.chunks_exact(m.n_samples)) {
for ((dst, &c), &p) in row_aug.iter_mut().zip(row).zip(&scaled_prior) {
*dst = c + p;
}
let beta = fit_one_group(&row_aug, &adj_lib, total_adj_lib, opts.dispersion);
let ave = (beta + cpm_shift) / LOG2;
writeln!(out, "{gene}\t{}", fmt(ave)).map_err(RsomicsError::Io)?;
}
out.flush().map_err(RsomicsError::Io)?;
Ok(m.n_genes() as u64)
}
fn fmt(v: f64) -> String {
format!("{v:.6}")
}
#[cfg(test)]
mod tests {
use super::*;
fn mat() -> Matrix {
Matrix {
header: "gene\ts1\ts2\ts3".into(),
genes: vec!["g1".into(), "g2".into(), "g3".into(), "g4".into()],
counts: vec![
0.0, 10.0, 20.0, 5.0, 0.0, 100.0, 50.0, 1.0, 3.0, 0.0, 0.0, 7.0,
],
n_samples: 3,
}
}
#[test]
fn lib_sizes_are_column_sums() {
let m = mat();
assert_eq!(library_sizes(&m, None), vec![55.0, 11.0, 130.0]);
}
#[test]
fn all_zero_row_is_handled() {
let row = [0.0, 0.0, 0.0];
let lib = [100.0, 100.0, 100.0];
let beta = fit_one_group(&row, &lib, 300.0, 0.05);
assert!(beta.is_infinite() && beta < 0.0);
}
#[test]
fn poisson_limit_is_closed_form() {
let row = [4.0, 8.0, 12.0];
let lib = [100.0, 200.0, 300.0];
let beta = fit_one_group(&row, &lib, 600.0, 0.0);
let expect = (24.0f64 / 600.0).ln();
assert!((beta - expect).abs() < 1e-9, "{beta} vs {expect}");
}
#[test]
fn ave_log_cpm_matches_edger() {
let m = mat();
let lib = library_sizes(&m, None);
let mean_lib = lib.iter().sum::<f64>() / lib.len() as f64;
let scaled: Vec<f64> = lib.iter().map(|&l| 2.0 * l / mean_lib).collect();
let adj: Vec<f64> = lib
.iter()
.zip(&scaled)
.map(|(&l, &p)| l + 2.0 * p)
.collect();
let total: f64 = adj.iter().sum();
let row: Vec<f64> = m.counts[0..3]
.iter()
.zip(&scaled)
.map(|(&c, &p)| c + p)
.collect();
let beta = fit_one_group(&row, &adj, total, 0.05);
let got = (beta + 1e6f64.ln()) / std::f64::consts::LN_2;
let expect = 17.558323;
assert!((got - expect).abs() < 1e-4, "{got} vs {expect}");
}
}