rsomics-avelogcpm 0.1.0

Per-gene average log2-CPM of a count matrix via the edgeR aveLogCPM one-group negative-binomial fit
Documentation
use std::fs::File;
use std::io::{BufRead, BufReader, BufWriter, Write};
use std::path::Path;

use rsomics_common::{Result, RsomicsError};

pub struct Matrix {
    pub header: String,
    pub genes: Vec<String>,
    pub counts: Vec<f64>,
    pub n_samples: usize,
}

impl Matrix {
    pub fn load(path: &Path) -> Result<Self> {
        let file = File::open(path)
            .map_err(|e| RsomicsError::InvalidInput(format!("{}: {e}", path.display())))?;
        let reader = BufReader::new(file);
        let mut lines = reader.lines();

        let header = lines
            .next()
            .ok_or_else(|| RsomicsError::InvalidInput("empty count matrix".into()))?
            .map_err(RsomicsError::Io)?;
        let n_samples = header.split('\t').count() - 1;

        let mut genes = Vec::new();
        let mut counts = Vec::new();
        for line in lines {
            let line = line.map_err(RsomicsError::Io)?;
            if line.is_empty() {
                continue;
            }
            let mut fields = line.split('\t');
            let gene = fields
                .next()
                .ok_or_else(|| RsomicsError::InvalidInput("row without a gene id".into()))?;
            genes.push(gene.to_string());
            let before = counts.len();
            for f in fields {
                counts.push(f.parse::<f64>().map_err(|_| {
                    RsomicsError::InvalidInput(format!("non-numeric count '{f}' for gene {gene}"))
                })?);
            }
            if counts.len() - before != n_samples {
                return Err(RsomicsError::InvalidInput(format!(
                    "gene {gene}: {} values, header has {n_samples} samples",
                    counts.len() - before
                )));
            }
        }
        Ok(Self {
            header,
            genes,
            counts,
            n_samples,
        })
    }

    pub fn n_genes(&self) -> usize {
        self.genes.len()
    }
}

fn load_norm_factors(path: &Path, n_samples: usize) -> Result<Vec<f64>> {
    let file = File::open(path)
        .map_err(|e| RsomicsError::InvalidInput(format!("{}: {e}", path.display())))?;
    let mut factors = Vec::with_capacity(n_samples);
    for line in BufReader::new(file).lines() {
        let line = line.map_err(RsomicsError::Io)?;
        let line = line.trim();
        if line.is_empty() || line.starts_with('#') {
            continue;
        }
        let val = line.rsplit('\t').next().unwrap_or(line);
        factors.push(
            val.parse::<f64>().map_err(|_| {
                RsomicsError::InvalidInput(format!("non-numeric norm factor '{val}'"))
            })?,
        );
    }
    if factors.len() != n_samples {
        return Err(RsomicsError::InvalidInput(format!(
            "{} norm factors for {n_samples} samples",
            factors.len()
        )));
    }
    Ok(factors)
}

fn library_sizes(m: &Matrix, norm_factors: Option<&[f64]>) -> Vec<f64> {
    let mut sizes = vec![0.0f64; m.n_samples];
    for row in m.counts.chunks_exact(m.n_samples) {
        for (s, &c) in sizes.iter_mut().zip(row) {
            *s += c;
        }
    }
    if let Some(nf) = norm_factors {
        for (s, &f) in sizes.iter_mut().zip(nf) {
            *s *= f;
        }
    }
    sizes
}

pub struct AveLogCpmOpts {
    pub prior_count: f64,
    pub dispersion: f64,
}

const MAXIT: usize = 50;
const TOL: f64 = 1e-10;
const LOG2: f64 = std::f64::consts::LN_2;

// One-group negative-binomial fit (edgeR mglmOneGroup / glm_one_group): Fisher
// scoring for the single coefficient beta where mu[j] = exp(beta) * adj_lib[j].
// Returns beta on the natural-log scale; -inf when every count is zero.
fn fit_one_group(row: &[f64], adj_lib: &[f64], total_lib: f64, dispersion: f64) -> f64 {
    let total_count: f64 = row.iter().sum();
    if total_count == 0.0 {
        return f64::NEG_INFINITY;
    }
    let mut beta = (total_count / total_lib).ln();
    for _ in 0..MAXIT {
        let mut dl = 0.0;
        let mut info = 0.0;
        for (&y, &lib) in row.iter().zip(adj_lib) {
            let mu = beta.exp() * lib;
            let denom = 1.0 + mu * dispersion;
            dl += (y - mu) / denom;
            info += mu / denom;
        }
        let step = dl / info;
        beta += step;
        if step.abs() < TOL {
            break;
        }
    }
    beta
}

pub fn ave_log_cpm(
    counts_path: &Path,
    norm_factors_path: Option<&Path>,
    opts: &AveLogCpmOpts,
    output: &mut dyn Write,
) -> Result<u64> {
    let m = Matrix::load(counts_path)?;
    let norm_factors = match norm_factors_path {
        Some(p) => Some(load_norm_factors(p, m.n_samples)?),
        None => None,
    };
    let lib = library_sizes(&m, norm_factors.as_deref());

    let gene_col = m.header.split('\t').next().unwrap_or("gene");

    // addPriorCount: the prior is scaled to each library's share of the mean
    // size, added to every count, and twice the scaled prior is added to the
    // library size before the log offset.
    let mean_lib = lib.iter().sum::<f64>() / lib.len() as f64;
    let scaled_prior: Vec<f64> = lib
        .iter()
        .map(|&l| opts.prior_count * l / mean_lib)
        .collect();
    let adj_lib: Vec<f64> = lib
        .iter()
        .zip(&scaled_prior)
        .map(|(&l, &p)| l + 2.0 * p)
        .collect();
    let total_adj_lib: f64 = adj_lib.iter().sum();
    let cpm_shift = 1e6f64.ln();

    let mut out = BufWriter::new(output);
    writeln!(out, "{gene_col}\tAveLogCPM").map_err(RsomicsError::Io)?;

    let mut row_aug = vec![0.0f64; m.n_samples];
    for (gene, row) in m.genes.iter().zip(m.counts.chunks_exact(m.n_samples)) {
        for ((dst, &c), &p) in row_aug.iter_mut().zip(row).zip(&scaled_prior) {
            *dst = c + p;
        }
        let beta = fit_one_group(&row_aug, &adj_lib, total_adj_lib, opts.dispersion);
        let ave = (beta + cpm_shift) / LOG2;
        writeln!(out, "{gene}\t{}", fmt(ave)).map_err(RsomicsError::Io)?;
    }
    out.flush().map_err(RsomicsError::Io)?;
    Ok(m.n_genes() as u64)
}

fn fmt(v: f64) -> String {
    format!("{v:.6}")
}

#[cfg(test)]
mod tests {
    use super::*;

    fn mat() -> Matrix {
        Matrix {
            header: "gene\ts1\ts2\ts3".into(),
            genes: vec!["g1".into(), "g2".into(), "g3".into(), "g4".into()],
            counts: vec![
                0.0, 10.0, 20.0, 5.0, 0.0, 100.0, 50.0, 1.0, 3.0, 0.0, 0.0, 7.0,
            ],
            n_samples: 3,
        }
    }

    #[test]
    fn lib_sizes_are_column_sums() {
        let m = mat();
        assert_eq!(library_sizes(&m, None), vec![55.0, 11.0, 130.0]);
    }

    #[test]
    fn all_zero_row_is_handled() {
        let row = [0.0, 0.0, 0.0];
        let lib = [100.0, 100.0, 100.0];
        let beta = fit_one_group(&row, &lib, 300.0, 0.05);
        assert!(beta.is_infinite() && beta < 0.0);
    }

    // Poisson limit (dispersion -> 0): beta is the closed-form log(sum y / sum lib).
    #[test]
    fn poisson_limit_is_closed_form() {
        let row = [4.0, 8.0, 12.0];
        let lib = [100.0, 200.0, 300.0];
        let beta = fit_one_group(&row, &lib, 600.0, 0.0);
        let expect = (24.0f64 / 600.0).ln();
        assert!((beta - expect).abs() < 1e-9, "{beta} vs {expect}");
    }

    // edgeR aveLogCPM(y, prior.count=2, dispersion=0.05) probed directly in R.
    #[test]
    fn ave_log_cpm_matches_edger() {
        let m = mat();
        let lib = library_sizes(&m, None);
        let mean_lib = lib.iter().sum::<f64>() / lib.len() as f64;
        let scaled: Vec<f64> = lib.iter().map(|&l| 2.0 * l / mean_lib).collect();
        let adj: Vec<f64> = lib
            .iter()
            .zip(&scaled)
            .map(|(&l, &p)| l + 2.0 * p)
            .collect();
        let total: f64 = adj.iter().sum();
        let row: Vec<f64> = m.counts[0..3]
            .iter()
            .zip(&scaled)
            .map(|(&c, &p)| c + p)
            .collect();
        let beta = fit_one_group(&row, &adj, total, 0.05);
        let got = (beta + 1e6f64.ln()) / std::f64::consts::LN_2;
        let expect = 17.558323;
        assert!((got - expect).abs() < 1e-4, "{got} vs {expect}");
    }
}