rsomics-limma-array-weights 0.1.0

Estimate per-sample (array) quality weights by REML for a log-expression matrix + design — a clean-room Rust reimplementation of limma's arrayWeights
Documentation
use std::fs::File;
use std::io::{BufRead, BufReader};
use std::path::Path;

use rsomics_common::{Result, RsomicsError};

fn open(path: &Path) -> Result<BufReader<File>> {
    let f = File::open(path)
        .map_err(|e| RsomicsError::InvalidInput(format!("{}: {e}", path.display())))?;
    Ok(BufReader::new(f))
}

fn parse_f64(s: &str) -> Result<f64> {
    let t = s.trim();
    t.parse::<f64>()
        .map_err(|_| RsomicsError::InvalidInput(format!("non-numeric value '{t}'")))
}

pub struct Expr {
    pub samples: Vec<String>,
    /// row-major [gene][sample]
    pub y: Vec<Vec<f64>>,
}

pub fn read_expr(path: &Path) -> Result<Expr> {
    let mut lines = open(path)?.lines();
    let header = lines
        .next()
        .ok_or_else(|| RsomicsError::InvalidInput("empty expression matrix".into()))?
        .map_err(RsomicsError::Io)?;
    let samples: Vec<String> = header.split('\t').skip(1).map(str::to_string).collect();
    if samples.is_empty() {
        return Err(RsomicsError::InvalidInput(
            "expression matrix needs at least one sample column".into(),
        ));
    }
    let mut y = Vec::new();
    for line in lines {
        let line = line.map_err(RsomicsError::Io)?;
        if line.is_empty() {
            continue;
        }
        let mut f = line.split('\t');
        let gene = f
            .next()
            .ok_or_else(|| RsomicsError::InvalidInput("missing gene id".into()))?;
        let row: Vec<f64> = f.map(parse_f64).collect::<Result<_>>()?;
        if row.len() != samples.len() {
            return Err(RsomicsError::InvalidInput(format!(
                "gene '{gene}' has {} values, header declares {} samples",
                row.len(),
                samples.len()
            )));
        }
        y.push(row);
    }
    if y.is_empty() {
        return Err(RsomicsError::InvalidInput("no genes in matrix".into()));
    }
    Ok(Expr { samples, y })
}

pub struct Design {
    /// row-major [sample][coef]
    pub x: Vec<Vec<f64>>,
}

pub fn read_design(path: &Path) -> Result<Design> {
    let mut lines = open(path)?.lines();
    let header = lines
        .next()
        .ok_or_else(|| RsomicsError::InvalidInput("empty design matrix".into()))?
        .map_err(RsomicsError::Io)?;
    let coef_names: Vec<String> = header.split('\t').skip(1).map(str::to_string).collect();
    if coef_names.is_empty() {
        return Err(RsomicsError::InvalidInput(
            "design matrix needs at least one coefficient column".into(),
        ));
    }
    let mut x = Vec::new();
    for line in lines {
        let line = line.map_err(RsomicsError::Io)?;
        if line.is_empty() {
            continue;
        }
        let mut f = line.split('\t');
        let id = f
            .next()
            .ok_or_else(|| RsomicsError::InvalidInput("missing design row id".into()))?;
        let row: Vec<f64> = f.map(parse_f64).collect::<Result<_>>()?;
        if row.len() != coef_names.len() {
            return Err(RsomicsError::InvalidInput(format!(
                "design row '{id}' has {} values, header declares {} coefficients",
                row.len(),
                coef_names.len()
            )));
        }
        x.push(row);
    }
    Ok(Design { x })
}