limma-rust 0.1.0

Pure-Rust port of the Bioconductor limma differential-expression package
Documentation
//! Averaging over replicate probes (`avereps`) or replicate arrays
//! (`avearrays`).
//!
//! Pure-Rust port of limma's `avereps.default` and `avearrays.default` for
//! numeric matrices: an NA-aware mean over the rows ([`avereps`]) or columns
//! ([`avearrays`]) sharing an ID, with one group per unique ID in order of
//! first appearance. NaN entries are treated as missing (dropped from both the
//! sum and the count), matching R's `rowsum(..., na.rm=TRUE)` divided by the
//! per-group count of non-missing values. The weighted `avearrays` branch
//! (which limma routes through `lmFit`) is not ported; only the unweighted
//! default is provided.

use ndarray::Array2;
use std::collections::HashMap;
use std::hash::Hash;

/// Map each element of `id` to a group index, preserving first-appearance
/// order. Returns the per-element group indices and the unique IDs.
fn group_ids<T: Eq + Hash + Clone>(id: &[T]) -> (Vec<usize>, Vec<T>) {
    let mut order: Vec<T> = Vec::new();
    let mut index: HashMap<&T, usize> = HashMap::new();
    let mut group_of = Vec::with_capacity(id.len());
    for t in id {
        let g = *index.entry(t).or_insert_with(|| {
            order.push(t.clone());
            order.len() - 1
        });
        group_of.push(g);
    }
    (group_of, order)
}

/// `avereps(x, ID)`: average the rows of `x` that share an ID, returning the
/// collapsed matrix (one row per unique ID, in first-appearance order) and the
/// unique IDs. `id` must have one entry per row of `x`. NaN entries are treated
/// as missing; a group/column that is entirely missing yields NaN.
pub fn avereps<T: Eq + Hash + Clone>(x: &Array2<f64>, id: &[T]) -> (Array2<f64>, Vec<T>) {
    let (nr, nc) = x.dim();
    assert_eq!(id.len(), nr, "ID length must equal the number of rows");
    let (group_of, order) = group_ids(id);
    let ng = order.len();

    let mut sum = Array2::<f64>::zeros((ng, nc));
    let mut cnt = Array2::<f64>::zeros((ng, nc));
    for r in 0..nr {
        let g = group_of[r];
        for c in 0..nc {
            let v = x[[r, c]];
            if !v.is_nan() {
                sum[[g, c]] += v;
                cnt[[g, c]] += 1.0;
            }
        }
    }
    sum.zip_mut_with(&cnt, |s, &n| *s /= n);
    (sum, order)
}

/// `avearrays(x, ID)`: average the columns of `x` that share an ID, returning
/// the collapsed matrix (one column per unique ID, in first-appearance order)
/// and the unique IDs. `id` must have one entry per column of `x`. NaN entries
/// are treated as missing; a group/row that is entirely missing yields NaN.
pub fn avearrays<T: Eq + Hash + Clone>(x: &Array2<f64>, id: &[T]) -> (Array2<f64>, Vec<T>) {
    let (nr, nc) = x.dim();
    assert_eq!(id.len(), nc, "ID length must equal the number of columns");
    let (group_of, order) = group_ids(id);
    let ng = order.len();

    let mut sum = Array2::<f64>::zeros((nr, ng));
    let mut cnt = Array2::<f64>::zeros((nr, ng));
    for c in 0..nc {
        let g = group_of[c];
        for r in 0..nr {
            let v = x[[r, c]];
            if !v.is_nan() {
                sum[[r, g]] += v;
                cnt[[r, g]] += 1.0;
            }
        }
    }
    sum.zip_mut_with(&cnt, |s, &n| *s /= n);
    (sum, order)
}

#[cfg(test)]
mod tests {
    use super::*;
    use ndarray::array;

    const NA: f64 = f64::NAN;

    fn eq(a: &Array2<f64>, b: &Array2<f64>) -> bool {
        a.dim() == b.dim() && a.iter().zip(b.iter()).all(|(&x, &y)| (x - y).abs() < 1e-12)
    }

    #[test]
    fn avereps_matches_r() {
        // 6 rows, IDs a,b,a,c,b,a with one NA in the second column of an "a" row.
        let x = array![
            [1.0, 2.0, 3.0],
            [4.0, 5.0, 6.0],
            [7.0, NA, 9.0],
            [10.0, 11.0, 12.0],
            [13.0, 14.0, 15.0],
            [16.0, 17.0, 18.0],
        ];
        let id = ["a", "b", "a", "c", "b", "a"];
        let (y, ids) = avereps(&x, &id);
        // Reference: avereps(x, ID) in limma 3.68.3.
        let want = array![
            [8.0, 9.5, 10.0],   // a: rows 0,2,5 (col1 averages 2 & 17 only)
            [8.5, 9.5, 10.5],   // b: rows 1,4
            [10.0, 11.0, 12.0], // c: row 3
        ];
        assert_eq!(ids, vec!["a", "b", "c"]);
        assert!(eq(&y, &want), "got {y:?}");
    }

    #[test]
    fn avearrays_matches_r() {
        // 3 rows, 6 columns; column IDs a,b,a,c,b,a (transpose of the avereps case).
        let x = array![
            [1.0, 4.0, 7.0, 10.0, 13.0, 16.0],
            [2.0, 5.0, NA, 11.0, 14.0, 17.0],
            [3.0, 6.0, 9.0, 12.0, 15.0, 18.0],
        ];
        let id = ["a", "b", "a", "c", "b", "a"];
        let (y, ids) = avearrays(&x, &id);
        // Reference: avearrays(x, ID) in limma 3.68.3.
        let want = array![[8.0, 8.5, 10.0], [9.5, 9.5, 11.0], [10.0, 10.5, 12.0],];
        assert_eq!(ids, vec!["a", "b", "c"]);
        assert!(eq(&y, &want), "got {y:?}");
    }

    #[test]
    fn avereps_no_duplicates_is_identity() {
        let x = array![[1.0, 2.0], [3.0, 4.0]];
        let id = ["p", "q"];
        let (y, ids) = avereps(&x, &id);
        assert!(eq(&y, &x));
        assert_eq!(ids, vec!["p", "q"]);
    }
}