limma-rust 0.1.0

Pure-Rust port of the Bioconductor limma differential-expression package
Documentation
//! Port of limma's `modelMatrix` / `uniqueTargets` (modelmatrix.R): construct
//! the design matrix for a two-color (Cy3/Cy5) microarray experiment from a
//! table of target names. Each array contributes a log-ratio Cy5 − Cy3, and
//! the design expresses those log-ratios in terms of either a common-reference
//! parametrization or a caller-supplied parameter matrix.
//!
//! Target names are sorted by byte order (matching R's `sort()` for the usual
//! ASCII target labels); the `makeContrasts` companion lives in
//! [`crate::contrasts`].

use anyhow::{bail, Result};
use ndarray::Array2;

use crate::linalg::{qr_econ, solve_upper};

/// Sorted, de-duplicated target names across the Cy3 and Cy5 columns — the
/// `uniqueTargets` helper.
pub fn unique_targets(cy3: &[&str], cy5: &[&str]) -> Vec<String> {
    let mut names: Vec<String> = cy3
        .iter()
        .chain(cy5.iter())
        .map(|s| s.to_string())
        .collect();
    names.sort();
    names.dedup();
    names
}

/// How a two-color design's coefficients are defined.
pub enum ModelParam<'a> {
    /// A common reference target: coefficients are the log-ratios of every
    /// other target against it, in sorted order (the `ref=` form of
    /// `modelMatrix`).
    Reference(&'a str),
    /// An explicit `ntargets × ncoef` parameter matrix. `target_names` labels
    /// the rows (must equal the sorted unique target names as a set) and
    /// `coef_names` labels the columns (the `parameters=` form).
    Parameters {
        matrix: Array2<f64>,
        target_names: Vec<String>,
        coef_names: Vec<String>,
    },
}

/// Two-color design matrix and its coefficient (column) names.
#[derive(Debug, Clone)]
pub struct ModelMatrix {
    /// `narrays × ncoef` design matrix, rows in input (array) order.
    pub design: Array2<f64>,
    pub coef_names: Vec<String>,
}

/// Port of `modelMatrix(targets, parameters=, ref=)`. `cy3`/`cy5` are the green
/// and red channel target names for each array. Returns the design matrix
/// `zapsmall(t(solve(crossprod(P), crossprod(P, J))), 14)`, where `J[t,a]`
/// is `+1` if target `t` is the Cy5 sample of array `a`, `-1` if it is the
/// Cy3 sample, and `P` is the parameter matrix.
pub fn model_matrix(cy3: &[&str], cy5: &[&str], param: ModelParam) -> Result<ModelMatrix> {
    let narrays = cy3.len();
    if cy5.len() != narrays {
        bail!("Cy3 and Cy5 have different lengths");
    }
    let sorted = unique_targets(cy3, cy5);

    let (parameters, target_names, coef_names) = match param {
        ModelParam::Reference(reference) => {
            if !sorted.iter().any(|t| t == reference) {
                bail!("\"{reference}\" not among the target names found");
            }
            let others: Vec<String> = sorted.into_iter().filter(|t| t != reference).collect();
            let ntargets = others.len() + 1;
            let ncoef = ntargets - 1;
            // parameters = rbind(-1, diag(ntargets-1)).
            let mut p = Array2::<f64>::zeros((ntargets, ncoef));
            for j in 0..ncoef {
                p[[0, j]] = -1.0;
                p[[j + 1, j]] = 1.0;
            }
            let mut names = Vec::with_capacity(ntargets);
            names.push(reference.to_string());
            names.extend(others.iter().cloned());
            (p, names, others)
        }
        ModelParam::Parameters {
            matrix,
            target_names,
            coef_names,
        } => {
            if matrix.nrows() != target_names.len() {
                bail!("rows of parameters don't match unique target names");
            }
            if matrix.ncols() != coef_names.len() {
                bail!("columns of parameters don't match coefficient names");
            }
            let mut a = sorted.clone();
            a.sort();
            let mut b = target_names.clone();
            b.sort();
            if a != b {
                bail!("rownames of parameters don't match unique target names");
            }
            (matrix, target_names, coef_names)
        }
    };

    let ntargets = target_names.len();
    let ncoef = parameters.ncols();

    // J[t,a] = (name_t == Cy5[a]) - (name_t == Cy3[a]).
    let mut j = Array2::<f64>::zeros((ntargets, narrays));
    for (t, name) in target_names.iter().enumerate() {
        for a in 0..narrays {
            let v = i32::from(cy5[a] == name) - i32::from(cy3[a] == name);
            j[[t, a]] = f64::from(v);
        }
    }

    // design[a, :] = (P'P)^-1 P' J[, a], solved column-by-column via QR of P.
    let (q, r) = qr_econ(&parameters);
    let mut design = Array2::<f64>::zeros((narrays, ncoef));
    for a in 0..narrays {
        let qtj = q.t().dot(&j.column(a));
        let beta = solve_upper(&r, &qtj);
        for (k, &bk) in beta.iter().enumerate() {
            design[[a, k]] = bk;
        }
    }
    zapsmall(&mut design, 14);

    Ok(ModelMatrix { design, coef_names })
}

/// In-place port of R's `zapsmall(x, digits)`: round to
/// `digits - floor(log10(max|x|))` decimal places, collapsing rounding noise
/// around the exact design entries to zero.
fn zapsmall(m: &mut Array2<f64>, digits: i32) {
    let mx = m
        .iter()
        .filter(|v| v.is_finite())
        .fold(0.0f64, |acc, &v| acc.max(v.abs()));
    let dp = if mx > 0.0 {
        (digits - mx.log10().floor() as i32).max(0)
    } else {
        digits
    };
    let factor = 10f64.powi(dp);
    for v in m.iter_mut() {
        *v = (*v * factor).round() / factor;
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use ndarray::array;

    fn check(design: &Array2<f64>, rows: &[&[f64]]) {
        assert_eq!(design.nrows(), rows.len(), "row count");
        for (a, row) in rows.iter().enumerate() {
            assert_eq!(design.ncols(), row.len(), "col count");
            for (k, &want) in row.iter().enumerate() {
                assert!(
                    (design[[a, k]] - want).abs() < 1e-10,
                    "design[{a},{k}] = {} vs {want}",
                    design[[a, k]]
                );
            }
        }
    }

    #[test]
    fn unique_targets_sorts_and_dedups() {
        let ut = unique_targets(&["Ref", "Ref", "B", "A"], &["A", "B", "Ref", "Ref"]);
        assert_eq!(ut, vec!["A", "B", "Ref"]);
    }

    /// M1: common reference (ref = "Ref"), four arrays alternating A/B vs Ref.
    #[test]
    fn model_matrix_common_reference() {
        let cy3 = ["Ref", "Ref", "Ref", "Ref"];
        let cy5 = ["A", "B", "A", "B"];
        let out = model_matrix(&cy3, &cy5, ModelParam::Reference("Ref")).unwrap();
        assert_eq!(out.coef_names, vec!["A", "B"]);
        check(
            &out.design,
            &[&[1.0, 0.0], &[0.0, 1.0], &[1.0, 0.0], &[0.0, 1.0]],
        );
    }

    /// M2: dye-swaps with the reference appearing on either channel; the
    /// reference is moved to the front so coefficients are Mut, WT.
    #[test]
    fn model_matrix_dye_swaps() {
        let cy3 = ["WT", "Mut", "Ref", "Ref", "WT"];
        let cy5 = ["Ref", "Ref", "WT", "Mut", "Mut"];
        let out = model_matrix(&cy3, &cy5, ModelParam::Reference("Ref")).unwrap();
        assert_eq!(out.coef_names, vec!["Mut", "WT"]);
        check(
            &out.design,
            &[
                &[0.0, -1.0],
                &[-1.0, 0.0],
                &[0.0, 1.0],
                &[1.0, 0.0],
                &[1.0, -1.0],
            ],
        );
    }

    /// M3: explicit parameter matrix over three targets (A, B, C).
    #[test]
    fn model_matrix_explicit_parameters() {
        let cy3 = ["A", "A", "B", "C"];
        let cy5 = ["B", "C", "C", "A"];
        let param = ModelParam::Parameters {
            matrix: array![[-1.0, -1.0], [1.0, 0.0], [0.0, 1.0]],
            target_names: vec!["A".into(), "B".into(), "C".into()],
            coef_names: vec!["B".into(), "C".into()],
        };
        let out = model_matrix(&cy3, &cy5, param).unwrap();
        assert_eq!(out.coef_names, vec!["B", "C"]);
        check(
            &out.design,
            &[&[1.0, 0.0], &[0.0, 1.0], &[-1.0, 1.0], &[0.0, -1.0]],
        );
    }

    /// M4: three-target common reference (ref = "Ctl").
    #[test]
    fn model_matrix_three_treatments() {
        let cy3 = ["Ctl", "Ctl", "Ctl", "Ctl", "Ctl", "Ctl"];
        let cy5 = ["Drug1", "Drug2", "Drug3", "Drug1", "Drug2", "Drug3"];
        let out = model_matrix(&cy3, &cy5, ModelParam::Reference("Ctl")).unwrap();
        assert_eq!(out.coef_names, vec!["Drug1", "Drug2", "Drug3"]);
        check(
            &out.design,
            &[
                &[1.0, 0.0, 0.0],
                &[0.0, 1.0, 0.0],
                &[0.0, 0.0, 1.0],
                &[1.0, 0.0, 0.0],
                &[0.0, 1.0, 0.0],
                &[0.0, 0.0, 1.0],
            ],
        );
    }

    #[test]
    fn model_matrix_rejects_unknown_reference() {
        let err = model_matrix(&["A"], &["B"], ModelParam::Reference("Z")).unwrap_err();
        assert!(err.to_string().contains("not among the target names"));
    }
}