limma-rust 0.1.0

//! Separate-channel analysis (limma `sepchannel.R`).
//!
//! Pure-Rust port of [`design_i2m`] / [`design_i2a`] (limma's `designI2M` /
//! `designI2A`) — convert an individual-channel design matrix (rows in channel
//! order: array 1 channel 1, array 1 channel 2, array 2 channel 1, …) into the
//! design for two-colour log-ratios (`M`) or average intensities (`A`) — and of
//! [`lmsc_fit`] (`lmscFit`), the single-channel linear model fit that accounts
//! for a known intra-spot correlation.

use anyhow::{bail, Result};
use ndarray::Array2;

use crate::fit::{lmfit, MArrayLM};

/// Log-ratio (`M`) design: row `a` is `channel2 - channel1` of array `a`,
/// i.e. `(I_narrays ⊗ [-1, 1]) · design`. `design` must have an even number of
/// rows (two channels per array).
pub fn design_i2m(design: &Array2<f64>) -> Array2<f64> {
    let nrow = design.nrows();
    assert_eq!(nrow % 2, 0, "design must have two rows per array");
    let narrays = nrow / 2;
    let p = design.ncols();
    let mut out = Array2::<f64>::zeros((narrays, p));
    for a in 0..narrays {
        for j in 0..p {
            out[[a, j]] = design[[2 * a + 1, j]] - design[[2 * a, j]];
        }
    }
    out
}

/// Average-intensity (`A`) design: row `a` is the mean of the two channel rows
/// of array `a`, i.e. `(I_narrays ⊗ [0.5, 0.5]) · design`. `design` must have an
/// even number of rows.
pub fn design_i2a(design: &Array2<f64>) -> Array2<f64> {
    let nrow = design.nrows();
    assert_eq!(nrow % 2, 0, "design must have two rows per array");
    let narrays = nrow / 2;
    let p = design.ncols();
    let mut out = Array2::<f64>::zeros((narrays, p));
    for a in 0..narrays {
        for j in 0..p {
            out[[a, j]] = 0.5 * (design[[2 * a, j]] + design[[2 * a + 1, j]]);
        }
    }
    out
}

/// `exprs.MA(MA)`: extract the individual-channel log-expression matrix from
/// log-ratio (`M`) and average-intensity (`A`) matrices, each `n_genes x
/// n_arrays`. The result is `n_genes x (2*n_arrays)`, with the green channel
/// (`A - M/2`) at column `2j` and the red channel (`A + M/2`) at column `2j+1`
/// of array `j` — the channel order expected by [`design_i2m`] / [`lmsc_fit`].
pub fn exprs_ma(m: &Array2<f64>, a: &Array2<f64>) -> Array2<f64> {
    let ngenes = m.nrows();
    let narrays = m.ncols();
    Array2::from_shape_fn((ngenes, 2 * narrays), |(g, k)| {
        let j = k / 2;
        if k % 2 == 0 {
            a[[g, j]] - m[[g, j]] / 2.0
        } else {
            a[[g, j]] + m[[g, j]] / 2.0
        }
    })
}

/// `lmscFit(object, design, correlation)`: fit a single-channel linear model
/// for each gene across a series of two-colour arrays, allowing for a known
/// intra-spot correlation between the red and green channels.
///
/// * `m`, `a` — `n_genes x n_arrays` log-ratio and average-intensity matrices
///   (must be finite).
/// * `design` — `(2*n_arrays) x p` individual-channel design (rows in channel
///   order; see module docs).
/// * `correlation` — intra-spot correlation, strictly in `(-1, 1)`.
///
/// The two channels are decorrelated by the whitening scales
/// `sdM = sqrt(2(1-rho))` and `sdA = sqrt((1+rho)/2)`, then the stacked
/// `M/sdM`, `A/sdA` responses are regressed on `rbind(designM/sdM,
/// designA/sdA)` by ordinary least squares (limma reuses `lm.fit` here). The
/// returned [`MArrayLM`] carries the original individual-channel `design` and
/// `Amean = rowMeans(A)`, matching `lmscFit`.
pub fn lmsc_fit(
    m: &Array2<f64>,
    a: &Array2<f64>,
    design: &Array2<f64>,
    correlation: f64,
    gene_names: Vec<String>,
    coef_names: Vec<String>,
) -> Result<MArrayLM> {
    let ngenes = m.nrows();
    let narrays = m.ncols();
    if a.dim() != (ngenes, narrays) {
        bail!("dimensions of M and A don't match");
    }
    if m.iter().chain(a.iter()).any(|v| !v.is_finite()) {
        bail!("Missing or infinite values found in M or A");
    }
    let ny = 2 * narrays;
    if design.nrows() != ny {
        bail!(
            "the number of rows of the design matrix ({}) should be twice the number of arrays ({})",
            design.nrows(),
            narrays
        );
    }
    if correlation.is_nan() {
        bail!("intra-spot correlation is NA");
    }
    if correlation.abs() >= 1.0 {
        bail!("correlation must be strictly between -1 and 1");
    }

    let sd_m = (2.0 * (1.0 - correlation)).sqrt();
    let sd_a = ((1.0 + correlation) / 2.0).sqrt();

    // Stacked, whitened responses: genes x (2*narrays), with the M-channels
    // first and the A-channels second, matching y = rbind(t(M)/sdM, t(A)/sdA).
    let exprs = Array2::from_shape_fn((ngenes, ny), |(g, k)| {
        if k < narrays {
            m[[g, k]] / sd_m
        } else {
            a[[g, k - narrays]] / sd_a
        }
    });

    // Stacked, whitened design: X = rbind(designM/sdM, designA/sdA).
    let design_m = design_i2m(design);
    let design_a = design_i2a(design);
    let p = design.ncols();
    let x = Array2::from_shape_fn((ny, p), |(k, j)| {
        if k < narrays {
            design_m[[k, j]] / sd_m
        } else {
            design_a[[k - narrays, j]] / sd_a
        }
    });

    let mut fit = lmfit(&exprs, &x, gene_names, coef_names)?;
    // lmscFit reports Amean from the original A and keeps the channel design.
    fit.amean = (0..ngenes)
        .map(|g| (0..narrays).map(|j| a[[g, j]]).sum::<f64>() / narrays as f64)
        .collect();
    fit.design = Some(design.clone());
    Ok(fit)
}

#[cfg(test)]
mod tests {
    use super::*;
    use ndarray::array;

    fn design() -> Array2<f64> {
        array![
            [1.0, 0.5],
            [0.0, 2.0],
            [-1.0, 1.0],
            [3.0, 0.0],
            [2.0, -1.0],
            [0.5, 0.5],
        ]
    }

    #[test]
    fn design_i2m_matches_r() {
        let got = design_i2m(&design());
        let want = array![[-1.0, 1.5], [4.0, -1.0], [-1.5, 1.5]];
        assert_eq!(got, want);
    }

    #[test]
    fn design_i2a_matches_r() {
        let got = design_i2a(&design());
        let want = array![[0.5, 1.25], [1.0, 0.5], [1.25, -0.25]];
        assert_eq!(got, want);
    }

    #[test]
    fn exprs_ma_matches_r() {
        let m = array![[0.5, -1.0], [1.0, 2.0], [-0.5, 0.0]];
        let a = array![[8.0, 9.0], [10.0, 11.0], [7.0, 6.0]];
        let got = exprs_ma(&m, &a);
        // Columns interleave green (A-M/2) at 2j and red (A+M/2) at 2j+1.
        let want = array![
            [7.75, 8.25, 9.5, 8.5],
            [9.5, 10.5, 10.0, 12.0],
            [7.25, 6.75, 6.0, 6.0],
        ];
        assert_eq!(got, want);
    }

    #[allow(clippy::excessive_precision)]
    fn rclose(a: f64, b: f64) -> bool {
        (a - b).abs() <= 1e-9 * (1.0 + b.abs())
    }

    /// 6 genes x 4 arrays fixture from `scratch/lmscfit_ref.R`.
    #[allow(clippy::excessive_precision)]
    #[test]
    fn lmsc_fit_matches_r() {
        let (ngenes, narrays) = (6usize, 4usize);
        let mut m = Array2::zeros((ngenes, narrays));
        let mut a = Array2::zeros((ngenes, narrays));
        for g0 in 0..ngenes {
            for j0 in 0..narrays {
                let (g, j) = (g0 as i64, j0 as i64);
                m[[g0, j0]] = ((g * 3 + j * 2) % 7 - 3) as f64 * 0.4;
                a[[g0, j0]] = 8.0 + (g % 4) as f64 * 0.5 + ((g + j * 3) % 5) as f64 * 0.3;
            }
        }
        let design = array![
            [1.0, 0.0, 0.0],
            [1.0, 1.0, 0.0],
            [1.0, 0.0, 0.0],
            [1.0, 1.0, 0.0],
            [1.0, 0.0, 1.0],
            [1.0, 1.0, 1.0],
            [1.0, 0.0, 1.0],
            [1.0, 1.0, 1.0],
        ];
        let names = vec!["Int".into(), "Dye".into(), "Trt".into()];
        let fit = lmsc_fit(&m, &a, &design, 0.3, vec![], names).unwrap();

        let coef = [
            [
                8.4499999999999993,
                -2.0952455890385942e-15,
                0.3000000000000001,
            ],
            [
                9.3499999999999996,
                -0.20000000000000265,
                -0.44999999999999968,
            ],
            [9.1499999999999986, 0.29999999999999777, 0.3000000000000011],
            [10.049999999999999, 0.099999999999997938, 0.3000000000000001],
            [
                8.9499999999999993,
                -0.10000000000000214,
                -0.44999999999999934,
            ],
            [9.0999999999999996, -0.30000000000000238, 0.3000000000000001],
        ];
        for (g, row) in coef.iter().enumerate() {
            for (j, &c) in row.iter().enumerate() {
                assert!(rclose(fit.coefficients[[g, j]], c), "coef[{g},{j}]");
            }
        }

        let su = [0.64226162893325656, 0.5916079783099617, 0.80622577482985502];
        for g in 0..ngenes {
            for (j, &s) in su.iter().enumerate() {
                assert!(rclose(fit.stdev_unscaled[[g, j]], s), "stdev[{g},{j}]");
            }
        }

        let sigma = [
            0.84046036573632044,
            0.6907552802135184,
            0.66926234610359303,
            0.66926234610359581,
            0.71912645420875421,
            0.76575036818380149,
        ];
        for (g, &s) in sigma.iter().enumerate() {
            assert!(rclose(fit.sigma[g], s), "sigma[{g}]");
            assert!((fit.df_residual[g] - 5.0).abs() < 1e-12, "df[{g}]");
        }

        let cov = [
            [
                0.41250000000000009,
                -0.17500000000000016,
                -0.32500000000000001,
            ],
            [
                -0.17500000000000016,
                0.35000000000000009,
                8.5386183879315777e-17,
            ],
            [
                -0.32500000000000001,
                8.5386183879315777e-17,
                0.65000000000000013,
            ],
        ];
        for (i, row) in cov.iter().enumerate() {
            for (j, &c) in row.iter().enumerate() {
                assert!(rclose(fit.cov_coefficients[[i, j]], c), "cov[{i},{j}]");
            }
        }

        let amean = [
            8.5999999999999996,
            9.0250000000000004,
            9.4499999999999993,
            10.25,
            8.6750000000000007,
            9.0999999999999996,
        ];
        for (g, &am) in amean.iter().enumerate() {
            assert!(rclose(fit.amean[g], am), "amean[{g}]");
        }

        // Original channel design is preserved.
        assert_eq!(fit.design.as_ref().unwrap(), &design);
    }
}