limma-rust 0.1.0

//! Between-array normalization of single-channel matrices. Port of limma's
//! `normalizeBetweenArrays` matrix path and its constituents `normalizeQuantiles`
//! (quantile), `normalizeMedianValues` (scale) and `normalizeCyclicLoess`
//! (cyclicloess). Two-colour (`RGList`/`MAList`) and `vsn` methods are out of
//! scope for the pure-Rust statistical port.

use anyhow::{bail, Result};
use ndarray::{Array1, Array2, Axis};

use crate::lowess::{approx_rule2, loess_fit_unweighted};

/// Between-array normalization method for a single-channel matrix.
///
/// Selects the routine that [`normalize_between_arrays`] dispatches to. The
/// textual names accepted by [`NormalizeMethod::parse`] are the lowercase
/// spellings of the variants.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum NormalizeMethod {
    /// No normalization: the input matrix is returned as a copy.
    None,
    /// Scale every column to a common median ([`normalize_median_values`]).
    Scale,
    /// Quantile normalization ([`normalize_quantiles`] with `ties = true`).
    Quantile,
    /// Cyclic loess ([`normalize_cyclic_loess`], `fast` variant).
    CyclicLoess,
}

impl NormalizeMethod {
    /// Convert a method name into the matching variant.
    ///
    /// Recognised names are exactly `none`, `scale`, `quantile` and
    /// `cyclicloess`; the comparison is case-sensitive and no trimming is
    /// performed.
    ///
    /// # Errors
    ///
    /// Returns an error naming the offending input when `s` is not one of the
    /// recognised names.
    pub fn parse(s: &str) -> Result<Self> {
        let method = match s {
            "cyclicloess" => Self::CyclicLoess,
            "quantile" => Self::Quantile,
            "scale" => Self::Scale,
            "none" => Self::None,
            other => {
                bail!(
                    "unknown normalize method '{other}' (expected none|scale|quantile|cyclicloess)"
                )
            }
        };
        Ok(method)
    }
}

/// Port of `normalizeBetweenArrays(object, method)` for a single-channel
/// matrix laid out as `n_genes x n_samples`.
///
/// Dispatches on `method`:
/// * `None` returns a copy of `x`;
/// * `Scale` calls [`normalize_median_values`];
/// * `Quantile` calls [`normalize_quantiles`] with tie handling enabled;
/// * `CyclicLoess` calls [`normalize_cyclic_loess`] using the `fast` variant,
///   adaptive span and three iterations.
///
/// The input is never modified; a new matrix is returned.
pub fn normalize_between_arrays(x: &Array2<f64>, method: NormalizeMethod) -> Array2<f64> {
    // Settings for the cyclic-loess path. The span is only a fallback: it is
    // ignored by the callee because the adaptive span is switched on.
    const LOESS_SPAN: f64 = 0.7;
    const LOESS_CYCLES: usize = 3;

    match method {
        NormalizeMethod::CyclicLoess => {
            normalize_cyclic_loess(x, LOESS_SPAN, true, LOESS_CYCLES, CyclicMethod::Fast)
        }
        NormalizeMethod::Quantile => normalize_quantiles(x, true),
        NormalizeMethod::Scale => normalize_median_values(x),
        NormalizeMethod::None => x.clone(),
    }
}

/// Median over the finite entries of `v`, in the manner of R's
/// `median(na.rm=TRUE)`: the middle order statistic for an odd count, the
/// mean of the two middle order statistics for an even count.
///
/// `NaN` and infinite entries are ignored. The result is `NaN` when nothing
/// finite remains (including for an empty slice).
pub(crate) fn median_finite(v: &[f64]) -> f64 {
    let mut finite: Vec<f64> = Vec::with_capacity(v.len());
    for &value in v {
        if value.is_finite() {
            finite.push(value);
        }
    }
    if finite.is_empty() {
        return f64::NAN;
    }
    // Only finite values are left, so `partial_cmp` always yields an ordering.
    finite.sort_by(|lhs, rhs| lhs.partial_cmp(rhs).unwrap());
    let upper = finite.len() / 2;
    if finite.len() % 2 == 0 {
        (finite[upper - 1] + finite[upper]) * 0.5
    } else {
        finite[upper]
    }
}

/// `normalizeMedianValues`: scale each column to a common median. Columns are
/// divided by `exp(log(median_j) - mean_j log(median))`.
pub fn normalize_median_values(x: &Array2<f64>) -> Array2<f64> {
    let n_cols = x.ncols();
    if n_cols <= 1 {
        return x.clone();
    }
    let log_med: Vec<f64> = (0..n_cols)
        .map(|j| median_finite(&x.column(j).to_vec()).ln())
        .collect();
    let mean = log_med.iter().sum::<f64>() / n_cols as f64;
    let scale: Vec<f64> = log_med.iter().map(|&lm| (lm - mean).exp()).collect();
    let mut out = x.clone();
    for (j, &s) in scale.iter().enumerate() {
        out.column_mut(j).mapv_inplace(|v| v / s);
    }
    out
}

/// `normalizeMedianAbsValues`: scale each column to a common median absolute
/// value. Identical to [`normalize_median_values`] except the per-column median
/// is taken over `abs(x)` (matching limma's `apply(abs(x), 2, median)`).
pub fn normalize_median_abs_values(x: &Array2<f64>) -> Array2<f64> {
    let n_cols = x.ncols();
    if n_cols <= 1 {
        return x.clone();
    }
    let log_med: Vec<f64> = (0..n_cols)
        .map(|j| {
            let absvals: Vec<f64> = x.column(j).iter().map(|v| v.abs()).collect();
            median_finite(&absvals).ln()
        })
        .collect();
    let mean = log_med.iter().sum::<f64>() / n_cols as f64;
    let scale: Vec<f64> = log_med.iter().map(|&lm| (lm - mean).exp()).collect();
    let mut out = x.clone();
    for (j, &s) in scale.iter().enumerate() {
        out.column_mut(j).mapv_inplace(|v| v / s);
    }
    out
}

/// 1-based ranks of the finite entries of `col`, with tied values sharing the
/// mean of the ranks they span (R's `rank(ties.method="average")` restricted
/// to the finite values).
///
/// The returned vector has the same length as `col`; positions holding `NaN`
/// or an infinity receive `NaN`.
fn rank_average_finite(col: &[f64]) -> Vec<f64> {
    // Positions of the finite entries, ordered by ascending value.
    let mut by_value: Vec<usize> = col
        .iter()
        .enumerate()
        .filter(|(_, v)| v.is_finite())
        .map(|(pos, _)| pos)
        .collect();
    by_value.sort_by(|&p, &q| col[p].partial_cmp(&col[q]).unwrap());

    let mut ranks = vec![f64::NAN; col.len()];
    let mut start = 0usize;
    while start < by_value.len() {
        // Length of the run of entries tied with the one at `start`.
        let tied_value = col[by_value[start]];
        let run_len = by_value[start..]
            .iter()
            .take_while(|&&pos| col[pos] == tied_value)
            .count();
        let end = start + run_len;
        // Mean of the 1-based ranks start+1 ..= end.
        let shared_rank = (start + 1 + end) as f64 / 2.0;
        for &pos in &by_value[start..end] {
            ranks[pos] = shared_rank;
        }
        start = end;
    }
    ranks
}

/// `normalizeQuantiles(A, ties)`: give every column the same quantiles, the
/// average of the sorted columns.
///
/// * `a` – matrix whose columns are normalized against each other.
/// * `ties` – when true, each observed value is re-mapped through its
///   ties-averaged rank, so tied inputs receive the same output; when false,
///   values are re-mapped by their position in the sorted order.
///
/// Missing (non-finite) values are allowed: a column's observed values are
/// stretched onto the full grid before averaging, and only its observed
/// entries are re-mapped, so non-finite entries are copied through unchanged.
/// A matrix with at most one column or with no rows is returned as an
/// unchanged copy. The input is never modified.
///
/// Degenerate sizes: every `count - 1` divisor is clamped to at least 1, so a
/// single-row matrix or a column with a single finite value is placed at grid
/// position 0 instead of producing a `0/0 = NaN` position.
///
/// NOTE(review): a column with no finite entries at all is still handed to
/// `approx_rule2` with empty inputs; what that helper returns in that case is
/// not visible from here — confirm before relying on it.
pub fn normalize_quantiles(a: &Array2<f64>, ties: bool) -> Array2<f64> {
    let nr = a.nrows();
    let nc = a.ncols();
    if nc <= 1 || nr == 0 {
        return a.clone();
    }
    // Target grid i = (0..nr-1)/(nr-1); the clamp turns the single-row grid
    // into [0.0] rather than [NaN].
    let grid_den = (nr - 1).max(1) as f64;
    let grid: Vec<f64> = (0..nr).map(|k| k as f64 / grid_den).collect();

    // S: each column's sorted values stretched onto the grid; order: the
    // original row index of each ascending value (used by the non-ties path);
    // nobs: number of finite entries per column.
    let mut s = Array2::<f64>::zeros((nr, nc));
    let mut order: Vec<Vec<usize>> = Vec::with_capacity(nc);
    let mut nobs: Vec<usize> = Vec::with_capacity(nc);
    for j in 0..nc {
        let col = a.column(j);
        let mut idx: Vec<usize> = (0..nr).filter(|&k| col[k].is_finite()).collect();
        // Only finite values are compared, so `partial_cmp` cannot fail.
        idx.sort_by(|&p, &q| col[p].partial_cmp(&col[q]).unwrap());
        let sorted: Vec<f64> = idx.iter().map(|&k| col[k]).collect();
        let nobsj = sorted.len();
        if nobsj == nr {
            for (k, &v) in sorted.iter().enumerate() {
                s[[k, j]] = v;
            }
        } else {
            // Stretch the nobsj sorted values onto the full grid.
            let sub_den = nobsj.saturating_sub(1).max(1) as f64;
            let sub: Vec<f64> = (0..nobsj).map(|k| k as f64 / sub_den).collect();
            for (k, &gi) in grid.iter().enumerate() {
                s[[k, j]] = approx_rule2(&sub, &sorted, gi);
            }
        }
        nobs.push(nobsj);
        order.push(idx);
    }
    // Common target quantiles: the row means of the stretched sorted columns.
    let m: Array1<f64> = s.mean_axis(Axis(1)).unwrap();
    let m_vec = m.to_vec();

    let mut out = a.clone();
    for j in 0..nc {
        let col = a.column(j);
        // Same clamp as the sub-grid above: with a single observation the
        // position is 0 rather than 0/0. `saturating_sub` keeps this safe for
        // a column with no observations, where the loops below do nothing.
        let pos_den = nobs[j].saturating_sub(1).max(1) as f64;
        if ties {
            let r = rank_average_finite(&col.to_vec());
            for (k, (&value, &rank)) in col.iter().zip(r.iter()).enumerate() {
                if value.is_finite() {
                    let pos = (rank - 1.0) / pos_den;
                    out[[k, j]] = approx_rule2(&grid, &m_vec, pos);
                }
            }
        } else if nobs[j] == nr {
            // Complete column: the rank-th smallest value takes the rank-th
            // target quantile directly.
            for (rank0, &k) in order[j].iter().enumerate() {
                out[[k, j]] = m_vec[rank0];
            }
        } else {
            for (rank0, &k) in order[j].iter().enumerate() {
                let pos = rank0 as f64 / pos_den;
                out[[k, j]] = approx_rule2(&grid, &m_vec, pos);
            }
        }
    }
    out
}

/// Cyclic-loess variant used by [`normalize_cyclic_loess`].
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum CyclicMethod {
    /// Detrend each column's deviation from the row means (one fit per
    /// column per iteration).
    Fast,
    /// Fit every pair of columns and apply half of the fitted trend to each
    /// member of the pair immediately.
    Pairs,
    /// Fit every pair of columns, accumulate the fitted trends, and apply the
    /// accumulated adjustment divided by the number of columns once per
    /// iteration.
    Affy,
}

/// `normalizeCyclicLoess`: iteratively detrend each column's deviation from the
/// row means (`fast`) or pairwise MA differences (`pairs`/`affy`) with a LOWESS
/// fit (`loessFit`, weights = NULL).
///
/// * `x` – matrix whose columns are normalized against each other.
/// * `span` – LOWESS span; used only when `adaptive_span` is false.
/// * `adaptive_span` – when true the span is chosen by
///   `chooseLowessSpan(nrow, 50, 0.3, 1/3)` and `span` is ignored.
/// * `iterations` – number of full cycles over the columns / column pairs.
/// * `method` – which [`CyclicMethod`] variant to run.
///
/// Returns the normalized matrix; the input is never modified. With `pairs`
/// or `affy` a matrix with fewer than two columns (including zero columns)
/// has no pairs to fit and is returned as an unchanged copy.
pub fn normalize_cyclic_loess(
    x: &Array2<f64>,
    span: f64,
    adaptive_span: bool,
    iterations: usize,
    method: CyclicMethod,
) -> Array2<f64> {
    let nr = x.nrows();
    let n = x.ncols();
    let span = if adaptive_span {
        crate::voom::choose_lowess_span(nr, 50.0, 0.3, 1.0 / 3.0)
    } else {
        span
    };
    // Upper bound for the first index of a column pair. `saturating_sub`
    // keeps a zero-column matrix from underflowing `n - 1` (a panic in debug
    // builds, a wrapped-around loop bound in release builds).
    let last_first = n.saturating_sub(1);
    let mut x = x.clone();
    // MA scratch reused across every pair/column/iteration (refilled in place):
    // the Pairs/Affy paths otherwise allocate two length-nr Vecs per array pair
    // per iteration, i.e. O(n^2) allocations.
    let mut m = vec![0.0f64; nr];
    let mut a = vec![0.0f64; nr];
    match method {
        CyclicMethod::Fast => {
            for _ in 0..iterations {
                // A: row means over the finite entries, recomputed per cycle.
                for (g, ag) in a.iter_mut().enumerate() {
                    *ag = row_nanmean(&x, g);
                }
                for i in 0..n {
                    // M: deviation of column i from the row means.
                    for g in 0..nr {
                        m[g] = x[[g, i]] - a[g];
                    }
                    let f = loess_fit_unweighted(&m, &a, span, 4).0;
                    for g in 0..nr {
                        x[[g, i]] -= f[g];
                    }
                }
            }
        }
        CyclicMethod::Pairs => {
            for _ in 0..iterations {
                for i in 0..last_first {
                    for j in i + 1..n {
                        for g in 0..nr {
                            m[g] = x[[g, j]] - x[[g, i]];
                            a[g] = 0.5 * (x[[g, j]] + x[[g, i]]);
                        }
                        let f = loess_fit_unweighted(&m, &a, span, 4).0;
                        // Split the fitted trend evenly between the two
                        // columns; later pairs see the updated values.
                        for g in 0..nr {
                            x[[g, i]] += f[g] / 2.0;
                            x[[g, j]] -= f[g] / 2.0;
                        }
                    }
                }
            }
        }
        CyclicMethod::Affy => {
            for _ in 0..iterations {
                // Trends are accumulated over all pairs and applied only
                // after the pair loop, so every fit in a cycle uses the
                // values from the start of that cycle.
                let mut adjustment = Array2::<f64>::zeros((nr, n));
                for i in 0..last_first {
                    for j in i + 1..n {
                        for g in 0..nr {
                            m[g] = x[[g, j]] - x[[g, i]];
                            a[g] = 0.5 * (x[[g, j]] + x[[g, i]]);
                        }
                        let f = loess_fit_unweighted(&m, &a, span, 4).0;
                        for g in 0..nr {
                            adjustment[[g, j]] += f[g];
                            adjustment[[g, i]] -= f[g];
                        }
                    }
                }
                for g in 0..nr {
                    for c in 0..n {
                        x[[g, c]] -= adjustment[[g, c]] / n as f64;
                    }
                }
            }
        }
    }
    x
}

/// Mean of the finite entries in row `g` of `x`; `NaN` when the row holds no
/// finite entry (which includes a row of zero length).
fn row_nanmean(x: &Array2<f64>, g: usize) -> f64 {
    // Accumulate sum and count together, in row order.
    let (total, count) = x
        .row(g)
        .iter()
        .filter(|v| v.is_finite())
        .fold((0.0_f64, 0usize), |(acc, seen), &v| (acc + v, seen + 1));
    match count {
        0 => f64::NAN,
        _ => total / count as f64,
    }
}

// Unit tests comparing each normalization routine against stored reference
// matrices, plus a few structural checks (missing values, single column).
#[cfg(test)]
mod tests {
    use super::*;
    use ndarray::array;

    /// Complete 6 x 4 matrix (no missing values) shared by most tests.
    fn fixture() -> Array2<f64> {
        array![
            [5.1, 4.8, 6.2, 5.5],
            [2.3, 3.1, 2.8, 3.5],
            [7.7, 7.2, 8.1, 6.9],
            [1.1, 0.9, 1.4, 1.2],
            [9.3, 9.1, 8.8, 9.5],
            [4.4, 4.9, 5.2, 4.1],
        ]
    }

    /// Assert that two matrices have the same shape and agree element-wise to
    /// within `tol` (strict absolute difference). A `NaN` on either side makes
    /// the comparison fail, so this is only suitable for fully finite data.
    fn assert_close(got: &Array2<f64>, want: &Array2<f64>, tol: f64) {
        assert_eq!(got.dim(), want.dim());
        for (a, b) in got.iter().zip(want.iter()) {
            assert!((a - b).abs() < tol, "got {a} want {b}");
        }
    }

    // Reference matrices from R limma 3.68.3 (scratch/norm_ref.R).

    /// Quantile normalization of the complete fixture: every column ends up
    /// with the same set of values (the row means of the sorted columns).
    #[test]
    fn quantile_matches_r() {
        let got = normalize_quantiles(&fixture(), true);
        let want = array![
            [5.425, 4.625, 5.425, 5.425],
            [2.925, 2.925, 2.925, 2.925],
            [7.475, 7.475, 7.475, 7.475],
            [1.150, 1.150, 1.150, 1.150],
            [9.175, 9.175, 9.175, 9.175],
            [4.625, 5.425, 4.625, 4.625],
        ];
        assert_close(&got, &want, 1e-9);
    }

    /// Median scaling of the complete fixture against the stored reference.
    #[test]
    fn scale_matches_r() {
        let got = normalize_median_values(&fixture());
        let want = array![
            [
                5.379778894332,
                4.958922934739,
                5.450102801447,
                5.741287729347
            ],
            [
                2.426174795483,
                3.202637728685,
                2.461336749041,
                3.653546736857
            ],
            [
                8.122411271834,
                7.438384402108,
                7.120295595439,
                7.202706424090
            ],
            [
                1.160344467405,
                0.929798050264,
                1.230668374520,
                1.252644595494
            ],
            [
                9.810185042605,
                9.401291397109,
                7.735629782699,
                9.916769714327
            ],
            [
                4.641377869620,
                5.062233829212,
                4.571053962504,
                4.279869034604
            ],
        ];
        assert_close(&got, &want, 1e-9);
    }

    /// Median-absolute-value scaling on a signed matrix with missing entries:
    /// missing entries must stay `NaN`, observed entries must match the
    /// reference, and a single-column input must come back unchanged.
    #[test]
    fn scale_abs_matches_r() {
        let nan = f64::NAN;
        let x = array![
            [1.0, -2.0, 3.0],
            [-4.0, 5.0, -6.0],
            [7.0, -8.0, nan],
            [-10.0, 11.0, 12.0],
            [13.0, nan, -15.0],
            [16.0, -17.0, 18.0],
        ];
        let got = normalize_median_abs_values(&x);
        let want = array![
            [1.09937146549536, -2.33616436417763, 2.33616436417763],
            [-4.39748586198142, 5.84041091044408, -4.67232872835526],
            [7.69560025846748, -9.34465745671052, nan],
            [-10.9937146549535, 12.848904002977, 9.34465745671052],
            [14.2918290514396, nan, -11.6808218208881],
            [17.5899434479257, -19.8573970955099, 14.0169861850658],
        ];
        assert_eq!(got.dim(), want.dim());
        // NaN-aware comparison: `assert_close` cannot be used here because
        // `NaN - NaN` never satisfies the tolerance check.
        for (a, b) in got.iter().zip(want.iter()) {
            if b.is_nan() {
                assert!(a.is_nan(), "got {a} want NaN");
            } else {
                assert!((a - b).abs() < 1e-9, "got {a} want {b}");
            }
        }
        // single column returned unchanged
        let one = x.column(0).to_owned().insert_axis(Axis(1));
        let got1 = normalize_median_abs_values(&one);
        assert_eq!(got1, one);
    }

    /// Cyclic loess (`fast`, adaptive span, three iterations) on the complete
    /// fixture; compared with a looser tolerance than the other tests.
    #[test]
    fn cyclicloess_matches_r() {
        let got = normalize_cyclic_loess(&fixture(), 0.7, true, 3, CyclicMethod::Fast);
        let want = array![
            [
                5.296751774373,
                4.950393059958,
                5.410921516718,
                5.838353274939
            ],
            [
                2.578701290681,
                3.133887404904,
                2.497436394874,
                3.373476538746
            ],
            [
                7.652064568973,
                7.390792439486,
                7.799979876589,
                7.020333019702
            ],
            [
                1.278090240451,
                0.963550076309,
                1.356899073035,
                0.913640591855
            ],
            [
                9.065987375234,
                9.278688667701,
                8.955145308355,
                9.419784843868
            ],
            [
                4.764093788823,
                4.932065839925,
                4.658296269798,
                4.136224331580
            ],
        ];
        assert_close(&got, &want, 1e-6);
    }

    /// Quantile normalization with one missing entry in column 1.
    #[test]
    fn quantile_with_missing_keeps_na() {
        let mut x = fixture();
        x[[2, 1]] = f64::NAN;
        let got = normalize_quantiles(&x, true);
        // The missing entry stays NA; observed entries match R, exercising the
        // sub-grid stretch + approx remap for the short column.
        assert!(got[[2, 1]].is_nan());
        let want = array![
            [5.4100, 4.9325, 5.4100, 5.4100],
            [2.8150, 3.2250, 2.8150, 2.8150],
            [7.1100, f64::NAN, 7.1100, 7.1100],
            [1.1500, 1.1500, 1.1500, 1.1500],
            [9.1750, 9.1750, 9.1750, 9.1750],
            [4.4550, 6.6850, 4.4550, 4.4550],
        ];
        for g in 0..x.nrows() {
            for j in 0..x.ncols() {
                // The missing cell was already checked above; skip it so the
                // numeric comparison never sees a NaN.
                if g == 2 && j == 1 {
                    continue;
                }
                assert!((got[[g, j]] - want[[g, j]]).abs() < 1e-9, "[{g},{j}]");
            }
        }
    }

    /// A one-column matrix has nothing to normalize against and must be
    /// returned unchanged by both the quantile and the scale routines.
    #[test]
    fn single_column_is_identity() {
        let x = array![[1.0], [2.0], [3.0]];
        assert_eq!(normalize_quantiles(&x, true), x);
        assert_eq!(normalize_median_values(&x), x);
    }
}