Skip to main content

limma/
norm.rs

1//! Between-array normalization of single-channel matrices. Port of limma's
2//! `normalizeBetweenArrays` matrix path and its constituents `normalizeQuantiles`
3//! (quantile), `normalizeMedianValues` (scale) and `normalizeCyclicLoess`
4//! (cyclicloess). Two-colour (`RGList`/`MAList`) and `vsn` methods are out of
5//! scope for the pure-Rust statistical port.
6
7use anyhow::{bail, Result};
8use ndarray::{Array1, Array2, Axis};
9
10use crate::lowess::{approx_rule2, loess_fit_unweighted};
11
12/// Between-array normalization method for a single-channel matrix.
13#[derive(Clone, Copy, Debug, PartialEq, Eq)]
14pub enum NormalizeMethod {
15    None,
16    Scale,
17    Quantile,
18    CyclicLoess,
19}
20
21impl NormalizeMethod {
22    /// Parse a method name (`none` | `scale` | `quantile` | `cyclicloess`).
23    pub fn parse(s: &str) -> Result<Self> {
24        Ok(match s {
25            "none" => Self::None,
26            "scale" => Self::Scale,
27            "quantile" => Self::Quantile,
28            "cyclicloess" => Self::CyclicLoess,
29            other => bail!(
30                "unknown normalize method '{other}' (expected none|scale|quantile|cyclicloess)"
31            ),
32        })
33    }
34}
35
36/// `normalizeBetweenArrays(object, method)` for a single-channel matrix
37/// (`n_genes x n_samples`). `CyclicLoess` uses the default `fast` method with
38/// adaptive span and three iterations.
39pub fn normalize_between_arrays(x: &Array2<f64>, method: NormalizeMethod) -> Array2<f64> {
40    match method {
41        NormalizeMethod::None => x.clone(),
42        NormalizeMethod::Scale => normalize_median_values(x),
43        NormalizeMethod::Quantile => normalize_quantiles(x, true),
44        NormalizeMethod::CyclicLoess => normalize_cyclic_loess(x, 0.7, true, 3, CyclicMethod::Fast),
45    }
46}
47
48/// Median of the finite entries of a slice, matching R's `median(na.rm=TRUE)`
49/// (average of the two central order statistics for an even count). Returns
50/// `NaN` when there are no finite entries.
51pub(crate) fn median_finite(v: &[f64]) -> f64 {
52    let mut s: Vec<f64> = v.iter().copied().filter(|x| x.is_finite()).collect();
53    let n = s.len();
54    if n == 0 {
55        return f64::NAN;
56    }
57    s.sort_by(|a, b| a.partial_cmp(b).unwrap());
58    if n % 2 == 1 {
59        s[n / 2]
60    } else {
61        0.5 * (s[n / 2 - 1] + s[n / 2])
62    }
63}
64
65/// `normalizeMedianValues`: scale each column to a common median. Columns are
66/// divided by `exp(log(median_j) - mean_j log(median))`.
67pub fn normalize_median_values(x: &Array2<f64>) -> Array2<f64> {
68    let n_cols = x.ncols();
69    if n_cols <= 1 {
70        return x.clone();
71    }
72    let log_med: Vec<f64> = (0..n_cols)
73        .map(|j| median_finite(&x.column(j).to_vec()).ln())
74        .collect();
75    let mean = log_med.iter().sum::<f64>() / n_cols as f64;
76    let scale: Vec<f64> = log_med.iter().map(|&lm| (lm - mean).exp()).collect();
77    let mut out = x.clone();
78    for (j, &s) in scale.iter().enumerate() {
79        out.column_mut(j).mapv_inplace(|v| v / s);
80    }
81    out
82}
83
84/// `normalizeMedianAbsValues`: scale each column to a common median absolute
85/// value. Identical to [`normalize_median_values`] except the per-column median
86/// is taken over `abs(x)` (matching limma's `apply(abs(x), 2, median)`).
87pub fn normalize_median_abs_values(x: &Array2<f64>) -> Array2<f64> {
88    let n_cols = x.ncols();
89    if n_cols <= 1 {
90        return x.clone();
91    }
92    let log_med: Vec<f64> = (0..n_cols)
93        .map(|j| {
94            let absvals: Vec<f64> = x.column(j).iter().map(|v| v.abs()).collect();
95            median_finite(&absvals).ln()
96        })
97        .collect();
98    let mean = log_med.iter().sum::<f64>() / n_cols as f64;
99    let scale: Vec<f64> = log_med.iter().map(|&lm| (lm - mean).exp()).collect();
100    let mut out = x.clone();
101    for (j, &s) in scale.iter().enumerate() {
102        out.column_mut(j).mapv_inplace(|v| v / s);
103    }
104    out
105}
106
107/// Average (ties-averaged, 1-based) ranks of the finite entries of `col`;
108/// non-finite entries receive `NaN`. Matches R's `rank(ties.method="average")`
109/// over the finite values.
110fn rank_average_finite(col: &[f64]) -> Vec<f64> {
111    let n = col.len();
112    let mut idx: Vec<usize> = (0..n).filter(|&k| col[k].is_finite()).collect();
113    idx.sort_by(|&a, &b| col[a].partial_cmp(&col[b]).unwrap());
114    let mut ranks = vec![f64::NAN; n];
115    let mut i = 0usize;
116    while i < idx.len() {
117        let mut j = i;
118        while j < idx.len() && col[idx[j]] == col[idx[i]] {
119            j += 1;
120        }
121        let avg = (i + 1 + j) as f64 / 2.0; // mean of 1-based ranks i+1..=j
122        for k in i..j {
123            ranks[idx[k]] = avg;
124        }
125        i = j;
126    }
127    ranks
128}
129
130/// `normalizeQuantiles(A, ties)`: give every column the same quantiles, the
131/// average of the sorted columns. Missing values are allowed (a column's
132/// observed values are stretched to the full grid before averaging, and only
133/// its observed entries are re-mapped).
134pub fn normalize_quantiles(a: &Array2<f64>, ties: bool) -> Array2<f64> {
135    let nr = a.nrows();
136    let nc = a.ncols();
137    if nc <= 1 || nr == 0 {
138        return a.clone();
139    }
140    // Target grid i = (0..nr-1)/(nr-1).
141    let grid: Vec<f64> = (0..nr).map(|k| k as f64 / (nr - 1) as f64).collect();
142
143    // S: each column's sorted values stretched onto the grid; O: order (the
144    // original row index of each ascending value) for the non-ties path.
145    let mut s = Array2::<f64>::zeros((nr, nc));
146    let mut order = vec![vec![0usize; 0]; nc];
147    let mut nobs = vec![nr; nc];
148    for j in 0..nc {
149        let col = a.column(j);
150        let mut idx: Vec<usize> = (0..nr).filter(|&k| col[k].is_finite()).collect();
151        idx.sort_by(|&p, &q| col[p].partial_cmp(&col[q]).unwrap());
152        let sorted: Vec<f64> = idx.iter().map(|&k| col[k]).collect();
153        let nobsj = sorted.len();
154        nobs[j] = nobsj;
155        order[j] = idx;
156        if nobsj == nr {
157            for k in 0..nr {
158                s[[k, j]] = sorted[k];
159            }
160        } else {
161            // Stretch the nobsj sorted values onto the full grid.
162            let sub: Vec<f64> = (0..nobsj)
163                .map(|k| k as f64 / (nobsj - 1).max(1) as f64)
164                .collect();
165            for (k, &gi) in grid.iter().enumerate() {
166                s[[k, j]] = approx_rule2(&sub, &sorted, gi);
167            }
168        }
169    }
170    let m: Array1<f64> = s.mean_axis(Axis(1)).unwrap();
171    let m_vec = m.to_vec();
172
173    let mut out = a.clone();
174    for j in 0..nc {
175        let col = a.column(j);
176        if ties {
177            let r = rank_average_finite(&col.to_vec());
178            for k in 0..nr {
179                if col[k].is_finite() {
180                    let pos = (r[k] - 1.0) / (nobs[j] - 1) as f64;
181                    out[[k, j]] = approx_rule2(&grid, &m_vec, pos);
182                }
183            }
184        } else if nobs[j] == nr {
185            for (rank0, &k) in order[j].iter().enumerate() {
186                out[[k, j]] = m_vec[rank0];
187            }
188        } else {
189            for (rank0, &k) in order[j].iter().enumerate() {
190                let pos = rank0 as f64 / (nobs[j] - 1) as f64;
191                out[[k, j]] = approx_rule2(&grid, &m_vec, pos);
192            }
193        }
194    }
195    out
196}
197
198/// Cyclic-loess variant.
199#[derive(Clone, Copy, Debug, PartialEq, Eq)]
200pub enum CyclicMethod {
201    Fast,
202    Pairs,
203    Affy,
204}
205
206/// `normalizeCyclicLoess`: iteratively detrend each column's deviation from the
207/// row means (`fast`) or pairwise MA differences (`pairs`/`affy`) with a LOWESS
208/// fit (`loessFit`, weights = NULL). With `adaptive_span` the span is chosen by
209/// `chooseLowessSpan(nrow, 50, 0.3, 1/3)`.
210pub fn normalize_cyclic_loess(
211    x: &Array2<f64>,
212    span: f64,
213    adaptive_span: bool,
214    iterations: usize,
215    method: CyclicMethod,
216) -> Array2<f64> {
217    let nr = x.nrows();
218    let n = x.ncols();
219    let span = if adaptive_span {
220        crate::voom::choose_lowess_span(nr, 50.0, 0.3, 1.0 / 3.0)
221    } else {
222        span
223    };
224    let mut x = x.clone();
225    // MA scratch reused across every pair/column/iteration (refilled in place):
226    // the Pairs/Affy paths otherwise allocate two length-nr Vecs per array pair
227    // per iteration, i.e. O(n^2) allocations.
228    let mut m = vec![0.0f64; nr];
229    let mut a = vec![0.0f64; nr];
230    match method {
231        CyclicMethod::Fast => {
232            for _ in 0..iterations {
233                for (g, ag) in a.iter_mut().enumerate() {
234                    *ag = row_nanmean(&x, g);
235                }
236                for i in 0..n {
237                    for g in 0..nr {
238                        m[g] = x[[g, i]] - a[g];
239                    }
240                    let f = loess_fit_unweighted(&m, &a, span, 4).0;
241                    for g in 0..nr {
242                        x[[g, i]] -= f[g];
243                    }
244                }
245            }
246        }
247        CyclicMethod::Pairs => {
248            for _ in 0..iterations {
249                for i in 0..n - 1 {
250                    for j in i + 1..n {
251                        for g in 0..nr {
252                            m[g] = x[[g, j]] - x[[g, i]];
253                            a[g] = 0.5 * (x[[g, j]] + x[[g, i]]);
254                        }
255                        let f = loess_fit_unweighted(&m, &a, span, 4).0;
256                        for g in 0..nr {
257                            x[[g, i]] += f[g] / 2.0;
258                            x[[g, j]] -= f[g] / 2.0;
259                        }
260                    }
261                }
262            }
263        }
264        CyclicMethod::Affy => {
265            for _ in 0..iterations {
266                let mut adjustment = Array2::<f64>::zeros((nr, n));
267                for i in 0..n - 1 {
268                    for j in i + 1..n {
269                        for g in 0..nr {
270                            m[g] = x[[g, j]] - x[[g, i]];
271                            a[g] = 0.5 * (x[[g, j]] + x[[g, i]]);
272                        }
273                        let f = loess_fit_unweighted(&m, &a, span, 4).0;
274                        for g in 0..nr {
275                            adjustment[[g, j]] += f[g];
276                            adjustment[[g, i]] -= f[g];
277                        }
278                    }
279                }
280                for g in 0..nr {
281                    for c in 0..n {
282                        x[[g, c]] -= adjustment[[g, c]] / n as f64;
283                    }
284                }
285            }
286        }
287    }
288    x
289}
290
291fn row_nanmean(x: &Array2<f64>, g: usize) -> f64 {
292    let mut sum = 0.0;
293    let mut cnt = 0usize;
294    for &v in x.row(g) {
295        if v.is_finite() {
296            sum += v;
297            cnt += 1;
298        }
299    }
300    if cnt > 0 {
301        sum / cnt as f64
302    } else {
303        f64::NAN
304    }
305}
306
307#[cfg(test)]
308mod tests {
309    use super::*;
310    use ndarray::array;
311
312    fn fixture() -> Array2<f64> {
313        array![
314            [5.1, 4.8, 6.2, 5.5],
315            [2.3, 3.1, 2.8, 3.5],
316            [7.7, 7.2, 8.1, 6.9],
317            [1.1, 0.9, 1.4, 1.2],
318            [9.3, 9.1, 8.8, 9.5],
319            [4.4, 4.9, 5.2, 4.1],
320        ]
321    }
322
323    fn assert_close(got: &Array2<f64>, want: &Array2<f64>, tol: f64) {
324        assert_eq!(got.dim(), want.dim());
325        for (a, b) in got.iter().zip(want.iter()) {
326            assert!((a - b).abs() < tol, "got {a} want {b}");
327        }
328    }
329
330    // Reference matrices from R limma 3.68.3 (scratch/norm_ref.R).
331
332    #[test]
333    fn quantile_matches_r() {
334        let got = normalize_quantiles(&fixture(), true);
335        let want = array![
336            [5.425, 4.625, 5.425, 5.425],
337            [2.925, 2.925, 2.925, 2.925],
338            [7.475, 7.475, 7.475, 7.475],
339            [1.150, 1.150, 1.150, 1.150],
340            [9.175, 9.175, 9.175, 9.175],
341            [4.625, 5.425, 4.625, 4.625],
342        ];
343        assert_close(&got, &want, 1e-9);
344    }
345
346    #[test]
347    fn scale_matches_r() {
348        let got = normalize_median_values(&fixture());
349        let want = array![
350            [
351                5.379778894332,
352                4.958922934739,
353                5.450102801447,
354                5.741287729347
355            ],
356            [
357                2.426174795483,
358                3.202637728685,
359                2.461336749041,
360                3.653546736857
361            ],
362            [
363                8.122411271834,
364                7.438384402108,
365                7.120295595439,
366                7.202706424090
367            ],
368            [
369                1.160344467405,
370                0.929798050264,
371                1.230668374520,
372                1.252644595494
373            ],
374            [
375                9.810185042605,
376                9.401291397109,
377                7.735629782699,
378                9.916769714327
379            ],
380            [
381                4.641377869620,
382                5.062233829212,
383                4.571053962504,
384                4.279869034604
385            ],
386        ];
387        assert_close(&got, &want, 1e-9);
388    }
389
390    #[test]
391    fn scale_abs_matches_r() {
392        let nan = f64::NAN;
393        let x = array![
394            [1.0, -2.0, 3.0],
395            [-4.0, 5.0, -6.0],
396            [7.0, -8.0, nan],
397            [-10.0, 11.0, 12.0],
398            [13.0, nan, -15.0],
399            [16.0, -17.0, 18.0],
400        ];
401        let got = normalize_median_abs_values(&x);
402        let want = array![
403            [1.09937146549536, -2.33616436417763, 2.33616436417763],
404            [-4.39748586198142, 5.84041091044408, -4.67232872835526],
405            [7.69560025846748, -9.34465745671052, nan],
406            [-10.9937146549535, 12.848904002977, 9.34465745671052],
407            [14.2918290514396, nan, -11.6808218208881],
408            [17.5899434479257, -19.8573970955099, 14.0169861850658],
409        ];
410        assert_eq!(got.dim(), want.dim());
411        for (a, b) in got.iter().zip(want.iter()) {
412            if b.is_nan() {
413                assert!(a.is_nan(), "got {a} want NaN");
414            } else {
415                assert!((a - b).abs() < 1e-9, "got {a} want {b}");
416            }
417        }
418        // single column returned unchanged
419        let one = x.column(0).to_owned().insert_axis(Axis(1));
420        let got1 = normalize_median_abs_values(&one);
421        assert_eq!(got1, one);
422    }
423
424    #[test]
425    fn cyclicloess_matches_r() {
426        let got = normalize_cyclic_loess(&fixture(), 0.7, true, 3, CyclicMethod::Fast);
427        let want = array![
428            [
429                5.296751774373,
430                4.950393059958,
431                5.410921516718,
432                5.838353274939
433            ],
434            [
435                2.578701290681,
436                3.133887404904,
437                2.497436394874,
438                3.373476538746
439            ],
440            [
441                7.652064568973,
442                7.390792439486,
443                7.799979876589,
444                7.020333019702
445            ],
446            [
447                1.278090240451,
448                0.963550076309,
449                1.356899073035,
450                0.913640591855
451            ],
452            [
453                9.065987375234,
454                9.278688667701,
455                8.955145308355,
456                9.419784843868
457            ],
458            [
459                4.764093788823,
460                4.932065839925,
461                4.658296269798,
462                4.136224331580
463            ],
464        ];
465        assert_close(&got, &want, 1e-6);
466    }
467
468    #[test]
469    fn quantile_with_missing_keeps_na() {
470        let mut x = fixture();
471        x[[2, 1]] = f64::NAN;
472        let got = normalize_quantiles(&x, true);
473        // The missing entry stays NA; observed entries match R, exercising the
474        // sub-grid stretch + approx remap for the short column.
475        assert!(got[[2, 1]].is_nan());
476        let want = array![
477            [5.4100, 4.9325, 5.4100, 5.4100],
478            [2.8150, 3.2250, 2.8150, 2.8150],
479            [7.1100, f64::NAN, 7.1100, 7.1100],
480            [1.1500, 1.1500, 1.1500, 1.1500],
481            [9.1750, 9.1750, 9.1750, 9.1750],
482            [4.4550, 6.6850, 4.4550, 4.4550],
483        ];
484        for g in 0..x.nrows() {
485            for j in 0..x.ncols() {
486                if g == 2 && j == 1 {
487                    continue;
488                }
489                assert!((got[[g, j]] - want[[g, j]]).abs() < 1e-9, "[{g},{j}]");
490            }
491        }
492    }
493
494    #[test]
495    fn single_column_is_identity() {
496        let x = array![[1.0], [2.0], [3.0]];
497        assert_eq!(normalize_quantiles(&x, true), x);
498        assert_eq!(normalize_median_values(&x), x);
499    }
500}