1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
//! Averaging over replicate probes (`avereps`) or replicate arrays
//! (`avearrays`).
//!
//! Pure-Rust port of limma's `avereps.default` and `avearrays.default` for
//! numeric matrices: an NA-aware mean over the rows ([`avereps`]) or columns
//! ([`avearrays`]) sharing an ID, with one group per unique ID in order of
//! first appearance. NaN entries are treated as missing (dropped from both the
//! sum and the count), matching R's `rowsum(..., na.rm=TRUE)` divided by the
//! per-group count of non-missing values. The weighted `avearrays` branch
//! (which limma routes through `lmFit`) is not ported; only the unweighted
//! default is provided.
use ndarray::Array2;
use std::collections::HashMap;
use std::hash::Hash;
/// Map each element of `id` to a group index, preserving first-appearance
/// order. Returns the per-element group indices and the unique IDs.
fn group_ids<T: Eq + Hash + Clone>(id: &[T]) -> (Vec<usize>, Vec<T>) {
let mut order: Vec<T> = Vec::new();
let mut index: HashMap<&T, usize> = HashMap::new();
let mut group_of = Vec::with_capacity(id.len());
for t in id {
let g = *index.entry(t).or_insert_with(|| {
order.push(t.clone());
order.len() - 1
});
group_of.push(g);
}
(group_of, order)
}
/// `avereps(x, ID)`: average the rows of `x` that share an ID, returning the
/// collapsed matrix (one row per unique ID, in first-appearance order) and the
/// unique IDs. `id` must have one entry per row of `x`. NaN entries are treated
/// as missing; a group/column that is entirely missing yields NaN.
pub fn avereps<T: Eq + Hash + Clone>(x: &Array2<f64>, id: &[T]) -> (Array2<f64>, Vec<T>) {
let (nr, nc) = x.dim();
assert_eq!(id.len(), nr, "ID length must equal the number of rows");
let (group_of, order) = group_ids(id);
let ng = order.len();
let mut sum = Array2::<f64>::zeros((ng, nc));
let mut cnt = Array2::<f64>::zeros((ng, nc));
for r in 0..nr {
let g = group_of[r];
for c in 0..nc {
let v = x[[r, c]];
if !v.is_nan() {
sum[[g, c]] += v;
cnt[[g, c]] += 1.0;
}
}
}
sum.zip_mut_with(&cnt, |s, &n| *s /= n);
(sum, order)
}
/// `avearrays(x, ID)`: average the columns of `x` that share an ID, returning
/// the collapsed matrix (one column per unique ID, in first-appearance order)
/// and the unique IDs. `id` must have one entry per column of `x`. NaN entries
/// are treated as missing; a group/row that is entirely missing yields NaN.
pub fn avearrays<T: Eq + Hash + Clone>(x: &Array2<f64>, id: &[T]) -> (Array2<f64>, Vec<T>) {
let (nr, nc) = x.dim();
assert_eq!(id.len(), nc, "ID length must equal the number of columns");
let (group_of, order) = group_ids(id);
let ng = order.len();
let mut sum = Array2::<f64>::zeros((nr, ng));
let mut cnt = Array2::<f64>::zeros((nr, ng));
for c in 0..nc {
let g = group_of[c];
for r in 0..nr {
let v = x[[r, c]];
if !v.is_nan() {
sum[[r, g]] += v;
cnt[[r, g]] += 1.0;
}
}
}
sum.zip_mut_with(&cnt, |s, &n| *s /= n);
(sum, order)
}
#[cfg(test)]
mod tests {
use super::*;
use ndarray::array;
const NA: f64 = f64::NAN;
fn eq(a: &Array2<f64>, b: &Array2<f64>) -> bool {
a.dim() == b.dim() && a.iter().zip(b.iter()).all(|(&x, &y)| (x - y).abs() < 1e-12)
}
#[test]
fn avereps_matches_r() {
// 6 rows, IDs a,b,a,c,b,a with one NA in the second column of an "a" row.
let x = array![
[1.0, 2.0, 3.0],
[4.0, 5.0, 6.0],
[7.0, NA, 9.0],
[10.0, 11.0, 12.0],
[13.0, 14.0, 15.0],
[16.0, 17.0, 18.0],
];
let id = ["a", "b", "a", "c", "b", "a"];
let (y, ids) = avereps(&x, &id);
// Reference: avereps(x, ID) in limma 3.68.3.
let want = array![
[8.0, 9.5, 10.0], // a: rows 0,2,5 (col1 averages 2 & 17 only)
[8.5, 9.5, 10.5], // b: rows 1,4
[10.0, 11.0, 12.0], // c: row 3
];
assert_eq!(ids, vec!["a", "b", "c"]);
assert!(eq(&y, &want), "got {y:?}");
}
#[test]
fn avearrays_matches_r() {
// 3 rows, 6 columns; column IDs a,b,a,c,b,a (transpose of the avereps case).
let x = array![
[1.0, 4.0, 7.0, 10.0, 13.0, 16.0],
[2.0, 5.0, NA, 11.0, 14.0, 17.0],
[3.0, 6.0, 9.0, 12.0, 15.0, 18.0],
];
let id = ["a", "b", "a", "c", "b", "a"];
let (y, ids) = avearrays(&x, &id);
// Reference: avearrays(x, ID) in limma 3.68.3.
let want = array![[8.0, 8.5, 10.0], [9.5, 9.5, 11.0], [10.0, 10.5, 12.0],];
assert_eq!(ids, vec!["a", "b", "c"]);
assert!(eq(&y, &want), "got {y:?}");
}
#[test]
fn avereps_no_duplicates_is_identity() {
let x = array![[1.0, 2.0], [3.0, 4.0]];
let id = ["p", "q"];
let (y, ids) = avereps(&x, &id);
assert!(eq(&y, &x));
assert_eq!(ids, vec!["p", "q"]);
}
}