1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
//! Make a string vector unique (limma `combine.R` `makeUnique`).
//!
//! The remaining `combine.R` entries (`cbind`/`rbind`/`merge` methods for the
//! `RGList`/`MAList`/`EList` classes) operate on two-colour/array container
//! objects and are out of scope for the numeric port.
use std::collections::HashMap;
/// Port of limma's `makeUnique(x)`: disambiguate repeated strings by appending
/// an occurrence number.
///
/// A value that appears `n > 1` times has its occurrences numbered `1..=n` in
/// input order. Each number is appended to the value, left-padded with zeros
/// to the number of decimal digits in `n`. A value that appears only once is
/// copied through untouched. The output has the same length and order as `x`.
///
/// This follows limma's scheme and, like limma, does **not** guarantee that the
/// result is globally unique: a generated name may coincide with a value that
/// was already present (e.g. `["a1", "a", "a"]` yields `["a1", "a1", "a2"]`).
pub fn make_unique<S: AsRef<str>>(x: &[S]) -> Vec<String> {
    // Pass 1: how many times does each distinct value occur overall?
    let mut totals: HashMap<&str, usize> = HashMap::new();
    for item in x {
        *totals.entry(item.as_ref()).or_default() += 1;
    }

    // Pass 2: emit the names, tracking the running occurrence number of each
    // duplicated value.
    let mut ordinals: HashMap<&str, usize> = HashMap::new();
    let mut renamed = Vec::with_capacity(x.len());
    for item in x {
        let name = item.as_ref();
        let occurrences = totals[name];
        if occurrences > 1 {
            let ordinal = {
                let slot = ordinals.entry(name).or_default();
                *slot += 1;
                *slot
            };
            // Pad to the decimal digit count of the total, i.e.
            // 1 + floor(log10(total)).
            let digits = occurrences.to_string().len();
            renamed.push(format!("{name}{ordinal:0digits$}"));
        } else {
            renamed.push(name.to_owned());
        }
    }
    renamed
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Checks `make_unique` against fixed input/expected-output pairs.
    #[test]
    fn make_unique_matches_r() {
        // Each row is (input, expected output).
        let cases: [(&[&str], &[&str]); 3] = [
            // Duplicates receive a 1-based suffix in order of appearance.
            (
                &["a", "b", "a", "c", "a", "b"],
                &["a1", "b1", "a2", "c", "a3", "b2"],
            ),
            // A value occurring once stays as is, even next to duplicates.
            (&["x", "x", "y"], &["x1", "x2", "y"]),
            // No duplicates at all: the input comes back unchanged.
            (&["p", "q", "r"], &["p", "q", "r"]),
        ];
        for &(input, expected) in cases.iter() {
            assert_eq!(make_unique(input), expected);
        }

        // A total of 10 or more pads every suffix to two digits.
        let twelve = make_unique(&["g"; 12]);
        let padded = [
            "g01", "g02", "g03", "g04", "g05", "g06", "g07", "g08", "g09", "g10", "g11", "g12",
        ];
        assert_eq!(twelve, padded);
    }
}