limma-rust 0.1.0

Pure-Rust port of the Bioconductor limma differential-expression package
Documentation
//! Make a string vector unique (limma `combine.R` `makeUnique`).
//!
//! The remaining `combine.R` entries (`cbind`/`rbind`/`merge` methods for the
//! `RGList`/`MAList`/`EList` classes) operate on two-colour/array container
//! objects and are out of scope for the numeric port.

use std::collections::HashMap;

/// Disambiguate repeated strings by appending an occurrence number
/// (port of limma's `makeUnique`).
///
/// A value that appears exactly once in `x` is copied through untouched. A
/// value that appears `n >= 2` times has its occurrences numbered `1..=n` in
/// input order, and that number is appended to the value, left-padded with
/// zeros to the number of decimal digits in `n` (so twelve copies of `"g"`
/// yield `"g01"` through `"g12"`).
///
/// The output has the same length and order as `x`; an empty slice gives an
/// empty vector.
///
/// Follows limma's suffixing scheme, which means the result is **not**
/// guaranteed to be globally unique: a generated name can collide with a value
/// that was already present (e.g. `["a1", "a", "a"]` yields
/// `["a1", "a1", "a2"]`).
pub fn make_unique<S: AsRef<str>>(x: &[S]) -> Vec<String> {
    // Pass 1: tally how many times each distinct value appears.
    let mut totals: HashMap<&str, usize> = HashMap::new();
    for item in x {
        *totals.entry(item.as_ref()).or_default() += 1;
    }

    // Pass 2: walk the input in order, numbering only the repeated values.
    let mut ordinals: HashMap<&str, usize> = HashMap::new();
    let mut out = Vec::with_capacity(x.len());
    for item in x {
        let name = item.as_ref();
        let n = totals[name];
        if n < 2 {
            // Singletons keep their original spelling.
            out.push(name.to_string());
            continue;
        }

        // Running 1-based occurrence index for this particular value.
        let slot = ordinals.entry(name).or_default();
        *slot += 1;
        let ordinal = *slot;

        // Pad to the digit count of the total so all suffixes of one value
        // share the same width (equivalent to 1 + floor(log10(n))).
        let digits = n.to_string().len();
        out.push(format!("{name}{ordinal:0digits$}"));
    }
    out
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Checks `make_unique` against hand-written expected outputs covering
    /// mixed duplicates, zero padding, singletons next to duplicates, and
    /// fully distinct input.
    #[test]
    fn make_unique_matches_r() {
        // Repeated values receive a 1-based suffix assigned in input order.
        let mixed = ["a", "b", "a", "c", "a", "b"];
        let numbered = make_unique(&mixed);
        assert_eq!(numbered, ["a1", "b1", "a2", "c", "a3", "b2"]);

        // Twelve copies need two digits, so every suffix is zero-padded to
        // width two: g01, g02, ..., g12.
        let twelve = ["g"; 12];
        let padded: Vec<String> = (1..=12).map(|i| format!("g{i:02}")).collect();
        assert_eq!(make_unique(&twelve), padded);

        // A value seen once is left alone even when its neighbours repeat.
        let with_singleton = ["x", "x", "y"];
        assert_eq!(make_unique(&with_singleton), ["x1", "x2", "y"]);

        // Input with no repeats comes back unchanged.
        let distinct = ["p", "q", "r"];
        assert_eq!(make_unique(&distinct), distinct);
    }
}