rsomics-gradient-trajectory 0.1.0

Gradient/trajectory ANOVA over ordination coordinates (QIIME-style microbiome trajectory analysis): per-group trajectory vectors plus closed-form one-way ANOVA F/p, selectable algorithm — a Rust reimplementation of scikit-bio's skbio.stats.gradient.
Documentation
use std::cmp::Ordering;

/// A natural-sort key matching `natsort.realsorted` (the `ns.REAL` algorithm
/// scikit-bio uses to order samples within a group). A string splits into
/// alternating text/number runs; numbers are signed floats (decimal or
/// scientific), text runs compare lexically. Tokenisation always begins with a
/// text run (possibly empty) so the parity of a position fixes its kind, and two
/// keys never compare a number against text.
///
/// Non-ASCII Unicode numerals (natsort's giant numeral class) are treated as
/// text; sample IDs and metadata gradients are ASCII in practice.
#[derive(Clone, Debug)]
pub struct NatKey(Vec<Token>);

#[derive(Clone, Debug)]
enum Token {
    Text(String),
    Num(f64),
}

impl NatKey {
    pub fn new(s: &str) -> NatKey {
        let bytes = s.as_bytes();
        let mut tokens = Vec::new();
        let mut text_start = 0usize;
        let mut i = 0usize;
        while i < bytes.len() {
            if let Some(end) = match_number(bytes, i) {
                tokens.push(Token::Text(s[text_start..i].to_string()));
                tokens.push(Token::Num(s[i..end].parse().unwrap()));
                i = end;
                text_start = end;
            } else {
                i += 1;
            }
        }
        tokens.push(Token::Text(s[text_start..].to_string()));
        NatKey(tokens)
    }
}

/// Match `[-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?` at `start`, returning the
/// end byte index of a token Rust's `f64::from_str` will then accept.
fn match_number(b: &[u8], start: usize) -> Option<usize> {
    let mut i = start;
    if i < b.len() && (b[i] == b'+' || b[i] == b'-') {
        i += 1;
    }
    let mant_start = i;
    if i < b.len() && b[i].is_ascii_digit() {
        while i < b.len() && b[i].is_ascii_digit() {
            i += 1;
        }
        if i < b.len() && b[i] == b'.' {
            i += 1;
            while i < b.len() && b[i].is_ascii_digit() {
                i += 1;
            }
        }
    } else if i < b.len() && b[i] == b'.' && i + 1 < b.len() && b[i + 1].is_ascii_digit() {
        i += 1;
        while i < b.len() && b[i].is_ascii_digit() {
            i += 1;
        }
    } else {
        return None;
    }
    debug_assert!(i > mant_start);

    let before_exp = i;
    if i < b.len() && (b[i] == b'e' || b[i] == b'E') {
        let mut j = i + 1;
        if j < b.len() && (b[j] == b'+' || b[j] == b'-') {
            j += 1;
        }
        if j < b.len() && b[j].is_ascii_digit() {
            while j < b.len() && b[j].is_ascii_digit() {
                j += 1;
            }
            i = j;
        } else {
            i = before_exp;
        }
    }
    Some(i)
}

impl PartialEq for NatKey {
    fn eq(&self, other: &Self) -> bool {
        self.cmp(other) == Ordering::Equal
    }
}
impl Eq for NatKey {}
impl PartialOrd for NatKey {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(self.cmp(other))
    }
}
impl Ord for NatKey {
    fn cmp(&self, other: &Self) -> Ordering {
        let mut a = self.0.iter();
        let mut b = other.0.iter();
        loop {
            match (a.next(), b.next()) {
                (None, None) => return Ordering::Equal,
                (None, Some(_)) => return Ordering::Less,
                (Some(_), None) => return Ordering::Greater,
                (Some(x), Some(y)) => {
                    let c = match (x, y) {
                        (Token::Text(p), Token::Text(q)) => p.cmp(q),
                        (Token::Num(p), Token::Num(q)) => p.total_cmp(q),
                        // Positions alternate text/number identically for both
                        // keys, so a mismatch cannot occur.
                        _ => unreachable!(),
                    };
                    if c != Ordering::Equal {
                        return c;
                    }
                }
            }
        }
    }
}

/// Stable natural sort of `items` keyed by `key`, mirroring `realsorted`.
pub fn realsorted_by<T, F: Fn(&T) -> String>(items: &mut [T], key: F) {
    items.sort_by(|x, y| NatKey::new(&key(x)).cmp(&NatKey::new(&key(y))));
}

#[cfg(test)]
mod tests {
    use super::*;

    fn sorted(v: &[&str]) -> Vec<String> {
        let mut items: Vec<String> = v.iter().map(|s| s.to_string()).collect();
        realsorted_by(&mut items, |s| s.clone());
        items
    }

    #[test]
    fn numeric_runs() {
        assert_eq!(
            sorted(&["S10", "S2", "S1", "S20"]),
            ["S1", "S2", "S10", "S20"]
        );
    }

    #[test]
    fn signed_floats() {
        assert_eq!(
            sorted(&["3.5", "10.1", "2.0", "-1.0"]),
            ["-1.0", "2.0", "3.5", "10.1"]
        );
    }

    #[test]
    fn number_leads_text() {
        // '1' -> ('', 1.0), 'a' -> ('a',); '' < 'a' so the number sorts first.
        assert_eq!(sorted(&["a", "1", "a1", "1a"]), ["1", "1a", "a", "a1"]);
    }

    #[test]
    fn scientific_notation() {
        assert_eq!(sorted(&["1e3", "100", "2000"]), ["100", "1e3", "2000"]);
    }

    #[test]
    fn multi_dot_splits_into_two_numbers() {
        // natsort tokenises 'v1.2.3' as ('v', 1.2, '', 0.3); the trailing empty
        // text token here is order-equivalent (it ties against another trailing
        // empty), so only the numeric tokens need to match.
        let k = NatKey::new("v1.2.3");
        match (&k.0[1], &k.0[3]) {
            (Token::Num(a), Token::Num(b)) => {
                assert_eq!(*a, 1.2);
                assert_eq!(*b, 0.3);
            }
            _ => panic!("expected two numeric tokens at positions 1 and 3"),
        }
    }

    #[test]
    fn matches_natsort_battery() {
        // Expected orderings captured from natsort.realsorted (ns.REAL) on the
        // 4090 oracle. Input is the sorted output shuffled into a fixed order.
        let cases: &[(&[&str], &[&str])] = &[
            (
                &["S100", "S2", "S00010", "S1", "S20", "S3"],
                &["S1", "S2", "S3", "S00010", "S20", "S100"],
            ),
            (
                &["10.1", "-1.0", "3.5", "0", "2.0", "-10.5"],
                &["-10.5", "-1.0", "0", "2.0", "3.5", "10.1"],
            ),
            // v1.2.3 and v1.2.30 share the key ('v',1.2,'',0.3); the stable sort
            // keeps their input order, exactly as natsort does.
            (
                &["v1.2.3x", "v1.2", "v1.10", "v1.2.30", "v1.2.3"],
                &["v1.10", "v1.2", "v1.2.30", "v1.2.3", "v1.2.3x"],
            ),
            (
                &["a1", "10", "1", "", "2b", "a", "1a"],
                &["", "1", "1a", "2b", "10", "a", "a1"],
            ),
            (
                &["t1.0", "t-1.0", "t0.0", "t-10.0", "t-2.0"],
                &["t-10.0", "t-2.0", "t-1.0", "t0.0", "t1.0"],
            ),
            (
                &["2000", "1e3", "99", "100", "1.5e2"],
                &["99", "100", "1.5e2", "1e3", "2000"],
            ),
        ];
        for (input, expected) in cases {
            assert_eq!(&sorted(input), expected, "input {input:?}");
        }
    }

    #[test]
    fn trailing_e_is_text() {
        // '1e' -> ('', 1.0, 'e'): bare e with no exponent digits stays text.
        let k = NatKey::new("1e");
        assert_eq!(k.0.len(), 3);
        matches!(k.0[2], Token::Text(ref t) if t == "e");
    }
}