// gsva-rust 0.1.0

// Pure-Rust port of the GSVA family of gene-set enrichment methods (GSVA, ssGSEA, z-score, PLAGE), validated for numeric parity against the Bioconductor GSVA package.
// Documentation
//! Gene sets and a dependency-free GMT parser.

/// One gene signature: a name paired with the identifiers of its member genes.
#[derive(Debug, Clone, Eq, PartialEq)]
pub struct GeneSet {
    /// Name of the signature (the first column of a GMT record).
    pub name: String,
    /// Identifiers of the member genes, kept in the order they appear in the file.
    pub genes: Vec<String>,
}

impl GeneSet {
    /// Build a gene set from a name and member genes.
    pub fn new(name: impl Into<String>, genes: Vec<String>) -> Self {
        GeneSet {
            name: name.into(),
            genes,
        }
    }
}

/// A sequence of [`GeneSet`]s whose order is preserved.
#[derive(Debug, Default, Clone, Eq, PartialEq)]
pub struct GeneSets {
    /// The contained gene sets, in their stored order.
    pub sets: Vec<GeneSet>,
}

impl GeneSets {
    /// Wrap a vector of gene sets, keeping them in the given order.
    pub fn new(sets: Vec<GeneSet>) -> Self {
        GeneSets { sets }
    }

    /// Parse gene sets from GMT text.
    ///
    /// GMT is tab-separated, one gene set per line: column 1 is the set name,
    /// column 2 a description (ignored), and the remaining columns are gene
    /// identifiers. Parsing never fails; malformed records are skipped.
    ///
    /// * Surrounding whitespace is trimmed from the set name and from every
    ///   gene identifier.
    /// * Lines that are blank, or whose name column is empty after trimming,
    ///   are skipped.
    /// * Empty gene cells (a common artifact of trailing tabs) are dropped.
    /// * A line with a name but no genes yields a set with an empty gene list.
    ///
    /// Sets are returned in file order.
    pub fn from_gmt(text: &str) -> Self {
        let sets = text
            .lines()
            .filter_map(|line| {
                // Trim every cell up front so the name gets the same
                // whitespace handling as the gene identifiers.
                let mut fields = line.split('\t').map(str::trim);
                // A blank line splits into a single empty cell, so this one
                // check also covers blank and whitespace-only lines.
                let name = fields.next().filter(|n| !n.is_empty())?;
                let genes = fields
                    .skip(1) // the description column is not used
                    .filter(|g| !g.is_empty())
                    .map(String::from)
                    .collect();
                Some(GeneSet::new(name, genes))
            })
            .collect();
        GeneSets { sets }
    }

    /// Number of gene sets.
    pub fn len(&self) -> usize {
        self.sets.len()
    }

    /// Whether there are no gene sets.
    pub fn is_empty(&self) -> bool {
        self.sets.is_empty()
    }

    /// Iterate over the gene sets by reference, in stored order.
    pub fn iter(&self) -> std::slice::Iter<'_, GeneSet> {
        self.sets.iter()
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Two well-formed records parse into two sets, and the empty cell left by
    /// a trailing tab does not become a gene.
    #[test]
    fn parses_gmt() {
        let parsed = GeneSets::from_gmt("SET_A\tdesc A\tG1\tG2\tG3\nSET_B\tdesc B\tG2\tG4\t\n");
        assert_eq!(parsed.len(), 2);

        let expected_a = GeneSet::new("SET_A", vec!["G1".into(), "G2".into(), "G3".into()]);
        assert_eq!(parsed.sets[0], expected_a);

        // SET_B's line ends with a tab; the resulting empty cell is dropped.
        let expected_b = GeneSet::new("SET_B", vec!["G2".into(), "G4".into()]);
        assert_eq!(parsed.sets[1], expected_b);
    }

    /// Blank lines before and after a record are ignored.
    #[test]
    fn skips_blank_lines() {
        let parsed = GeneSets::from_gmt("\nSET_A\tna\tG1\n\n");
        assert_eq!(parsed.len(), 1);
    }
}