Skip to main content

gsva/
geneset.rs

1//! Gene sets and a dependency-free GMT parser.
2
/// A single gene signature: a label plus the identifiers of the genes that
/// belong to it.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct GeneSet {
    /// Name of the gene set (taken from the first column of a GMT record).
    pub name: String,
    /// Identifiers of the member genes, kept in the order they were supplied.
    pub genes: Vec<String>,
}
11
12impl GeneSet {
13    /// Build a gene set from a name and member genes.
14    pub fn new(name: impl Into<String>, genes: Vec<String>) -> Self {
15        GeneSet {
16            name: name.into(),
17            genes,
18        }
19    }
20}
21
22/// An ordered collection of [`GeneSet`]s.
23#[derive(Clone, Debug, Default, PartialEq, Eq)]
24pub struct GeneSets {
25    /// The gene sets, in order.
26    pub sets: Vec<GeneSet>,
27}
28
29impl GeneSets {
30    /// Wrap a vector of gene sets.
31    pub fn new(sets: Vec<GeneSet>) -> Self {
32        GeneSets { sets }
33    }
34
35    /// Parse gene sets from GMT text.
36    ///
37    /// GMT is tab-separated: column 1 is the set name, column 2 a description
38    /// (ignored), and the remaining columns are gene identifiers. Blank lines
39    /// are skipped, and empty trailing gene cells (a common artifact of
40    /// trailing tabs) are dropped.
41    pub fn from_gmt(text: &str) -> Self {
42        let mut sets = Vec::new();
43        for line in text.lines() {
44            if line.trim().is_empty() {
45                continue;
46            }
47            let mut fields = line.split('\t');
48            let name = match fields.next() {
49                Some(n) if !n.is_empty() => n.to_string(),
50                _ => continue,
51            };
52            // Skip the description column.
53            let _description = fields.next();
54            let genes: Vec<String> = fields
55                .filter(|g| !g.trim().is_empty())
56                .map(|g| g.trim().to_string())
57                .collect();
58            sets.push(GeneSet::new(name, genes));
59        }
60        GeneSets { sets }
61    }
62
63    /// Number of gene sets.
64    pub fn len(&self) -> usize {
65        self.sets.len()
66    }
67
68    /// Whether there are no gene sets.
69    pub fn is_empty(&self) -> bool {
70        self.sets.is_empty()
71    }
72
73    /// Iterate over the gene sets.
74    pub fn iter(&self) -> std::slice::Iter<'_, GeneSet> {
75        self.sets.iter()
76    }
77}
78
#[cfg(test)]
mod tests {
    use super::*;

    /// Shorthand for building an expected [`GeneSet`] from string literals.
    fn expected(name: &str, genes: &[&str]) -> GeneSet {
        GeneSet::new(name, genes.iter().map(|g| g.to_string()).collect())
    }

    #[test]
    fn parses_gmt() {
        let parsed = GeneSets::from_gmt("SET_A\tdesc A\tG1\tG2\tG3\nSET_B\tdesc B\tG2\tG4\t\n");
        // The second record ends with a tab; the resulting empty cell must
        // not show up as a gene.
        let want = vec![
            expected("SET_A", &["G1", "G2", "G3"]),
            expected("SET_B", &["G2", "G4"]),
        ];
        assert_eq!(parsed.sets, want);
    }

    #[test]
    fn skips_blank_lines() {
        // One real record surrounded by empty lines.
        let input = "\nSET_A\tna\tG1\n\n";
        let parsed = GeneSets::from_gmt(input);
        assert_eq!(parsed.len(), 1);
    }
}