Skip to main content

jellyfish_reader/
string_mers.rs

1use crate::mer::{MerDna, encode_base};
2
3/// Iterator that extracts k-mers from a DNA sequence string.
4///
5/// Slides a window of size `k` across the input sequence, yielding each
6/// valid k-mer. Invalid characters (non-ACGT) cause the window to reset.
7///
8/// # Examples
9///
10/// ```
11/// use jellyfish_reader::StringMers;
12///
13/// let mers: Vec<_> = StringMers::new("ACGTACGT", 4)
14///     .map(|m| m.to_string())
15///     .collect();
16/// assert_eq!(mers, vec!["ACGT", "CGTA", "GTAC", "TACG", "ACGT"]);
17/// ```
18pub struct StringMers<'a> {
19    seq: &'a [u8],
20    k: usize,
21    pos: usize,
22    valid_count: usize,
23    current: MerDna,
24    canonical: bool,
25}
26
27impl<'a> StringMers<'a> {
28    /// Create a new k-mer iterator over the given DNA sequence.
29    pub fn new(seq: &'a str, k: usize) -> Self {
30        Self {
31            seq: seq.as_bytes(),
32            k,
33            pos: 0,
34            valid_count: 0,
35            current: MerDna::new(k),
36            canonical: false,
37        }
38    }
39
40    /// Create a new canonical k-mer iterator.
41    ///
42    /// Each yielded k-mer is the lexicographically smaller of itself
43    /// and its reverse complement.
44    pub fn canonicals(seq: &'a str, k: usize) -> Self {
45        Self {
46            seq: seq.as_bytes(),
47            k,
48            pos: 0,
49            valid_count: 0,
50            current: MerDna::new(k),
51            canonical: true,
52        }
53    }
54}
55
56impl<'a> Iterator for StringMers<'a> {
57    type Item = MerDna;
58
59    fn next(&mut self) -> Option<MerDna> {
60        while self.pos < self.seq.len() {
61            let ch = self.seq[self.pos];
62            self.pos += 1;
63
64            match encode_base(ch) {
65                Some(_code) => {
66                    self.current.shift_left(ch);
67                    self.valid_count += 1;
68
69                    if self.valid_count >= self.k {
70                        return if self.canonical {
71                            Some(self.current.get_canonical())
72                        } else {
73                            Some(self.current.clone())
74                        };
75                    }
76                }
77                None => {
78                    // Invalid character, reset window
79                    self.valid_count = 0;
80                    self.current = MerDna::new(self.k);
81                }
82            }
83        }
84        None
85    }
86}
87
/// Convenience function to create a k-mer iterator.
///
/// Forwards `seq` and `k` unchanged to [`StringMers::new`]; the returned
/// iterator borrows `seq` for its whole lifetime.
pub fn string_mers(seq: &str, k: usize) -> StringMers<'_> {
    StringMers::new(seq, k)
}
92
/// Convenience function to create a canonical k-mer iterator.
///
/// Forwards `seq` and `k` unchanged to [`StringMers::canonicals`]; the
/// returned iterator borrows `seq` for its whole lifetime.
pub fn string_canonicals(seq: &str, k: usize) -> StringMers<'_> {
    StringMers::canonicals(seq, k)
}
97
#[cfg(test)]
mod tests {
    use super::*;

    /// Renders every k-mer of `seq` (as read) to a string.
    fn forward(seq: &str, k: usize) -> Vec<String> {
        StringMers::new(seq, k).map(|mer| mer.to_string()).collect()
    }

    /// Renders every canonical k-mer of `seq` to a string.
    fn canonical(seq: &str, k: usize) -> Vec<String> {
        StringMers::canonicals(seq, k)
            .map(|mer| mer.to_string())
            .collect()
    }

    #[test]
    fn test_basic_extraction() {
        assert_eq!(
            forward("ACGTACGT", 4),
            ["ACGT", "CGTA", "GTAC", "TACG", "ACGT"]
        );
    }

    #[test]
    fn test_exact_length() {
        assert_eq!(forward("ACGT", 4), ["ACGT"]);
    }

    #[test]
    fn test_shorter_than_k() {
        assert!(forward("ACG", 4).is_empty());
    }

    #[test]
    fn test_empty_string() {
        assert!(forward("", 4).is_empty());
    }

    #[test]
    fn test_skip_invalid_characters() {
        // The N splits the input into two runs of four valid bases; each run
        // produces exactly one 4-mer and nothing spans the N.
        assert_eq!(forward("ACGTNACGT", 4), ["ACGT", "ACGT"]);
    }

    #[test]
    fn test_invalid_at_start() {
        assert_eq!(forward("NACGT", 4), ["ACGT"]);
    }

    #[test]
    fn test_all_invalid() {
        assert!(forward("NNNN", 4).is_empty());
    }

    #[test]
    fn test_k_equals_1() {
        assert_eq!(forward("ACGT", 1), ["A", "C", "G", "T"]);
    }

    #[test]
    fn test_lowercase() {
        assert_eq!(forward("acgt", 4), ["ACGT"]);
    }

    #[test]
    fn test_canonical_mode() {
        // AAAA and TTTT are reverse complements of each other; the canonical
        // form is AAAA.
        assert_eq!(canonical("TTTT", 4), ["AAAA"]);
    }

    #[test]
    fn test_canonical_palindrome() {
        assert_eq!(canonical("ACGT", 4), ["ACGT"]);
    }

    #[test]
    fn test_canonical_various() {
        // Canonicalization must be idempotent: re-parsing a yielded k-mer and
        // canonicalizing it again has to give back the same string.
        let yielded = canonical("ACGTACGT", 4);

        for text in &yielded {
            let parsed: MerDna = text.parse().unwrap();
            assert_eq!(parsed.get_canonical().to_string(), *text);
        }
    }

    #[test]
    fn test_convenience_functions() {
        let via_fn: Vec<String> = string_mers("ACGT", 4).map(|mer| mer.to_string()).collect();
        assert_eq!(via_fn, forward("ACGT", 4));

        let canonical_via_fn: Vec<String> = string_canonicals("ACGT", 4)
            .map(|mer| mer.to_string())
            .collect();
        assert_eq!(canonical_via_fn, canonical("ACGT", 4));
    }

    #[test]
    fn test_multiple_invalid_regions() {
        // "ACGT" gives "ACG" and "CGT"; the NN run resets the window; "TACG"
        // then gives "TAC" and "ACG".
        assert_eq!(forward("ACGTNNTACG", 3), ["ACG", "CGT", "TAC", "ACG"]);
    }

    #[test]
    fn test_long_sequence() {
        let seq = "ACGT".repeat(100);
        let mers = forward(&seq, 25);
        assert_eq!(mers.len(), seq.len() - 24);
    }

    #[test]
    fn test_homopolymer_run() {
        assert_eq!(forward("AAAAA", 3), ["AAA", "AAA", "AAA"]);
    }
}