// jellyfish_reader/string_mers.rs
use crate::mer::{MerDna, encode_base};

3pub struct StringMers<'a> {
19 seq: &'a [u8],
20 k: usize,
21 pos: usize,
22 valid_count: usize,
23 current: MerDna,
24 canonical: bool,
25}
26
27impl<'a> StringMers<'a> {
28 pub fn new(seq: &'a str, k: usize) -> Self {
30 Self {
31 seq: seq.as_bytes(),
32 k,
33 pos: 0,
34 valid_count: 0,
35 current: MerDna::new(k),
36 canonical: false,
37 }
38 }
39
40 pub fn canonicals(seq: &'a str, k: usize) -> Self {
45 Self {
46 seq: seq.as_bytes(),
47 k,
48 pos: 0,
49 valid_count: 0,
50 current: MerDna::new(k),
51 canonical: true,
52 }
53 }
54}
55
56impl<'a> Iterator for StringMers<'a> {
57 type Item = MerDna;
58
59 fn next(&mut self) -> Option<MerDna> {
60 while self.pos < self.seq.len() {
61 let ch = self.seq[self.pos];
62 self.pos += 1;
63
64 match encode_base(ch) {
65 Some(_code) => {
66 self.current.shift_left(ch);
67 self.valid_count += 1;
68
69 if self.valid_count >= self.k {
70 return if self.canonical {
71 Some(self.current.get_canonical())
72 } else {
73 Some(self.current.clone())
74 };
75 }
76 }
77 None => {
78 self.valid_count = 0;
80 self.current = MerDna::new(self.k);
81 }
82 }
83 }
84 None
85 }
86}
87
88pub fn string_mers(seq: &str, k: usize) -> StringMers<'_> {
90 StringMers::new(seq, k)
91}
92
93pub fn string_canonicals(seq: &str, k: usize) -> StringMers<'_> {
95 StringMers::canonicals(seq, k)
96}
97
#[cfg(test)]
mod tests {
    use super::*;

    /// Drains an iterator of mers into their string representations.
    fn render<I>(mers: I) -> Vec<String>
    where
        I: Iterator<Item = MerDna>,
    {
        mers.map(|mer| mer.to_string()).collect()
    }

    #[test]
    fn test_basic_extraction() {
        let found = render(StringMers::new("ACGTACGT", 4));
        assert_eq!(found, ["ACGT", "CGTA", "GTAC", "TACG", "ACGT"]);
    }

    #[test]
    fn test_exact_length() {
        let found = render(StringMers::new("ACGT", 4));
        assert_eq!(found, ["ACGT"]);
    }

    #[test]
    fn test_shorter_than_k() {
        let found = render(StringMers::new("ACG", 4));
        assert!(found.is_empty());
    }

    #[test]
    fn test_empty_string() {
        let found = render(StringMers::new("", 4));
        assert!(found.is_empty());
    }

    #[test]
    fn test_skip_invalid_characters() {
        // The 'N' splits the input; no mer may straddle it.
        let found = render(StringMers::new("ACGTNACGT", 4));
        assert_eq!(found, ["ACGT", "ACGT"]);
    }

    #[test]
    fn test_invalid_at_start() {
        let found = render(StringMers::new("NACGT", 4));
        assert_eq!(found, ["ACGT"]);
    }

    #[test]
    fn test_all_invalid() {
        let found = render(StringMers::new("NNNN", 4));
        assert!(found.is_empty());
    }

    #[test]
    fn test_k_equals_1() {
        let found = render(StringMers::new("ACGT", 1));
        assert_eq!(found, ["A", "C", "G", "T"]);
    }

    #[test]
    fn test_lowercase() {
        // Lowercase input is expected to come back upper-cased.
        let found = render(StringMers::new("acgt", 4));
        assert_eq!(found, ["ACGT"]);
    }

    #[test]
    fn test_canonical_mode() {
        let found = render(StringMers::canonicals("TTTT", 4));
        assert_eq!(found, ["AAAA"]);
    }

    #[test]
    fn test_canonical_palindrome() {
        let found = render(StringMers::canonicals("ACGT", 4));
        assert_eq!(found, ["ACGT"]);
    }

    #[test]
    fn test_canonical_various() {
        // Every yielded mer must already be its own canonical form.
        for text in render(StringMers::canonicals("ACGTACGT", 4)) {
            let parsed: MerDna = text.parse().unwrap();
            assert_eq!(parsed.get_canonical().to_string(), text);
        }
    }

    #[test]
    fn test_convenience_functions() {
        let via_fn = render(string_mers("ACGT", 4));
        let via_ctor = render(StringMers::new("ACGT", 4));
        assert_eq!(via_fn, via_ctor);

        let canon_via_fn = render(string_canonicals("ACGT", 4));
        let canon_via_ctor = render(StringMers::canonicals("ACGT", 4));
        assert_eq!(canon_via_fn, canon_via_ctor);
    }

    #[test]
    fn test_multiple_invalid_regions() {
        let found = render(StringMers::new("ACGTNNTACG", 3));
        assert_eq!(found, ["ACG", "CGT", "TAC", "ACG"]);
    }

    #[test]
    fn test_long_sequence() {
        let seq = "ACGT".repeat(100);
        let found = render(StringMers::new(&seq, 25));
        // One mer per start position: len - k + 1.
        assert_eq!(found.len(), seq.len() - 24);
    }

    #[test]
    fn test_homopolymer_run() {
        let found = render(StringMers::new("AAAAA", 3));
        assert_eq!(found, ["AAA", "AAA", "AAA"]);
    }
}