minimizer_iter/
lib.rs

1pub mod algorithm;
2mod builder;
3pub mod iterator;
4
5pub use builder::MinimizerBuilder;
6pub use minimizer_queue::DefaultHashBuilder;
7
8#[cfg(test)]
9mod tests {
10    use super::*;
11    use biotest::Format;
12    use nohash_hasher::BuildNoHashHasher;
13
14    #[test]
15    fn test_minimizer_iter() {
16        let seq = b"TGATTGCACAATC";
17        let minimizer_size = 3;
18        let width = 4;
19        let hasher = BuildNoHashHasher::<u64>::default();
20        let mut min_iter = MinimizerBuilder::new()
21            .minimizer_size(minimizer_size)
22            .width(width)
23            .hasher(hasher)
24            .iter(seq);
25
26        assert_eq!(min_iter.next(), Some((0b001111, 2))); // ATT
27        assert_eq!(min_iter.next(), Some((0b010001, 6))); // CAC
28        assert_eq!(min_iter.next(), Some((0b000100, 7))); // ACA
29        assert_eq!(min_iter.next(), Some((0b000011, 9))); // AAT
30        assert_eq!(min_iter.next(), None);
31    }
32
33    #[test]
34    fn test_minimizer_iter_pos() {
35        let seq = b"TGATTGCACAATC";
36        let minimizer_size = 3;
37        let width = 4;
38        let hasher = BuildNoHashHasher::<u64>::default();
39        let mut min_iter = MinimizerBuilder::<u64>::new()
40            .minimizer_size(minimizer_size)
41            .width(width)
42            .hasher(hasher)
43            .iter_pos(seq);
44
45        assert_eq!(min_iter.next(), Some(2)); // ATT
46        assert_eq!(min_iter.next(), Some(6)); // CAC
47        assert_eq!(min_iter.next(), Some(7)); // ACA
48        assert_eq!(min_iter.next(), Some(9)); // AAT
49        assert_eq!(min_iter.next(), None);
50    }
51
52    #[test]
53    fn test_mod_minimizer_iter() {
54        let seq = b"TGATTGCACAATC";
55        let minimizer_size = 4;
56        let width = 4;
57        let hasher = BuildNoHashHasher::<u64>::default();
58        let mut min_iter = MinimizerBuilder::new_mod()
59            .minimizer_size(minimizer_size)
60            .width(width)
61            .hasher(hasher)
62            .iter(seq);
63
64        assert_eq!(min_iter.next(), Some((0b00111110, 2))); // ATTG
65        assert_eq!(min_iter.next(), Some((0b01000100, 6))); // CACA
66        assert_eq!(min_iter.next(), Some((0b00010000, 7))); // ACAA
67        assert_eq!(min_iter.next(), Some((0b00001101, 9))); // AATC
68        assert_eq!(min_iter.next(), None);
69    }
70
71    #[test]
72    fn test_mod_minimizer_iter_pos() {
73        let seq = b"TGATTGCACAATC";
74        let minimizer_size = 4;
75        let width = 4;
76        let hasher = BuildNoHashHasher::<u64>::default();
77        let mut min_iter = MinimizerBuilder::<u64, _>::new_mod()
78            .minimizer_size(minimizer_size)
79            .width(width)
80            .hasher(hasher)
81            .iter_pos(seq);
82
83        assert_eq!(min_iter.next(), Some(2)); // ATTG
84        assert_eq!(min_iter.next(), Some(6)); // CACA
85        assert_eq!(min_iter.next(), Some(7)); // ACAA
86        assert_eq!(min_iter.next(), Some(9)); // AATC
87        assert_eq!(min_iter.next(), None);
88    }
89
90    fn gen_seq(len: usize) -> Vec<u8> {
91        let mut rng = biotest::rand();
92        let mut seq = Vec::with_capacity(len);
93        let generator = biotest::Sequence::builder()
94            .sequence_len(len)
95            .build()
96            .unwrap();
97        generator.record(&mut seq, &mut rng).unwrap();
98        seq
99    }
100
101    fn rc(seq: &[u8]) -> Vec<u8> {
102        seq.iter()
103            .rev()
104            .map(|&b| match b {
105                b'A' => b'T',
106                b'a' => b't',
107                b'T' => b'A',
108                b't' => b'a',
109                b'C' => b'G',
110                b'c' => b'g',
111                b'G' => b'C',
112                b'g' => b'c',
113                b => b,
114            })
115            .collect()
116    }
117
118    #[test]
119    fn test_canonical_minimizer_iter() {
120        let seq_len = 1_000_000;
121        let seq = &gen_seq(seq_len);
122        let seq_rc = &rc(seq);
123        let minimizer_size = 21;
124        let width = 11;
125
126        let mins: Vec<u64> = MinimizerBuilder::new()
127            .canonical()
128            .minimizer_size(minimizer_size)
129            .width(width)
130            .iter(seq)
131            .map(|(min, _, _)| min)
132            .collect();
133        let mut mins_rc: Vec<u64> = MinimizerBuilder::new()
134            .canonical()
135            .minimizer_size(minimizer_size)
136            .width(width)
137            .iter(seq_rc)
138            .map(|(min, _, _)| min)
139            .collect();
140        mins_rc.reverse();
141
142        assert_eq!(mins, mins_rc);
143    }
144
145    #[test]
146    fn test_canonical_minimizer_iter_pos() {
147        let seq_len = 1_000_000;
148        let seq = &gen_seq(seq_len);
149        let seq_rc = &rc(seq);
150        let minimizer_size = 21;
151        let width = 11;
152
153        let mins: Vec<_> = MinimizerBuilder::<u64>::new()
154            .canonical()
155            .minimizer_size(minimizer_size)
156            .width(width)
157            .iter_pos(seq)
158            .map(|(pos, _)| pos)
159            .collect();
160        let mut mins_rc: Vec<_> = MinimizerBuilder::<u64>::new()
161            .canonical()
162            .minimizer_size(minimizer_size)
163            .width(width)
164            .iter_pos(seq_rc)
165            .map(|(pos, _)| seq_len - pos - minimizer_size)
166            .collect();
167        mins_rc.reverse();
168
169        assert_eq!(mins, mins_rc);
170    }
171
172    #[test]
173    fn test_canonical_mod_minimizer_iter() {
174        let seq_len = 1_000_000;
175        let seq = &gen_seq(seq_len);
176        let seq_rc = &rc(seq);
177        let minimizer_size = 21;
178        let width = 11;
179
180        let mins: Vec<_> = MinimizerBuilder::<u64, _>::new_mod()
181            .canonical()
182            .minimizer_size(minimizer_size)
183            .width(width)
184            .iter(seq)
185            .map(|(min, _, _)| min)
186            .collect();
187        let mut mins_rc: Vec<_> = MinimizerBuilder::<u64, _>::new_mod()
188            .canonical()
189            .minimizer_size(minimizer_size)
190            .width(width)
191            .iter(seq_rc)
192            .map(|(min, _, _)| min)
193            .collect();
194        mins_rc.reverse();
195
196        assert_eq!(mins, mins_rc);
197    }
198
199    #[test]
200    fn test_canonical_mod_minimizer_iter_pos() {
201        let seq_len = 1_000_000;
202        let seq = &gen_seq(seq_len);
203        let seq_rc = &rc(seq);
204        let minimizer_size = 21;
205        let width = 11;
206        let mins: Vec<_> = MinimizerBuilder::<u64, _>::new_mod()
207            .canonical()
208            .minimizer_size(minimizer_size)
209            .width(width)
210            .iter_pos(seq)
211            .map(|(pos, _)| pos)
212            .collect();
213        let mut mins_rc: Vec<_> = MinimizerBuilder::<u64, _>::new_mod()
214            .canonical()
215            .minimizer_size(minimizer_size)
216            .width(width)
217            .iter_pos(seq_rc)
218            .map(|(pos, _)| seq_len - pos - minimizer_size)
219            .collect();
220        mins_rc.reverse();
221
222        assert_eq!(mins, mins_rc);
223    }
224
225    #[test]
226    fn test_repetitive_minimizer_iter_pos() {
227        const SEQ_LEN: usize = 100;
228        let seq = &[b'A'; SEQ_LEN];
229        let seq_rc = &rc(seq);
230        let minimizer_size = 21;
231        let width = 11;
232
233        let mins: Vec<_> = MinimizerBuilder::<u64>::new()
234            .canonical()
235            .minimizer_size(minimizer_size)
236            .width(width)
237            .iter_pos(seq)
238            .map(|(pos, _)| pos)
239            .collect();
240        let mut mins_rc: Vec<_> = MinimizerBuilder::<u64>::new()
241            .canonical()
242            .minimizer_size(minimizer_size)
243            .width(width)
244            .iter_pos(seq_rc)
245            .map(|(pos, _)| SEQ_LEN - pos - minimizer_size)
246            .collect();
247        mins_rc.reverse();
248
249        assert_eq!(mins, mins_rc);
250    }
251
252    #[test]
253    fn test_repetitive_2_minimizer_iter_pos() {
254        const SEQ_LEN: usize = 100;
255        let seq = &mut [b'A'; SEQ_LEN];
256        for i in (1..SEQ_LEN).step_by(2) {
257            seq[i] = b'G';
258        }
259        let seq_rc = &rc(seq);
260        let minimizer_size = 21;
261        let width = 11;
262
263        let mins: Vec<_> = MinimizerBuilder::<u64>::new()
264            .canonical()
265            .minimizer_size(minimizer_size)
266            .width(width)
267            .iter_pos(seq)
268            .map(|(pos, _)| pos)
269            .collect();
270        let mut mins_rc: Vec<_> = MinimizerBuilder::<u64>::new()
271            .canonical()
272            .minimizer_size(minimizer_size)
273            .width(width)
274            .iter_pos(seq_rc)
275            .map(|(pos, _)| SEQ_LEN - pos - minimizer_size)
276            .collect();
277        mins_rc.reverse();
278
279        assert_eq!(mins, mins_rc);
280    }
281}