1use core::cmp::min;
2use core::hash::{BuildHasher, Hash};
3use minimizer_queue::{DefaultHashBuilder, ImplicitMinimizerQueue, MinimizerQueue};
4use num_traits::{AsPrimitive, PrimInt};
5use std::collections::VecDeque;
6
7pub struct MinimizerPosIterator<'a, T: PrimInt + Hash = u64, S: BuildHasher = DefaultHashBuilder> {
9 pub(crate) seq: &'a [u8],
10 pub(crate) queue: ImplicitMinimizerQueue<S>,
11 pub(crate) width: usize,
12 pub(crate) mmer: T,
13 pub(crate) mmer_mask: T,
14 pub(crate) encoding: [u8; 256],
15 pub(crate) base_width: usize,
16 pub(crate) min_pos: usize,
17 pub(crate) end: usize,
18}
19
20impl<'a, T: PrimInt + Hash, S: BuildHasher> MinimizerPosIterator<'a, T, S> {
21 pub fn new(
22 seq: &'a [u8],
23 minimizer_size: usize,
24 width: u16,
25 hasher: S,
26 encoding: [u8; 256],
27 ) -> Self {
28 let queue = ImplicitMinimizerQueue::with_hasher(width, hasher);
29 let width = width as usize;
30 Self {
31 seq,
32 queue,
33 width,
34 mmer: T::zero(),
35 mmer_mask: (T::one() << (2 * minimizer_size)) - T::one(),
36 encoding,
37 base_width: width + minimizer_size - 1,
38 end: width + minimizer_size - 1,
39 min_pos: 0,
40 }
41 }
42}
43
44impl<'a, T: PrimInt + Hash + 'static, S: BuildHasher> Iterator for MinimizerPosIterator<'a, T, S>
45where
46 u8: AsPrimitive<T>,
47{
48 type Item = usize;
49
50 fn next(&mut self) -> Option<Self::Item> {
51 if self.queue.is_empty() {
52 if self.base_width > self.seq.len() {
53 return None;
54 }
55 for i in 0..(self.base_width - self.width) {
56 self.mmer = (self.mmer << 2)
57 | (unsafe { self.encoding.get_unchecked(self.seq[i] as usize) }.as_());
58 }
59 for i in (self.base_width - self.width)..self.base_width {
60 self.mmer = ((self.mmer << 2) & self.mmer_mask)
61 | (unsafe { self.encoding.get_unchecked(self.seq[i] as usize) }.as_());
62 self.queue.insert(&self.mmer);
63 }
64 self.min_pos = self.queue.get_min_pos();
65 } else {
66 let mut min_pos = self.min_pos;
67 while self.end < self.seq.len() && min_pos == self.min_pos {
68 self.mmer = ((self.mmer << 2) & self.mmer_mask)
69 | (unsafe { self.encoding.get_unchecked(self.seq[self.end] as usize) }.as_());
70 self.queue.insert(&self.mmer);
71 self.end += 1;
72 min_pos = self.end - self.base_width + self.queue.get_min_pos();
73 }
74 if min_pos == self.min_pos {
75 return None;
76 }
77 self.min_pos = min_pos;
78 }
79 Some(self.min_pos)
80 }
81}
82
83pub struct MinimizerIterator<'a, T: PrimInt + Hash = u64, S: BuildHasher = DefaultHashBuilder> {
85 pub(crate) seq: &'a [u8],
86 pub(crate) queue: MinimizerQueue<T, S>,
87 pub(crate) width: usize,
88 pub(crate) mmer: T,
89 pub(crate) mmer_mask: T,
90 pub(crate) encoding: [u8; 256],
91 pub(crate) base_width: usize,
92 pub(crate) min_pos: (T, usize),
93 pub(crate) end: usize,
94}
95
96impl<'a, T: PrimInt + Hash, S: BuildHasher> MinimizerIterator<'a, T, S> {
97 pub fn new(
98 seq: &'a [u8],
99 minimizer_size: usize,
100 width: u16,
101 hasher: S,
102 encoding: [u8; 256],
103 ) -> Self {
104 let queue = MinimizerQueue::with_hasher(width, hasher);
105 let width = width as usize;
106 Self {
107 seq,
108 queue,
109 width,
110 mmer: T::zero(),
111 mmer_mask: (T::one() << (2 * minimizer_size)) - T::one(),
112 encoding,
113 base_width: width + minimizer_size - 1,
114 end: width + minimizer_size - 1,
115 min_pos: (T::zero(), 0),
116 }
117 }
118}
119
120impl<'a, T: PrimInt + Hash + 'static, S: BuildHasher> Iterator for MinimizerIterator<'a, T, S>
121where
122 u8: AsPrimitive<T>,
123{
124 type Item = (T, usize);
125
126 fn next(&mut self) -> Option<Self::Item> {
127 if self.queue.is_empty() {
128 if self.base_width > self.seq.len() {
129 return None;
130 }
131 for i in 0..(self.base_width - self.width) {
132 self.mmer = (self.mmer << 2)
133 | (unsafe { self.encoding.get_unchecked(self.seq[i] as usize) }.as_());
134 }
135 for i in (self.base_width - self.width)..self.base_width {
136 self.mmer = ((self.mmer << 2) & self.mmer_mask)
137 | (unsafe { self.encoding.get_unchecked(self.seq[i] as usize) }.as_());
138 self.queue.insert(self.mmer);
139 }
140 self.min_pos = self.queue.get_min_pos();
141 } else {
142 let mut min_pos = self.min_pos;
143 while self.end < self.seq.len() && min_pos.1 == self.min_pos.1 {
144 self.mmer = ((self.mmer << 2) & self.mmer_mask)
145 | (unsafe { self.encoding.get_unchecked(self.seq[self.end] as usize) }.as_());
146 self.queue.insert(self.mmer);
147 self.end += 1;
148 let _min_pos = self.queue.get_min_pos();
149 min_pos = (_min_pos.0, self.end - self.base_width + _min_pos.1);
150 }
151 if min_pos.1 == self.min_pos.1 {
152 return None;
153 }
154 self.min_pos = min_pos;
155 }
156 Some(self.min_pos)
157 }
158}
159
160pub struct CanonicalMinimizerPosIterator<
163 'a,
164 T: PrimInt + Hash = u64,
165 S: BuildHasher = DefaultHashBuilder,
166> {
167 pub(crate) seq: &'a [u8],
168 pub(crate) queue: ImplicitMinimizerQueue<S>,
169 pub(crate) width: usize,
170 pub(crate) mmer: T,
171 pub(crate) rc_mmer: T,
172 pub(crate) mmer_mask: T,
173 pub(crate) rc_mmer_shift: usize,
174 pub(crate) is_rc: VecDeque<bool>,
175 pub(crate) encoding: [u8; 256],
176 pub(crate) rc_encoding: [u8; 256],
177 pub(crate) base_width: usize,
178 pub(crate) min_pos: (usize, bool),
179 pub(crate) end: usize,
180}
181
182impl<'a, T: PrimInt + Hash, S: BuildHasher> CanonicalMinimizerPosIterator<'a, T, S> {
183 pub fn new(
184 seq: &'a [u8],
185 minimizer_size: usize,
186 width: u16,
187 hasher: S,
188 encoding: [u8; 256],
189 ) -> Self {
190 let queue = ImplicitMinimizerQueue::with_hasher(width, hasher);
191 let width = width as usize;
192 assert_eq!(
193 width % 2,
194 1,
195 "width must be odd to break ties between multiple minimizers"
196 );
197 let mut rc_encoding = encoding;
198 rc_encoding.swap(b'A' as usize, b'T' as usize);
199 rc_encoding.swap(b'a' as usize, b't' as usize);
200 rc_encoding.swap(b'C' as usize, b'G' as usize);
201 rc_encoding.swap(b'c' as usize, b'g' as usize);
202 Self {
203 seq,
204 queue,
205 width,
206 mmer: T::zero(),
207 rc_mmer: T::zero(),
208 mmer_mask: (T::one() << (2 * minimizer_size)) - T::one(),
209 rc_mmer_shift: 2 * (minimizer_size - 1),
210 is_rc: VecDeque::with_capacity(width),
211 encoding,
212 rc_encoding,
213 base_width: width + minimizer_size - 1,
214 end: width + minimizer_size - 1,
215 min_pos: (0, false),
216 }
217 }
218
219 #[inline]
220 fn window_not_canonical(&self) -> bool {
221 self.is_rc[self.width / 2]
222 }
223}
224
225impl<'a, T: PrimInt + Hash + 'static, S: BuildHasher> Iterator
226 for CanonicalMinimizerPosIterator<'a, T, S>
227where
228 u8: AsPrimitive<T>,
229{
230 type Item = (usize, bool);
231
232 fn next(&mut self) -> Option<Self::Item> {
233 if self.queue.is_empty() {
234 if self.base_width > self.seq.len() {
235 return None;
236 }
237 for i in 0..(self.base_width - self.width) {
238 self.mmer = (self.mmer << 2)
239 | (unsafe { self.encoding.get_unchecked(self.seq[i] as usize) }.as_());
240 self.rc_mmer = (self.rc_mmer >> 2)
241 | (unsafe { self.rc_encoding.get_unchecked(self.seq[i] as usize) }.as_()
242 << self.rc_mmer_shift);
243 }
244 for i in (self.base_width - self.width)..self.base_width {
245 self.mmer = ((self.mmer << 2) & self.mmer_mask)
246 | (unsafe { self.encoding.get_unchecked(self.seq[i] as usize) }.as_());
247 self.rc_mmer = (self.rc_mmer >> 2)
248 | (unsafe { self.rc_encoding.get_unchecked(self.seq[i] as usize) }.as_()
249 << self.rc_mmer_shift);
250 let canonical_mmer = min(self.mmer, self.rc_mmer);
251 self.queue.insert(&canonical_mmer);
252 self.is_rc.push_back(canonical_mmer == self.rc_mmer);
253 }
254 let pos = if self.queue.multiple_mins() {
255 let (pos, tie) = self.queue.get_inner_min_pos();
256 tie.map_or(pos, |alt| {
257 if self.window_not_canonical() {
258 alt
259 } else {
260 pos
261 }
262 })
263 } else {
264 self.queue.get_min_pos()
265 };
266 self.min_pos = (pos, self.is_rc[pos])
267 } else {
268 let mut min_pos = self.min_pos;
269 while self.end < self.seq.len() && min_pos == self.min_pos {
270 self.mmer = ((self.mmer << 2) & self.mmer_mask)
271 | (unsafe { self.encoding.get_unchecked(self.seq[self.end] as usize) }.as_());
272 self.rc_mmer = (self.rc_mmer >> 2)
273 | (unsafe { self.rc_encoding.get_unchecked(self.seq[self.end] as usize) }
274 .as_()
275 << self.rc_mmer_shift);
276 let canonical_mmer = min(self.mmer, self.rc_mmer);
277 self.queue.insert(&canonical_mmer);
278 self.is_rc.pop_front();
279 self.is_rc.push_back(canonical_mmer == self.rc_mmer);
280 self.end += 1;
281 let pos = if self.queue.multiple_mins() {
282 let (pos, tie) = self.queue.get_inner_min_pos();
283 tie.map_or(pos, |alt| {
284 if self.window_not_canonical() {
285 alt
286 } else {
287 pos
288 }
289 })
290 } else {
291 self.queue.get_min_pos()
292 };
293 min_pos = (self.end - self.base_width + pos, self.is_rc[pos]);
294 }
295 if min_pos == self.min_pos {
296 return None;
297 }
298 self.min_pos = min_pos;
299 }
300 Some(self.min_pos)
301 }
302}
303
304pub struct CanonicalMinimizerIterator<
307 'a,
308 T: PrimInt + Hash = u64,
309 S: BuildHasher = DefaultHashBuilder,
310> {
311 pub(crate) seq: &'a [u8],
312 pub(crate) queue: MinimizerQueue<T, S>,
313 pub(crate) width: usize,
314 pub(crate) mmer: T,
315 pub(crate) rc_mmer: T,
316 pub(crate) mmer_mask: T,
317 pub(crate) rc_mmer_shift: usize,
318 pub(crate) is_rc: VecDeque<bool>,
319 pub(crate) encoding: [u8; 256],
320 pub(crate) rc_encoding: [u8; 256],
321 pub(crate) base_width: usize,
322 pub(crate) min_pos: (T, usize, bool),
323 pub(crate) end: usize,
324}
325
326impl<'a, T: PrimInt + Hash, S: BuildHasher> CanonicalMinimizerIterator<'a, T, S> {
327 pub fn new(
328 seq: &'a [u8],
329 minimizer_size: usize,
330 width: u16,
331 hasher: S,
332 encoding: [u8; 256],
333 ) -> Self {
334 let queue = MinimizerQueue::with_hasher(width, hasher);
335 let width = width as usize;
336 assert_eq!(
337 width % 2,
338 1,
339 "width must be odd to break ties between multiple minimizers"
340 );
341 let mut rc_encoding = encoding;
342 rc_encoding.swap(b'A' as usize, b'T' as usize);
343 rc_encoding.swap(b'a' as usize, b't' as usize);
344 rc_encoding.swap(b'C' as usize, b'G' as usize);
345 rc_encoding.swap(b'c' as usize, b'g' as usize);
346 Self {
347 seq,
348 queue,
349 width,
350 mmer: T::zero(),
351 rc_mmer: T::zero(),
352 mmer_mask: (T::one() << (2 * minimizer_size)) - T::one(),
353 rc_mmer_shift: 2 * (minimizer_size - 1),
354 is_rc: VecDeque::with_capacity(width),
355 encoding,
356 rc_encoding,
357 base_width: width + minimizer_size - 1,
358 end: width + minimizer_size - 1,
359 min_pos: (T::zero(), 0, false),
360 }
361 }
362
363 #[inline]
364 fn window_not_canonical(&self) -> bool {
365 self.is_rc[self.width / 2]
366 }
367}
368
369impl<'a, T: PrimInt + Hash + 'static, S: BuildHasher> Iterator
370 for CanonicalMinimizerIterator<'a, T, S>
371where
372 u8: AsPrimitive<T>,
373{
374 type Item = (T, usize, bool);
375
376 fn next(&mut self) -> Option<Self::Item> {
377 if self.queue.is_empty() {
378 if self.base_width > self.seq.len() {
379 return None;
380 }
381 for i in 0..(self.base_width - self.width) {
382 self.mmer = (self.mmer << 2)
383 | (unsafe { self.encoding.get_unchecked(self.seq[i] as usize) }.as_());
384 self.rc_mmer = (self.rc_mmer >> 2)
385 | (unsafe { self.rc_encoding.get_unchecked(self.seq[i] as usize) }.as_()
386 << self.rc_mmer_shift);
387 }
388 for i in (self.base_width - self.width)..self.base_width {
389 self.mmer = ((self.mmer << 2) & self.mmer_mask)
390 | (unsafe { self.encoding.get_unchecked(self.seq[i] as usize) }.as_());
391 self.rc_mmer = (self.rc_mmer >> 2)
392 | (unsafe { self.rc_encoding.get_unchecked(self.seq[i] as usize) }.as_()
393 << self.rc_mmer_shift);
394 let canonical_mmer = min(self.mmer, self.rc_mmer);
395 self.queue.insert(canonical_mmer);
396 self.is_rc.push_back(canonical_mmer == self.rc_mmer);
397 }
398 let _min_pos = if self.queue.multiple_mins() {
399 let (x, pos, tie) = self.queue.get_inner_min_pos();
400 tie.map_or((x, pos), |alt| {
401 if self.window_not_canonical() {
402 alt
403 } else {
404 (x, pos)
405 }
406 })
407 } else {
408 self.queue.get_min_pos()
409 };
410 self.min_pos = (_min_pos.0, _min_pos.1, self.is_rc[_min_pos.1]);
411 } else {
412 let mut min_pos = self.min_pos;
413 while self.end < self.seq.len() && min_pos.1 == self.min_pos.1 {
414 self.mmer = ((self.mmer << 2) & self.mmer_mask)
415 | (unsafe { self.encoding.get_unchecked(self.seq[self.end] as usize) }.as_());
416 self.rc_mmer = (self.rc_mmer >> 2)
417 | (unsafe { self.rc_encoding.get_unchecked(self.seq[self.end] as usize) }
418 .as_()
419 << self.rc_mmer_shift);
420 let canonical_mmer = min(self.mmer, self.rc_mmer);
421 self.queue.insert(canonical_mmer);
422 self.is_rc.pop_front();
423 self.is_rc.push_back(canonical_mmer == self.rc_mmer);
424 self.end += 1;
425 let _min_pos = if self.queue.multiple_mins() {
426 let (x, pos, tie) = self.queue.get_inner_min_pos();
427 tie.map_or((x, pos), |alt| {
428 if self.window_not_canonical() {
429 alt
430 } else {
431 (x, pos)
432 }
433 })
434 } else {
435 self.queue.get_min_pos()
436 };
437 min_pos = (
438 _min_pos.0,
439 self.end - self.base_width + _min_pos.1,
440 self.is_rc[_min_pos.1],
441 );
442 }
443 if min_pos.1 == self.min_pos.1 {
444 return None;
445 }
446 self.min_pos = min_pos;
447 }
448 Some(self.min_pos)
449 }
450}