1use core::cmp::min;
2use core::hash::{BuildHasher, Hash};
3use minimizer_queue::{DefaultHashBuilder, ImplicitMinimizerQueue};
4use num_traits::{AsPrimitive, PrimInt};
5use std::collections::VecDeque;
6use strength_reduce::StrengthReducedU16;
7
/// Iterator over the positions of mod-sampling minimizers in a sequence.
///
/// Yields the start position of each sampled minimizer: the minimum t-mer in
/// a widened window is located, and its offset reduced modulo the minimizer
/// window width picks the sampled position (mod-sampling scheme).
pub struct ModSamplingPosIterator<'a, T: PrimInt + Hash = u64, S: BuildHasher = DefaultHashBuilder>
{
    // Sequence being scanned (ASCII bytes, mapped through `encoding`).
    pub(crate) seq: &'a [u8],
    // Sliding-window queue tracking the minimum hashed t-mer.
    pub(crate) queue: ImplicitMinimizerQueue<S>,
    // Minimizer window width, pre-reduced for fast `%`.
    pub(crate) width_m: StrengthReducedU16,
    // Widened t-mer window width: width + (minimizer_size - t).
    pub(crate) width_t: usize,
    // Rolling 2-bit-encoded t-mer.
    pub(crate) tmer: T,
    // Mask keeping the low 2*t bits of `tmer`.
    pub(crate) tmer_mask: T,
    // Byte -> 2-bit code lookup table.
    pub(crate) encoding: [u8; 256],
    // Bases covered by one full window: width_t + t - 1.
    pub(crate) base_width: usize,
    // Most recently yielded minimizer position (0 before the first call).
    pub(crate) min_pos: usize,
    // Exclusive end (in bases) of the window processed so far.
    pub(crate) end: usize,
}
22
23impl<'a, T: PrimInt + Hash, S: BuildHasher> ModSamplingPosIterator<'a, T, S> {
24 pub fn new(
25 seq: &'a [u8],
26 minimizer_size: usize,
27 width: u16,
28 t: usize,
29 hasher: S,
30 encoding: [u8; 256],
31 ) -> Self {
32 let width_m = StrengthReducedU16::new(width);
33 let width_t = width + (minimizer_size - t) as u16;
34 let queue = ImplicitMinimizerQueue::with_hasher(width_t, hasher);
35 let width_t = width_t as usize;
36 Self {
37 seq,
38 queue,
39 width_m,
40 width_t,
41 tmer: T::zero(),
42 tmer_mask: (T::one() << (2 * t)) - T::one(),
43 encoding,
44 base_width: width_t + t - 1,
45 end: width_t + t - 1,
46 min_pos: 0,
47 }
48 }
49}
50
impl<'a, T: PrimInt + Hash + 'static, S: BuildHasher> Iterator for ModSamplingPosIterator<'a, T, S>
where
    u8: AsPrimitive<T>,
{
    // Start position of the next sampled minimizer.
    type Item = usize;

    fn next(&mut self) -> Option<Self::Item> {
        if self.queue.is_empty() {
            // First call: prime the queue with the initial window.
            if self.base_width > self.seq.len() {
                // Sequence shorter than one full window: nothing to yield.
                return None;
            }
            // Roll in the first t-1 bases; no complete t-mer yet, so nothing
            // is inserted (no masking needed: tmer starts at zero).
            // SAFETY: seq[i] is a u8, so it always indexes the 256-entry table.
            for i in 0..(self.base_width - self.width_t) {
                self.tmer = (self.tmer << 2)
                    | (unsafe { self.encoding.get_unchecked(self.seq[i] as usize) }.as_());
            }
            // Roll through the remaining bases, inserting one complete t-mer
            // into the queue per base.
            for i in (self.base_width - self.width_t)..self.base_width {
                self.tmer = ((self.tmer << 2) & self.tmer_mask)
                    | (unsafe { self.encoding.get_unchecked(self.seq[i] as usize) }.as_());
                self.queue.insert(&self.tmer);
            }
            // Mod-sampling: the minimum t-mer's offset modulo the minimizer
            // window width gives the sampled position.
            self.min_pos = (self.queue.get_min_pos() as u16 % self.width_m) as usize;
        } else {
            // Slide the window one base at a time until the sampled position
            // changes (or the sequence is exhausted).
            let mut min_pos = self.min_pos;
            while self.end < self.seq.len() && min_pos == self.min_pos {
                self.tmer = ((self.tmer << 2) & self.tmer_mask)
                    | (unsafe { self.encoding.get_unchecked(self.seq[self.end] as usize) }.as_());
                self.queue.insert(&self.tmer);
                self.end += 1;
                // Absolute position = window start + in-window offset (mod width).
                min_pos = self.end - self.base_width
                    + (self.queue.get_min_pos() as u16 % self.width_m) as usize;
            }
            if min_pos == self.min_pos {
                // Sequence exhausted with no new minimizer.
                return None;
            }
            self.min_pos = min_pos;
        }
        Some(self.min_pos)
    }
}
90
/// Iterator over mod-sampling minimizers and their positions in a sequence.
///
/// Like [`ModSamplingPosIterator`], but also yields the 2-bit-encoded
/// minimizer (m-mer) itself at each sampled position.
pub struct ModSamplingIterator<'a, T: PrimInt + Hash = u64, S: BuildHasher = DefaultHashBuilder> {
    // Sequence being scanned (ASCII bytes, mapped through `encoding`).
    pub(crate) seq: &'a [u8],
    // Sliding-window queue tracking the minimum hashed t-mer.
    pub(crate) queue: ImplicitMinimizerQueue<S>,
    // Minimizer window width, pre-reduced for fast `%`.
    pub(crate) width_m: StrengthReducedU16,
    // Widened t-mer window width: width + (minimizer_size - t).
    pub(crate) width_t: usize,
    // Rolling 2-bit-encoded m-mer; its low 2*t bits form the current t-mer.
    pub(crate) mmer: T,
    // Mask keeping the low 2*minimizer_size bits of `mmer`.
    pub(crate) mmer_mask: T,
    // Mask extracting the low 2*t bits (the t-mer fed to the queue).
    pub(crate) tmer_mask: T,
    // Buffer of the m-mers at the `width` candidate positions of the window.
    pub(crate) canon_mmers: VecDeque<T>,
    // Byte -> 2-bit code lookup table.
    pub(crate) encoding: [u8; 256],
    // Bases covered by one full window: width_t + t - 1.
    pub(crate) base_width: usize,
    // Most recently yielded (minimizer, position).
    pub(crate) min_pos: (T, usize),
    // Exclusive end (in bases) of the window processed so far.
    pub(crate) end: usize,
}
106
107impl<'a, T: PrimInt + Hash, S: BuildHasher> ModSamplingIterator<'a, T, S> {
108 pub fn new(
109 seq: &'a [u8],
110 minimizer_size: usize,
111 width: u16,
112 t: usize,
113 hasher: S,
114 encoding: [u8; 256],
115 ) -> Self {
116 let width_m = StrengthReducedU16::new(width);
117 let width_t = width + (minimizer_size - t) as u16;
118 let queue = ImplicitMinimizerQueue::with_hasher(width_t, hasher);
119 let width_t = width_t as usize;
120 Self {
121 seq,
122 queue,
123 width_m,
124 width_t,
125 mmer: T::zero(),
126 mmer_mask: (T::one() << (2 * minimizer_size)) - T::one(),
127 tmer_mask: (T::one() << (2 * t)) - T::one(),
128 canon_mmers: VecDeque::with_capacity(width as usize),
129 encoding,
130 base_width: width_t + t - 1,
131 end: width_t + t - 1,
132 min_pos: (T::zero(), 0),
133 }
134 }
135}
136
impl<'a, T: PrimInt + Hash + 'static, S: BuildHasher> Iterator for ModSamplingIterator<'a, T, S>
where
    u8: AsPrimitive<T>,
{
    // (minimizer, start position) of the next sampled minimizer.
    type Item = (T, usize);

    fn next(&mut self) -> Option<Self::Item> {
        if self.queue.is_empty() {
            // First call: prime the queue and the m-mer buffer with the
            // initial window.
            if self.base_width > self.seq.len() {
                // Sequence shorter than one full window: nothing to yield.
                return None;
            }
            let width_m = self.width_m.get() as usize;
            // Roll in the leading bases; no complete t-mer yet.
            // SAFETY: seq[i] is a u8, so it always indexes the 256-entry table.
            for i in 0..(self.base_width - self.width_t) {
                self.mmer = (self.mmer << 2)
                    | (unsafe { self.encoding.get_unchecked(self.seq[i] as usize) }.as_());
            }
            // Complete t-mers appear: feed the queue only. Their m-mers end
            // before the last `width_m` candidate positions, so none are buffered.
            for i in (self.base_width - self.width_t)..(self.base_width - width_m) {
                self.mmer = (self.mmer << 2)
                    | (unsafe { self.encoding.get_unchecked(self.seq[i] as usize) }.as_());
                // The t-mer is the low 2*t bits of the rolling m-mer.
                self.queue.insert(&(self.mmer & self.tmer_mask));
            }
            // Last `width_m` bases: also buffer the full m-mer at each of the
            // `width_m` candidate minimizer positions.
            for i in (self.base_width - width_m)..self.base_width {
                self.mmer = ((self.mmer << 2) & self.mmer_mask)
                    | (unsafe { self.encoding.get_unchecked(self.seq[i] as usize) }.as_());
                self.queue.insert(&(self.mmer & self.tmer_mask));
                self.canon_mmers.push_back(self.mmer);
            }
            // Mod-sampling: minimum t-mer offset modulo the window width.
            let _min_pos = (self.queue.get_min_pos() as u16 % self.width_m) as usize;
            self.min_pos = (self.canon_mmers[_min_pos], _min_pos);
        } else {
            // Slide one base at a time until the sampled position changes.
            let mut min_pos = self.min_pos;
            while self.end < self.seq.len() && min_pos.1 == self.min_pos.1 {
                self.mmer = ((self.mmer << 2) & self.mmer_mask)
                    | (unsafe { self.encoding.get_unchecked(self.seq[self.end] as usize) }.as_());
                self.queue.insert(&(self.mmer & self.tmer_mask));
                // Keep the m-mer buffer aligned with the current window.
                self.canon_mmers.pop_front();
                self.canon_mmers.push_back(self.mmer);
                self.end += 1;
                let _min_pos = (self.queue.get_min_pos() as u16 % self.width_m) as usize;
                // Absolute position = window start + in-window offset.
                min_pos = (
                    self.canon_mmers[_min_pos],
                    self.end - self.base_width + _min_pos,
                );
            }
            if min_pos.1 == self.min_pos.1 {
                // Sequence exhausted with no new minimizer.
                return None;
            }
            self.min_pos = min_pos;
        }
        Some(self.min_pos)
    }
}
189
/// Iterator over the positions of canonical (strand-independent) mod-sampling
/// minimizers in a DNA sequence.
///
/// Tracks both the forward and reverse-complement strands and selects
/// minimizers from canonical t-mers; yields `(position, is_reverse_complement)`.
pub struct CanonicalModSamplingPosIterator<
    'a,
    T: PrimInt + Hash = u64,
    S: BuildHasher = DefaultHashBuilder,
> {
    // Sequence being scanned (ASCII nucleotides).
    pub(crate) seq: &'a [u8],
    // Sliding-window queue tracking the minimum hashed canonical t-mer.
    pub(crate) queue: ImplicitMinimizerQueue<S>,
    // Minimizer window width, pre-reduced for fast `%`.
    pub(crate) width_m: StrengthReducedU16,
    // Widened t-mer window width: width + (minimizer_size - t).
    pub(crate) width_t: usize,
    // Rolling forward-strand m-mer.
    pub(crate) mmer: T,
    // Rolling reverse-complement m-mer (new bases enter at the top).
    pub(crate) rc_mmer: T,
    // Mask keeping the low 2*minimizer_size bits of `mmer`.
    pub(crate) mmer_mask: T,
    // Mask extracting the low 2*t bits (forward-strand t-mer).
    pub(crate) tmer_mask: T,
    // Shift placing a new base at the top of `rc_mmer`: 2*(minimizer_size - 1).
    pub(crate) rc_mmer_shift: usize,
    // Shift extracting the reverse-complement t-mer from the high bits of
    // `rc_mmer`: 2*(minimizer_size - t).
    pub(crate) rc_tmer_shift: usize,
    // For each m-mer in the window: whether the reverse complement is the
    // canonical (smaller-or-equal) strand.
    pub(crate) is_rc_m: VecDeque<bool>,
    // Byte -> 2-bit code lookup table (forward strand).
    pub(crate) encoding: [u8; 256],
    // Same table with complement bases (A/T, C/G) swapped (reverse strand).
    pub(crate) rc_encoding: [u8; 256],
    // Bases covered by one full window: width_t + t - 1.
    pub(crate) base_width: usize,
    // Most recently yielded (position, is_reverse_complement).
    pub(crate) min_pos: (usize, bool),
    // Exclusive end (in bases) of the window processed so far.
    pub(crate) end: usize,
}
214
215impl<'a, T: PrimInt + Hash, S: BuildHasher> CanonicalModSamplingPosIterator<'a, T, S> {
216 pub fn new(
217 seq: &'a [u8],
218 minimizer_size: usize,
219 width: u16,
220 t: usize,
221 hasher: S,
222 encoding: [u8; 256],
223 ) -> Self {
224 let width_m = StrengthReducedU16::new(width);
225 let width_t = width + (minimizer_size - t) as u16;
226 assert_eq!(
227 width_t % width_m,
228 0,
229 "(minimizer_size - t) must be a multiple of the width to preserve canonical minimizers"
230 );
231 assert_eq!(
232 width % 2,
233 1,
234 "width must be odd to break ties between multiple minimizers"
235 );
236 let queue = ImplicitMinimizerQueue::with_hasher(width_t, hasher);
237 let width_t = width_t as usize;
238 let mut rc_encoding = encoding;
239 rc_encoding.swap(b'A' as usize, b'T' as usize);
240 rc_encoding.swap(b'a' as usize, b't' as usize);
241 rc_encoding.swap(b'C' as usize, b'G' as usize);
242 rc_encoding.swap(b'c' as usize, b'g' as usize);
243 Self {
244 seq,
245 queue,
246 width_m,
247 width_t,
248 mmer: T::zero(),
249 rc_mmer: T::zero(),
250 mmer_mask: (T::one() << (2 * minimizer_size)) - T::one(),
251 tmer_mask: (T::one() << (2 * t)) - T::one(),
252 rc_mmer_shift: 2 * (minimizer_size - 1),
253 rc_tmer_shift: 2 * (minimizer_size - t),
254 is_rc_m: VecDeque::with_capacity(width as usize),
255 encoding,
256 rc_encoding,
257 base_width: width_t + t - 1,
258 end: width_t + t - 1,
259 min_pos: (0, false),
260 }
261 }
262
263 #[inline]
264 fn window_not_canonical(&self) -> bool {
265 let mid = self.is_rc_m.len() / 2;
266 self.is_rc_m[mid]
267 }
268}
269
impl<'a, T: PrimInt + Hash + 'static, S: BuildHasher> Iterator
    for CanonicalModSamplingPosIterator<'a, T, S>
where
    u8: AsPrimitive<T>,
{
    // (start position, is_reverse_complement) of the next sampled minimizer.
    type Item = (usize, bool);

    fn next(&mut self) -> Option<Self::Item> {
        if self.queue.is_empty() {
            // First call: prime the queue and strand flags with the initial window.
            if self.base_width > self.seq.len() {
                // Sequence shorter than one full window: nothing to yield.
                return None;
            }
            let width_m = self.width_m.get() as usize;
            // Roll in the leading bases on both strands; no complete t-mer yet.
            // SAFETY: seq[i] is a u8, so it always indexes the 256-entry tables.
            for i in 0..(self.base_width - self.width_t) {
                self.mmer = ((self.mmer << 2) & self.mmer_mask)
                    | (unsafe { self.encoding.get_unchecked(self.seq[i] as usize) }.as_());
                self.rc_mmer = (self.rc_mmer >> 2)
                    | (unsafe { self.rc_encoding.get_unchecked(self.seq[i] as usize) }.as_()
                        << self.rc_mmer_shift);
            }
            // Complete t-mers appear: feed the queue with canonical t-mers only.
            for i in (self.base_width - self.width_t)..(self.base_width - width_m) {
                self.mmer = (self.mmer << 2)
                    | (unsafe { self.encoding.get_unchecked(self.seq[i] as usize) }.as_());
                self.rc_mmer = (self.rc_mmer >> 2)
                    | (unsafe { self.rc_encoding.get_unchecked(self.seq[i] as usize) }.as_()
                        << self.rc_mmer_shift);
                let tmer = self.mmer & self.tmer_mask;
                // Reverse-complement t-mer lives in the high bits of rc_mmer.
                let rc_tmer = self.rc_mmer >> self.rc_tmer_shift;
                let canonical_tmer = min(tmer, rc_tmer);
                self.queue.insert(&canonical_tmer);
            }
            // Last width_m bases: also record each m-mer's canonical strand.
            for i in (self.base_width - width_m)..self.base_width {
                self.mmer = ((self.mmer << 2) & self.mmer_mask)
                    | (unsafe { self.encoding.get_unchecked(self.seq[i] as usize) }.as_());
                self.rc_mmer = (self.rc_mmer >> 2)
                    | (unsafe { self.rc_encoding.get_unchecked(self.seq[i] as usize) }.as_()
                        << self.rc_mmer_shift);
                let tmer = self.mmer & self.tmer_mask;
                let rc_tmer = self.rc_mmer >> self.rc_tmer_shift;
                let canonical_tmer = min(tmer, rc_tmer);
                self.queue.insert(&canonical_tmer);
                self.is_rc_m.push_back(self.rc_mmer <= self.mmer);
            }
            // On ties between multiple minimal t-mers, pick the occurrence that
            // keeps the choice consistent between a window and its reverse
            // complement, decided by the middle m-mer's strand.
            let pos = if self.queue.multiple_mins() {
                let (pos, tie) = self.queue.get_inner_min_pos();
                tie.map_or(pos, |alt| {
                    if self.window_not_canonical() {
                        alt
                    } else {
                        pos
                    }
                })
            } else {
                self.queue.get_min_pos()
            };
            // Mod-sampling: offset modulo the minimizer window width.
            let pos = (pos as u16 % self.width_m) as usize;
            self.min_pos = (pos, self.is_rc_m[pos]);
        } else {
            // Slide one base at a time until the sampled position changes.
            let mut min_pos = self.min_pos;
            while self.end < self.seq.len() && min_pos.0 == self.min_pos.0 {
                self.mmer = ((self.mmer << 2) & self.mmer_mask)
                    | (unsafe { self.encoding.get_unchecked(self.seq[self.end] as usize) }.as_());
                self.rc_mmer = (self.rc_mmer >> 2)
                    | (unsafe { self.rc_encoding.get_unchecked(self.seq[self.end] as usize) }
                        .as_()
                        << self.rc_mmer_shift);
                let tmer = self.mmer & self.tmer_mask;
                let rc_tmer = self.rc_mmer >> self.rc_tmer_shift;
                let canonical_tmer = min(tmer, rc_tmer);
                self.queue.insert(&canonical_tmer);
                // Keep the strand-flag buffer aligned with the current window.
                self.is_rc_m.pop_front();
                self.is_rc_m.push_back(self.rc_mmer <= self.mmer);
                self.end += 1;
                // Same canonical tie-breaking as in the priming branch.
                let pos = if self.queue.multiple_mins() {
                    let (pos, tie) = self.queue.get_inner_min_pos();
                    tie.map_or(pos, |alt| {
                        if self.window_not_canonical() {
                            alt
                        } else {
                            pos
                        }
                    })
                } else {
                    self.queue.get_min_pos()
                };
                let pos = (pos as u16 % self.width_m) as usize;
                // Absolute position = window start + in-window offset.
                min_pos = (self.end - self.base_width + pos, self.is_rc_m[pos]);
            }
            if min_pos.0 == self.min_pos.0 {
                // Sequence exhausted with no new minimizer.
                return None;
            }
            self.min_pos = min_pos;
        }
        Some(self.min_pos)
    }
}
366
/// Iterator over canonical (strand-independent) mod-sampling minimizers in a
/// DNA sequence.
///
/// Like [`CanonicalModSamplingPosIterator`], but also yields the canonical
/// minimizer itself: `(minimizer, position, is_reverse_complement)`.
pub struct CanonicalModSamplingIterator<
    'a,
    T: PrimInt + Hash = u64,
    S: BuildHasher = DefaultHashBuilder,
> {
    // Sequence being scanned (ASCII nucleotides).
    pub(crate) seq: &'a [u8],
    // Sliding-window queue tracking the minimum hashed canonical t-mer.
    pub(crate) queue: ImplicitMinimizerQueue<S>,
    // Minimizer window width, pre-reduced for fast `%`.
    pub(crate) width_m: StrengthReducedU16,
    // Widened t-mer window width: width + (minimizer_size - t).
    pub(crate) width_t: usize,
    // Rolling forward-strand m-mer.
    pub(crate) mmer: T,
    // Rolling reverse-complement m-mer (new bases enter at the top).
    pub(crate) rc_mmer: T,
    // Mask keeping the low 2*minimizer_size bits of `mmer`.
    pub(crate) mmer_mask: T,
    // Mask extracting the low 2*t bits (forward-strand t-mer).
    pub(crate) tmer_mask: T,
    // Shift placing a new base at the top of `rc_mmer`: 2*(minimizer_size - 1).
    pub(crate) rc_mmer_shift: usize,
    // Shift extracting the reverse-complement t-mer from the high bits of
    // `rc_mmer`: 2*(minimizer_size - t).
    pub(crate) rc_tmer_shift: usize,
    // Buffer of (canonical m-mer, is_reverse_complement) for the `width`
    // candidate positions of the window.
    pub(crate) canon_mmers: VecDeque<(T, bool)>,
    // Byte -> 2-bit code lookup table (forward strand).
    pub(crate) encoding: [u8; 256],
    // Same table with complement bases (A/T, C/G) swapped (reverse strand).
    pub(crate) rc_encoding: [u8; 256],
    // Bases covered by one full window: width_t + t - 1.
    pub(crate) base_width: usize,
    // Most recently yielded (minimizer, position, is_reverse_complement).
    pub(crate) min_pos: (T, usize, bool),
    // Exclusive end (in bases) of the window processed so far.
    pub(crate) end: usize,
}
391
392impl<'a, T: PrimInt + Hash, S: BuildHasher> CanonicalModSamplingIterator<'a, T, S> {
393 pub fn new(
394 seq: &'a [u8],
395 minimizer_size: usize,
396 width: u16,
397 t: usize,
398 hasher: S,
399 encoding: [u8; 256],
400 ) -> Self {
401 let width_m = StrengthReducedU16::new(width);
402 let width_t = width + (minimizer_size - t) as u16;
403 assert_eq!(
404 width_t % width_m,
405 0,
406 "(minimizer_size - t) must be a multiple of the width to preserve canonical minimizers"
407 );
408 assert_eq!(
409 width % 2,
410 1,
411 "width must be odd to break ties between multiple minimizers"
412 );
413 let queue = ImplicitMinimizerQueue::with_hasher(width_t, hasher);
414 let width_t = width_t as usize;
415 let mut rc_encoding = encoding;
416 rc_encoding.swap(b'A' as usize, b'T' as usize);
417 rc_encoding.swap(b'a' as usize, b't' as usize);
418 rc_encoding.swap(b'C' as usize, b'G' as usize);
419 rc_encoding.swap(b'c' as usize, b'g' as usize);
420 Self {
421 seq,
422 queue,
423 width_m,
424 width_t,
425 mmer: T::zero(),
426 rc_mmer: T::zero(),
427 mmer_mask: (T::one() << (2 * minimizer_size)) - T::one(),
428 tmer_mask: (T::one() << (2 * t)) - T::one(),
429 rc_mmer_shift: 2 * (minimizer_size - 1),
430 rc_tmer_shift: 2 * (minimizer_size - t),
431 canon_mmers: VecDeque::with_capacity(width as usize),
432 encoding,
433 rc_encoding,
434 base_width: width_t + t - 1,
435 end: width_t + t - 1,
436 min_pos: (T::zero(), 0, false),
437 }
438 }
439
440 #[inline]
441 fn window_not_canonical(&self) -> bool {
442 let mid = self.canon_mmers.len() / 2;
443 self.canon_mmers[mid].1
444 }
445}
446
impl<'a, T: PrimInt + Hash + 'static, S: BuildHasher> Iterator
    for CanonicalModSamplingIterator<'a, T, S>
where
    u8: AsPrimitive<T>,
{
    // (canonical minimizer, start position, is_reverse_complement).
    type Item = (T, usize, bool);

    fn next(&mut self) -> Option<Self::Item> {
        if self.queue.is_empty() {
            // First call: prime the queue and m-mer buffer with the initial window.
            if self.base_width > self.seq.len() {
                // Sequence shorter than one full window: nothing to yield.
                return None;
            }
            let width_m = self.width_m.get() as usize;
            // Roll in the leading bases on both strands; no complete t-mer yet.
            // SAFETY: seq[i] is a u8, so it always indexes the 256-entry tables.
            for i in 0..(self.base_width - self.width_t) {
                self.mmer = ((self.mmer << 2) & self.mmer_mask)
                    | (unsafe { self.encoding.get_unchecked(self.seq[i] as usize) }.as_());
                self.rc_mmer = (self.rc_mmer >> 2)
                    | (unsafe { self.rc_encoding.get_unchecked(self.seq[i] as usize) }.as_()
                        << self.rc_mmer_shift);
            }
            // Complete t-mers appear: feed the queue with canonical t-mers only.
            for i in (self.base_width - self.width_t)..(self.base_width - width_m) {
                self.mmer = (self.mmer << 2)
                    | (unsafe { self.encoding.get_unchecked(self.seq[i] as usize) }.as_());
                self.rc_mmer = (self.rc_mmer >> 2)
                    | (unsafe { self.rc_encoding.get_unchecked(self.seq[i] as usize) }.as_()
                        << self.rc_mmer_shift);
                let tmer = self.mmer & self.tmer_mask;
                // Reverse-complement t-mer lives in the high bits of rc_mmer.
                let rc_tmer = self.rc_mmer >> self.rc_tmer_shift;
                let canonical_tmer = min(tmer, rc_tmer);
                self.queue.insert(&canonical_tmer);
            }
            // Last width_m bases: also buffer each canonical m-mer and its strand.
            for i in (self.base_width - width_m)..self.base_width {
                self.mmer = ((self.mmer << 2) & self.mmer_mask)
                    | (unsafe { self.encoding.get_unchecked(self.seq[i] as usize) }.as_());
                self.rc_mmer = (self.rc_mmer >> 2)
                    | (unsafe { self.rc_encoding.get_unchecked(self.seq[i] as usize) }.as_()
                        << self.rc_mmer_shift);
                let tmer = self.mmer & self.tmer_mask;
                let rc_tmer = self.rc_mmer >> self.rc_tmer_shift;
                let canonical_tmer = min(tmer, rc_tmer);
                self.queue.insert(&canonical_tmer);
                let canonical_mmer = min(self.mmer, self.rc_mmer);
                self.canon_mmers
                    .push_back((canonical_mmer, canonical_mmer == self.rc_mmer));
            }
            // On ties between multiple minimal t-mers, pick the occurrence that
            // keeps the choice consistent between a window and its reverse
            // complement, decided by the middle m-mer's strand.
            let pos = if self.queue.multiple_mins() {
                let (pos, tie) = self.queue.get_inner_min_pos();
                tie.map_or(pos, |alt| {
                    if self.window_not_canonical() {
                        alt
                    } else {
                        pos
                    }
                })
            } else {
                self.queue.get_min_pos()
            };
            // Mod-sampling: offset modulo the minimizer window width.
            let pos = (pos as u16 % self.width_m) as usize;
            let (mmer, rc) = self.canon_mmers[pos];
            self.min_pos = (mmer, pos, rc);
        } else {
            // Slide one base at a time until the sampled position changes.
            let mut min_pos = self.min_pos;
            while self.end < self.seq.len() && min_pos.1 == self.min_pos.1 {
                self.mmer = ((self.mmer << 2) & self.mmer_mask)
                    | (unsafe { self.encoding.get_unchecked(self.seq[self.end] as usize) }.as_());
                self.rc_mmer = (self.rc_mmer >> 2)
                    | (unsafe { self.rc_encoding.get_unchecked(self.seq[self.end] as usize) }
                        .as_()
                        << self.rc_mmer_shift);
                let tmer = self.mmer & self.tmer_mask;
                let rc_tmer = self.rc_mmer >> self.rc_tmer_shift;
                let canonical_tmer = min(tmer, rc_tmer);
                self.queue.insert(&canonical_tmer);
                let canonical_mmer = min(self.mmer, self.rc_mmer);
                // Keep the m-mer buffer aligned with the current window.
                self.canon_mmers.pop_front();
                self.canon_mmers
                    .push_back((canonical_mmer, canonical_mmer == self.rc_mmer));
                self.end += 1;
                // Same canonical tie-breaking as in the priming branch.
                let pos = if self.queue.multiple_mins() {
                    let (pos, tie) = self.queue.get_inner_min_pos();
                    tie.map_or(pos, |alt| {
                        if self.window_not_canonical() {
                            alt
                        } else {
                            pos
                        }
                    })
                } else {
                    self.queue.get_min_pos()
                };
                let pos = (pos as u16 % self.width_m) as usize;
                let (mmer, rc) = self.canon_mmers[pos];
                // Absolute position = window start + in-window offset.
                min_pos = (mmer, self.end - self.base_width + pos, rc);
            }
            if min_pos.1 == self.min_pos.1 {
                // Sequence exhausted with no new minimizer.
                return None;
            }
            self.min_pos = min_pos;
        }
        Some(self.min_pos)
    }
}