minimizer_iter/iterator/
mod_sampling.rs

1use core::cmp::min;
2use core::hash::{BuildHasher, Hash};
3use minimizer_queue::{DefaultHashBuilder, ImplicitMinimizerQueue};
4use num_traits::{AsPrimitive, PrimInt};
5use std::collections::VecDeque;
6use strength_reduce::StrengthReducedU16;
7
8/// An iterator over the positions of the mod-sampling minimizers of a sequence.
9pub struct ModSamplingPosIterator<'a, T: PrimInt + Hash = u64, S: BuildHasher = DefaultHashBuilder>
10{
11    pub(crate) seq: &'a [u8],
12    pub(crate) queue: ImplicitMinimizerQueue<S>,
13    pub(crate) width_m: StrengthReducedU16,
14    pub(crate) width_t: usize,
15    pub(crate) tmer: T,
16    pub(crate) tmer_mask: T,
17    pub(crate) encoding: [u8; 256],
18    pub(crate) base_width: usize,
19    pub(crate) min_pos: usize,
20    pub(crate) end: usize,
21}
22
23impl<'a, T: PrimInt + Hash, S: BuildHasher> ModSamplingPosIterator<'a, T, S> {
24    pub fn new(
25        seq: &'a [u8],
26        minimizer_size: usize,
27        width: u16,
28        t: usize,
29        hasher: S,
30        encoding: [u8; 256],
31    ) -> Self {
32        let width_m = StrengthReducedU16::new(width);
33        let width_t = width + (minimizer_size - t) as u16;
34        let queue = ImplicitMinimizerQueue::with_hasher(width_t, hasher);
35        let width_t = width_t as usize;
36        Self {
37            seq,
38            queue,
39            width_m,
40            width_t,
41            tmer: T::zero(),
42            tmer_mask: (T::one() << (2 * t)) - T::one(),
43            encoding,
44            base_width: width_t + t - 1,
45            end: width_t + t - 1,
46            min_pos: 0,
47        }
48    }
49}
50
51impl<'a, T: PrimInt + Hash + 'static, S: BuildHasher> Iterator for ModSamplingPosIterator<'a, T, S>
52where
53    u8: AsPrimitive<T>,
54{
55    type Item = usize;
56
57    fn next(&mut self) -> Option<Self::Item> {
58        if self.queue.is_empty() {
59            if self.base_width > self.seq.len() {
60                return None;
61            }
62            for i in 0..(self.base_width - self.width_t) {
63                self.tmer = (self.tmer << 2)
64                    | (unsafe { self.encoding.get_unchecked(self.seq[i] as usize) }.as_());
65            }
66            for i in (self.base_width - self.width_t)..self.base_width {
67                self.tmer = ((self.tmer << 2) & self.tmer_mask)
68                    | (unsafe { self.encoding.get_unchecked(self.seq[i] as usize) }.as_());
69                self.queue.insert(&self.tmer);
70            }
71            self.min_pos = (self.queue.get_min_pos() as u16 % self.width_m) as usize;
72        } else {
73            let mut min_pos = self.min_pos;
74            while self.end < self.seq.len() && min_pos == self.min_pos {
75                self.tmer = ((self.tmer << 2) & self.tmer_mask)
76                    | (unsafe { self.encoding.get_unchecked(self.seq[self.end] as usize) }.as_());
77                self.queue.insert(&self.tmer);
78                self.end += 1;
79                min_pos = self.end - self.base_width
80                    + (self.queue.get_min_pos() as u16 % self.width_m) as usize;
81            }
82            if min_pos == self.min_pos {
83                return None;
84            }
85            self.min_pos = min_pos;
86        }
87        Some(self.min_pos)
88    }
89}
90
91/// An iterator over the mod-sampling minimizers of a sequence and their positions.
92pub struct ModSamplingIterator<'a, T: PrimInt + Hash = u64, S: BuildHasher = DefaultHashBuilder> {
93    pub(crate) seq: &'a [u8],
94    pub(crate) queue: ImplicitMinimizerQueue<S>,
95    pub(crate) width_m: StrengthReducedU16,
96    pub(crate) width_t: usize,
97    pub(crate) mmer: T,
98    pub(crate) mmer_mask: T,
99    pub(crate) tmer_mask: T,
100    pub(crate) canon_mmers: VecDeque<T>,
101    pub(crate) encoding: [u8; 256],
102    pub(crate) base_width: usize,
103    pub(crate) min_pos: (T, usize),
104    pub(crate) end: usize,
105}
106
107impl<'a, T: PrimInt + Hash, S: BuildHasher> ModSamplingIterator<'a, T, S> {
108    pub fn new(
109        seq: &'a [u8],
110        minimizer_size: usize,
111        width: u16,
112        t: usize,
113        hasher: S,
114        encoding: [u8; 256],
115    ) -> Self {
116        let width_m = StrengthReducedU16::new(width);
117        let width_t = width + (minimizer_size - t) as u16;
118        let queue = ImplicitMinimizerQueue::with_hasher(width_t, hasher);
119        let width_t = width_t as usize;
120        Self {
121            seq,
122            queue,
123            width_m,
124            width_t,
125            mmer: T::zero(),
126            mmer_mask: (T::one() << (2 * minimizer_size)) - T::one(),
127            tmer_mask: (T::one() << (2 * t)) - T::one(),
128            canon_mmers: VecDeque::with_capacity(width as usize),
129            encoding,
130            base_width: width_t + t - 1,
131            end: width_t + t - 1,
132            min_pos: (T::zero(), 0),
133        }
134    }
135}
136
137impl<'a, T: PrimInt + Hash + 'static, S: BuildHasher> Iterator for ModSamplingIterator<'a, T, S>
138where
139    u8: AsPrimitive<T>,
140{
141    type Item = (T, usize);
142
143    fn next(&mut self) -> Option<Self::Item> {
144        if self.queue.is_empty() {
145            if self.base_width > self.seq.len() {
146                return None;
147            }
148            let width_m = self.width_m.get() as usize;
149            for i in 0..(self.base_width - self.width_t) {
150                self.mmer = (self.mmer << 2)
151                    | (unsafe { self.encoding.get_unchecked(self.seq[i] as usize) }.as_());
152            }
153            for i in (self.base_width - self.width_t)..(self.base_width - width_m) {
154                self.mmer = (self.mmer << 2)
155                    | (unsafe { self.encoding.get_unchecked(self.seq[i] as usize) }.as_());
156                self.queue.insert(&(self.mmer & self.tmer_mask));
157            }
158            for i in (self.base_width - width_m)..self.base_width {
159                self.mmer = ((self.mmer << 2) & self.mmer_mask)
160                    | (unsafe { self.encoding.get_unchecked(self.seq[i] as usize) }.as_());
161                self.queue.insert(&(self.mmer & self.tmer_mask));
162                self.canon_mmers.push_back(self.mmer);
163            }
164            let _min_pos = (self.queue.get_min_pos() as u16 % self.width_m) as usize;
165            self.min_pos = (self.canon_mmers[_min_pos], _min_pos);
166        } else {
167            let mut min_pos = self.min_pos;
168            while self.end < self.seq.len() && min_pos.1 == self.min_pos.1 {
169                self.mmer = ((self.mmer << 2) & self.mmer_mask)
170                    | (unsafe { self.encoding.get_unchecked(self.seq[self.end] as usize) }.as_());
171                self.queue.insert(&(self.mmer & self.tmer_mask));
172                self.canon_mmers.pop_front();
173                self.canon_mmers.push_back(self.mmer);
174                self.end += 1;
175                let _min_pos = (self.queue.get_min_pos() as u16 % self.width_m) as usize;
176                min_pos = (
177                    self.canon_mmers[_min_pos],
178                    self.end - self.base_width + _min_pos,
179                );
180            }
181            if min_pos.1 == self.min_pos.1 {
182                return None;
183            }
184            self.min_pos = min_pos;
185        }
186        Some(self.min_pos)
187    }
188}
189
190/// An iterator over the positions of the canonical mod-sampling minimizers of a sequence with a boolean indicating a reverse complement.
191/// It requires an odd width to break ties between multiple minimizers.
192pub struct CanonicalModSamplingPosIterator<
193    'a,
194    T: PrimInt + Hash = u64,
195    S: BuildHasher = DefaultHashBuilder,
196> {
197    pub(crate) seq: &'a [u8],
198    pub(crate) queue: ImplicitMinimizerQueue<S>,
199    pub(crate) width_m: StrengthReducedU16,
200    pub(crate) width_t: usize,
201    pub(crate) mmer: T,
202    pub(crate) rc_mmer: T,
203    pub(crate) mmer_mask: T,
204    pub(crate) tmer_mask: T,
205    pub(crate) rc_mmer_shift: usize,
206    pub(crate) rc_tmer_shift: usize,
207    pub(crate) is_rc_m: VecDeque<bool>,
208    pub(crate) encoding: [u8; 256],
209    pub(crate) rc_encoding: [u8; 256],
210    pub(crate) base_width: usize,
211    pub(crate) min_pos: (usize, bool),
212    pub(crate) end: usize,
213}
214
215impl<'a, T: PrimInt + Hash, S: BuildHasher> CanonicalModSamplingPosIterator<'a, T, S> {
216    pub fn new(
217        seq: &'a [u8],
218        minimizer_size: usize,
219        width: u16,
220        t: usize,
221        hasher: S,
222        encoding: [u8; 256],
223    ) -> Self {
224        let width_m = StrengthReducedU16::new(width);
225        let width_t = width + (minimizer_size - t) as u16;
226        assert_eq!(
227            width_t % width_m,
228            0,
229            "(minimizer_size - t) must be a multiple of the width to preserve canonical minimizers"
230        );
231        assert_eq!(
232            width % 2,
233            1,
234            "width must be odd to break ties between multiple minimizers"
235        );
236        let queue = ImplicitMinimizerQueue::with_hasher(width_t, hasher);
237        let width_t = width_t as usize;
238        let mut rc_encoding = encoding;
239        rc_encoding.swap(b'A' as usize, b'T' as usize);
240        rc_encoding.swap(b'a' as usize, b't' as usize);
241        rc_encoding.swap(b'C' as usize, b'G' as usize);
242        rc_encoding.swap(b'c' as usize, b'g' as usize);
243        Self {
244            seq,
245            queue,
246            width_m,
247            width_t,
248            mmer: T::zero(),
249            rc_mmer: T::zero(),
250            mmer_mask: (T::one() << (2 * minimizer_size)) - T::one(),
251            tmer_mask: (T::one() << (2 * t)) - T::one(),
252            rc_mmer_shift: 2 * (minimizer_size - 1),
253            rc_tmer_shift: 2 * (minimizer_size - t),
254            is_rc_m: VecDeque::with_capacity(width as usize),
255            encoding,
256            rc_encoding,
257            base_width: width_t + t - 1,
258            end: width_t + t - 1,
259            min_pos: (0, false),
260        }
261    }
262
263    #[inline]
264    fn window_not_canonical(&self) -> bool {
265        let mid = self.is_rc_m.len() / 2;
266        self.is_rc_m[mid]
267    }
268}
269
270impl<'a, T: PrimInt + Hash + 'static, S: BuildHasher> Iterator
271    for CanonicalModSamplingPosIterator<'a, T, S>
272where
273    u8: AsPrimitive<T>,
274{
275    type Item = (usize, bool);
276
277    fn next(&mut self) -> Option<Self::Item> {
278        if self.queue.is_empty() {
279            if self.base_width > self.seq.len() {
280                return None;
281            }
282            let width_m = self.width_m.get() as usize;
283            for i in 0..(self.base_width - self.width_t) {
284                self.mmer = ((self.mmer << 2) & self.mmer_mask)
285                    | (unsafe { self.encoding.get_unchecked(self.seq[i] as usize) }.as_());
286                self.rc_mmer = (self.rc_mmer >> 2)
287                    | (unsafe { self.rc_encoding.get_unchecked(self.seq[i] as usize) }.as_()
288                        << self.rc_mmer_shift);
289            }
290            for i in (self.base_width - self.width_t)..(self.base_width - width_m) {
291                self.mmer = (self.mmer << 2)
292                    | (unsafe { self.encoding.get_unchecked(self.seq[i] as usize) }.as_());
293                self.rc_mmer = (self.rc_mmer >> 2)
294                    | (unsafe { self.rc_encoding.get_unchecked(self.seq[i] as usize) }.as_()
295                        << self.rc_mmer_shift);
296                let tmer = self.mmer & self.tmer_mask;
297                let rc_tmer = self.rc_mmer >> self.rc_tmer_shift;
298                let canonical_tmer = min(tmer, rc_tmer);
299                self.queue.insert(&canonical_tmer);
300            }
301            for i in (self.base_width - width_m)..self.base_width {
302                self.mmer = ((self.mmer << 2) & self.mmer_mask)
303                    | (unsafe { self.encoding.get_unchecked(self.seq[i] as usize) }.as_());
304                self.rc_mmer = (self.rc_mmer >> 2)
305                    | (unsafe { self.rc_encoding.get_unchecked(self.seq[i] as usize) }.as_()
306                        << self.rc_mmer_shift);
307                let tmer = self.mmer & self.tmer_mask;
308                let rc_tmer = self.rc_mmer >> self.rc_tmer_shift;
309                let canonical_tmer = min(tmer, rc_tmer);
310                self.queue.insert(&canonical_tmer);
311                self.is_rc_m.push_back(self.rc_mmer <= self.mmer);
312            }
313            let pos = if self.queue.multiple_mins() {
314                let (pos, tie) = self.queue.get_inner_min_pos();
315                tie.map_or(pos, |alt| {
316                    if self.window_not_canonical() {
317                        alt
318                    } else {
319                        pos
320                    }
321                })
322            } else {
323                self.queue.get_min_pos()
324            };
325            let pos = (pos as u16 % self.width_m) as usize;
326            self.min_pos = (pos, self.is_rc_m[pos]);
327        } else {
328            let mut min_pos = self.min_pos;
329            while self.end < self.seq.len() && min_pos.0 == self.min_pos.0 {
330                self.mmer = ((self.mmer << 2) & self.mmer_mask)
331                    | (unsafe { self.encoding.get_unchecked(self.seq[self.end] as usize) }.as_());
332                self.rc_mmer = (self.rc_mmer >> 2)
333                    | (unsafe { self.rc_encoding.get_unchecked(self.seq[self.end] as usize) }
334                        .as_()
335                        << self.rc_mmer_shift);
336                let tmer = self.mmer & self.tmer_mask;
337                let rc_tmer = self.rc_mmer >> self.rc_tmer_shift;
338                let canonical_tmer = min(tmer, rc_tmer);
339                self.queue.insert(&canonical_tmer);
340                self.is_rc_m.pop_front();
341                self.is_rc_m.push_back(self.rc_mmer <= self.mmer);
342                self.end += 1;
343                let pos = if self.queue.multiple_mins() {
344                    let (pos, tie) = self.queue.get_inner_min_pos();
345                    tie.map_or(pos, |alt| {
346                        if self.window_not_canonical() {
347                            alt
348                        } else {
349                            pos
350                        }
351                    })
352                } else {
353                    self.queue.get_min_pos()
354                };
355                let pos = (pos as u16 % self.width_m) as usize;
356                min_pos = (self.end - self.base_width + pos, self.is_rc_m[pos]);
357            }
358            if min_pos.0 == self.min_pos.0 {
359                return None;
360            }
361            self.min_pos = min_pos;
362        }
363        Some(self.min_pos)
364    }
365}
366
367/// An iterator over the canonical mod-sampling minimizers of a sequence and their positions with a boolean indicating a reverse complement.
368/// It requires an odd width to break ties between multiple minimizers.
369pub struct CanonicalModSamplingIterator<
370    'a,
371    T: PrimInt + Hash = u64,
372    S: BuildHasher = DefaultHashBuilder,
373> {
374    pub(crate) seq: &'a [u8],
375    pub(crate) queue: ImplicitMinimizerQueue<S>,
376    pub(crate) width_m: StrengthReducedU16,
377    pub(crate) width_t: usize,
378    pub(crate) mmer: T,
379    pub(crate) rc_mmer: T,
380    pub(crate) mmer_mask: T,
381    pub(crate) tmer_mask: T,
382    pub(crate) rc_mmer_shift: usize,
383    pub(crate) rc_tmer_shift: usize,
384    pub(crate) canon_mmers: VecDeque<(T, bool)>,
385    pub(crate) encoding: [u8; 256],
386    pub(crate) rc_encoding: [u8; 256],
387    pub(crate) base_width: usize,
388    pub(crate) min_pos: (T, usize, bool),
389    pub(crate) end: usize,
390}
391
392impl<'a, T: PrimInt + Hash, S: BuildHasher> CanonicalModSamplingIterator<'a, T, S> {
393    pub fn new(
394        seq: &'a [u8],
395        minimizer_size: usize,
396        width: u16,
397        t: usize,
398        hasher: S,
399        encoding: [u8; 256],
400    ) -> Self {
401        let width_m = StrengthReducedU16::new(width);
402        let width_t = width + (minimizer_size - t) as u16;
403        assert_eq!(
404            width_t % width_m,
405            0,
406            "(minimizer_size - t) must be a multiple of the width to preserve canonical minimizers"
407        );
408        assert_eq!(
409            width % 2,
410            1,
411            "width must be odd to break ties between multiple minimizers"
412        );
413        let queue = ImplicitMinimizerQueue::with_hasher(width_t, hasher);
414        let width_t = width_t as usize;
415        let mut rc_encoding = encoding;
416        rc_encoding.swap(b'A' as usize, b'T' as usize);
417        rc_encoding.swap(b'a' as usize, b't' as usize);
418        rc_encoding.swap(b'C' as usize, b'G' as usize);
419        rc_encoding.swap(b'c' as usize, b'g' as usize);
420        Self {
421            seq,
422            queue,
423            width_m,
424            width_t,
425            mmer: T::zero(),
426            rc_mmer: T::zero(),
427            mmer_mask: (T::one() << (2 * minimizer_size)) - T::one(),
428            tmer_mask: (T::one() << (2 * t)) - T::one(),
429            rc_mmer_shift: 2 * (minimizer_size - 1),
430            rc_tmer_shift: 2 * (minimizer_size - t),
431            canon_mmers: VecDeque::with_capacity(width as usize),
432            encoding,
433            rc_encoding,
434            base_width: width_t + t - 1,
435            end: width_t + t - 1,
436            min_pos: (T::zero(), 0, false),
437        }
438    }
439
440    #[inline]
441    fn window_not_canonical(&self) -> bool {
442        let mid = self.canon_mmers.len() / 2;
443        self.canon_mmers[mid].1
444    }
445}
446
447impl<'a, T: PrimInt + Hash + 'static, S: BuildHasher> Iterator
448    for CanonicalModSamplingIterator<'a, T, S>
449where
450    u8: AsPrimitive<T>,
451{
452    type Item = (T, usize, bool);
453
454    fn next(&mut self) -> Option<Self::Item> {
455        if self.queue.is_empty() {
456            if self.base_width > self.seq.len() {
457                return None;
458            }
459            let width_m = self.width_m.get() as usize;
460            for i in 0..(self.base_width - self.width_t) {
461                self.mmer = ((self.mmer << 2) & self.mmer_mask)
462                    | (unsafe { self.encoding.get_unchecked(self.seq[i] as usize) }.as_());
463                self.rc_mmer = (self.rc_mmer >> 2)
464                    | (unsafe { self.rc_encoding.get_unchecked(self.seq[i] as usize) }.as_()
465                        << self.rc_mmer_shift);
466            }
467            for i in (self.base_width - self.width_t)..(self.base_width - width_m) {
468                self.mmer = (self.mmer << 2)
469                    | (unsafe { self.encoding.get_unchecked(self.seq[i] as usize) }.as_());
470                self.rc_mmer = (self.rc_mmer >> 2)
471                    | (unsafe { self.rc_encoding.get_unchecked(self.seq[i] as usize) }.as_()
472                        << self.rc_mmer_shift);
473                let tmer = self.mmer & self.tmer_mask;
474                let rc_tmer = self.rc_mmer >> self.rc_tmer_shift;
475                let canonical_tmer = min(tmer, rc_tmer);
476                self.queue.insert(&canonical_tmer);
477            }
478            for i in (self.base_width - width_m)..self.base_width {
479                self.mmer = ((self.mmer << 2) & self.mmer_mask)
480                    | (unsafe { self.encoding.get_unchecked(self.seq[i] as usize) }.as_());
481                self.rc_mmer = (self.rc_mmer >> 2)
482                    | (unsafe { self.rc_encoding.get_unchecked(self.seq[i] as usize) }.as_()
483                        << self.rc_mmer_shift);
484                let tmer = self.mmer & self.tmer_mask;
485                let rc_tmer = self.rc_mmer >> self.rc_tmer_shift;
486                let canonical_tmer = min(tmer, rc_tmer);
487                self.queue.insert(&canonical_tmer);
488                let canonical_mmer = min(self.mmer, self.rc_mmer);
489                self.canon_mmers
490                    .push_back((canonical_mmer, canonical_mmer == self.rc_mmer));
491            }
492            let pos = if self.queue.multiple_mins() {
493                let (pos, tie) = self.queue.get_inner_min_pos();
494                tie.map_or(pos, |alt| {
495                    if self.window_not_canonical() {
496                        alt
497                    } else {
498                        pos
499                    }
500                })
501            } else {
502                self.queue.get_min_pos()
503            };
504            let pos = (pos as u16 % self.width_m) as usize;
505            let (mmer, rc) = self.canon_mmers[pos];
506            self.min_pos = (mmer, pos, rc);
507        } else {
508            let mut min_pos = self.min_pos;
509            while self.end < self.seq.len() && min_pos.1 == self.min_pos.1 {
510                self.mmer = ((self.mmer << 2) & self.mmer_mask)
511                    | (unsafe { self.encoding.get_unchecked(self.seq[self.end] as usize) }.as_());
512                self.rc_mmer = (self.rc_mmer >> 2)
513                    | (unsafe { self.rc_encoding.get_unchecked(self.seq[self.end] as usize) }
514                        .as_()
515                        << self.rc_mmer_shift);
516                let tmer = self.mmer & self.tmer_mask;
517                let rc_tmer = self.rc_mmer >> self.rc_tmer_shift;
518                let canonical_tmer = min(tmer, rc_tmer);
519                self.queue.insert(&canonical_tmer);
520                let canonical_mmer = min(self.mmer, self.rc_mmer);
521                self.canon_mmers.pop_front();
522                self.canon_mmers
523                    .push_back((canonical_mmer, canonical_mmer == self.rc_mmer));
524                self.end += 1;
525                let pos = if self.queue.multiple_mins() {
526                    let (pos, tie) = self.queue.get_inner_min_pos();
527                    tie.map_or(pos, |alt| {
528                        if self.window_not_canonical() {
529                            alt
530                        } else {
531                            pos
532                        }
533                    })
534                } else {
535                    self.queue.get_min_pos()
536                };
537                let pos = (pos as u16 % self.width_m) as usize;
538                let (mmer, rc) = self.canon_mmers[pos];
539                min_pos = (mmer, self.end - self.base_width + pos, rc);
540            }
541            if min_pos.1 == self.min_pos.1 {
542                return None;
543            }
544            self.min_pos = min_pos;
545        }
546        Some(self.min_pos)
547    }
548}