minimizer_iter/iterator/
minimizer.rs

1use core::cmp::min;
2use core::hash::{BuildHasher, Hash};
3use minimizer_queue::{DefaultHashBuilder, ImplicitMinimizerQueue, MinimizerQueue};
4use num_traits::{AsPrimitive, PrimInt};
5use std::collections::VecDeque;
6
7/// An iterator over the positions of the minimizers of a sequence.
8pub struct MinimizerPosIterator<'a, T: PrimInt + Hash = u64, S: BuildHasher = DefaultHashBuilder> {
9    pub(crate) seq: &'a [u8],
10    pub(crate) queue: ImplicitMinimizerQueue<S>,
11    pub(crate) width: usize,
12    pub(crate) mmer: T,
13    pub(crate) mmer_mask: T,
14    pub(crate) encoding: [u8; 256],
15    pub(crate) base_width: usize,
16    pub(crate) min_pos: usize,
17    pub(crate) end: usize,
18}
19
20impl<'a, T: PrimInt + Hash, S: BuildHasher> MinimizerPosIterator<'a, T, S> {
21    pub fn new(
22        seq: &'a [u8],
23        minimizer_size: usize,
24        width: u16,
25        hasher: S,
26        encoding: [u8; 256],
27    ) -> Self {
28        let queue = ImplicitMinimizerQueue::with_hasher(width, hasher);
29        let width = width as usize;
30        Self {
31            seq,
32            queue,
33            width,
34            mmer: T::zero(),
35            mmer_mask: (T::one() << (2 * minimizer_size)) - T::one(),
36            encoding,
37            base_width: width + minimizer_size - 1,
38            end: width + minimizer_size - 1,
39            min_pos: 0,
40        }
41    }
42}
43
44impl<'a, T: PrimInt + Hash + 'static, S: BuildHasher> Iterator for MinimizerPosIterator<'a, T, S>
45where
46    u8: AsPrimitive<T>,
47{
48    type Item = usize;
49
50    fn next(&mut self) -> Option<Self::Item> {
51        if self.queue.is_empty() {
52            if self.base_width > self.seq.len() {
53                return None;
54            }
55            for i in 0..(self.base_width - self.width) {
56                self.mmer = (self.mmer << 2)
57                    | (unsafe { self.encoding.get_unchecked(self.seq[i] as usize) }.as_());
58            }
59            for i in (self.base_width - self.width)..self.base_width {
60                self.mmer = ((self.mmer << 2) & self.mmer_mask)
61                    | (unsafe { self.encoding.get_unchecked(self.seq[i] as usize) }.as_());
62                self.queue.insert(&self.mmer);
63            }
64            self.min_pos = self.queue.get_min_pos();
65        } else {
66            let mut min_pos = self.min_pos;
67            while self.end < self.seq.len() && min_pos == self.min_pos {
68                self.mmer = ((self.mmer << 2) & self.mmer_mask)
69                    | (unsafe { self.encoding.get_unchecked(self.seq[self.end] as usize) }.as_());
70                self.queue.insert(&self.mmer);
71                self.end += 1;
72                min_pos = self.end - self.base_width + self.queue.get_min_pos();
73            }
74            if min_pos == self.min_pos {
75                return None;
76            }
77            self.min_pos = min_pos;
78        }
79        Some(self.min_pos)
80    }
81}
82
83/// An iterator over the minimizers of a sequence and their positions.
84pub struct MinimizerIterator<'a, T: PrimInt + Hash = u64, S: BuildHasher = DefaultHashBuilder> {
85    pub(crate) seq: &'a [u8],
86    pub(crate) queue: MinimizerQueue<T, S>,
87    pub(crate) width: usize,
88    pub(crate) mmer: T,
89    pub(crate) mmer_mask: T,
90    pub(crate) encoding: [u8; 256],
91    pub(crate) base_width: usize,
92    pub(crate) min_pos: (T, usize),
93    pub(crate) end: usize,
94}
95
96impl<'a, T: PrimInt + Hash, S: BuildHasher> MinimizerIterator<'a, T, S> {
97    pub fn new(
98        seq: &'a [u8],
99        minimizer_size: usize,
100        width: u16,
101        hasher: S,
102        encoding: [u8; 256],
103    ) -> Self {
104        let queue = MinimizerQueue::with_hasher(width, hasher);
105        let width = width as usize;
106        Self {
107            seq,
108            queue,
109            width,
110            mmer: T::zero(),
111            mmer_mask: (T::one() << (2 * minimizer_size)) - T::one(),
112            encoding,
113            base_width: width + minimizer_size - 1,
114            end: width + minimizer_size - 1,
115            min_pos: (T::zero(), 0),
116        }
117    }
118}
119
120impl<'a, T: PrimInt + Hash + 'static, S: BuildHasher> Iterator for MinimizerIterator<'a, T, S>
121where
122    u8: AsPrimitive<T>,
123{
124    type Item = (T, usize);
125
126    fn next(&mut self) -> Option<Self::Item> {
127        if self.queue.is_empty() {
128            if self.base_width > self.seq.len() {
129                return None;
130            }
131            for i in 0..(self.base_width - self.width) {
132                self.mmer = (self.mmer << 2)
133                    | (unsafe { self.encoding.get_unchecked(self.seq[i] as usize) }.as_());
134            }
135            for i in (self.base_width - self.width)..self.base_width {
136                self.mmer = ((self.mmer << 2) & self.mmer_mask)
137                    | (unsafe { self.encoding.get_unchecked(self.seq[i] as usize) }.as_());
138                self.queue.insert(self.mmer);
139            }
140            self.min_pos = self.queue.get_min_pos();
141        } else {
142            let mut min_pos = self.min_pos;
143            while self.end < self.seq.len() && min_pos.1 == self.min_pos.1 {
144                self.mmer = ((self.mmer << 2) & self.mmer_mask)
145                    | (unsafe { self.encoding.get_unchecked(self.seq[self.end] as usize) }.as_());
146                self.queue.insert(self.mmer);
147                self.end += 1;
148                let _min_pos = self.queue.get_min_pos();
149                min_pos = (_min_pos.0, self.end - self.base_width + _min_pos.1);
150            }
151            if min_pos.1 == self.min_pos.1 {
152                return None;
153            }
154            self.min_pos = min_pos;
155        }
156        Some(self.min_pos)
157    }
158}
159
160/// An iterator over the positions of the canonical minimizers of a sequence with a boolean indicating a reverse complement.
161/// It requires an odd width to break ties between multiple minimizers.
162pub struct CanonicalMinimizerPosIterator<
163    'a,
164    T: PrimInt + Hash = u64,
165    S: BuildHasher = DefaultHashBuilder,
166> {
167    pub(crate) seq: &'a [u8],
168    pub(crate) queue: ImplicitMinimizerQueue<S>,
169    pub(crate) width: usize,
170    pub(crate) mmer: T,
171    pub(crate) rc_mmer: T,
172    pub(crate) mmer_mask: T,
173    pub(crate) rc_mmer_shift: usize,
174    pub(crate) is_rc: VecDeque<bool>,
175    pub(crate) encoding: [u8; 256],
176    pub(crate) rc_encoding: [u8; 256],
177    pub(crate) base_width: usize,
178    pub(crate) min_pos: (usize, bool),
179    pub(crate) end: usize,
180}
181
182impl<'a, T: PrimInt + Hash, S: BuildHasher> CanonicalMinimizerPosIterator<'a, T, S> {
183    pub fn new(
184        seq: &'a [u8],
185        minimizer_size: usize,
186        width: u16,
187        hasher: S,
188        encoding: [u8; 256],
189    ) -> Self {
190        let queue = ImplicitMinimizerQueue::with_hasher(width, hasher);
191        let width = width as usize;
192        assert_eq!(
193            width % 2,
194            1,
195            "width must be odd to break ties between multiple minimizers"
196        );
197        let mut rc_encoding = encoding;
198        rc_encoding.swap(b'A' as usize, b'T' as usize);
199        rc_encoding.swap(b'a' as usize, b't' as usize);
200        rc_encoding.swap(b'C' as usize, b'G' as usize);
201        rc_encoding.swap(b'c' as usize, b'g' as usize);
202        Self {
203            seq,
204            queue,
205            width,
206            mmer: T::zero(),
207            rc_mmer: T::zero(),
208            mmer_mask: (T::one() << (2 * minimizer_size)) - T::one(),
209            rc_mmer_shift: 2 * (minimizer_size - 1),
210            is_rc: VecDeque::with_capacity(width),
211            encoding,
212            rc_encoding,
213            base_width: width + minimizer_size - 1,
214            end: width + minimizer_size - 1,
215            min_pos: (0, false),
216        }
217    }
218
219    #[inline]
220    fn window_not_canonical(&self) -> bool {
221        self.is_rc[self.width / 2]
222    }
223}
224
225impl<'a, T: PrimInt + Hash + 'static, S: BuildHasher> Iterator
226    for CanonicalMinimizerPosIterator<'a, T, S>
227where
228    u8: AsPrimitive<T>,
229{
230    type Item = (usize, bool);
231
232    fn next(&mut self) -> Option<Self::Item> {
233        if self.queue.is_empty() {
234            if self.base_width > self.seq.len() {
235                return None;
236            }
237            for i in 0..(self.base_width - self.width) {
238                self.mmer = (self.mmer << 2)
239                    | (unsafe { self.encoding.get_unchecked(self.seq[i] as usize) }.as_());
240                self.rc_mmer = (self.rc_mmer >> 2)
241                    | (unsafe { self.rc_encoding.get_unchecked(self.seq[i] as usize) }.as_()
242                        << self.rc_mmer_shift);
243            }
244            for i in (self.base_width - self.width)..self.base_width {
245                self.mmer = ((self.mmer << 2) & self.mmer_mask)
246                    | (unsafe { self.encoding.get_unchecked(self.seq[i] as usize) }.as_());
247                self.rc_mmer = (self.rc_mmer >> 2)
248                    | (unsafe { self.rc_encoding.get_unchecked(self.seq[i] as usize) }.as_()
249                        << self.rc_mmer_shift);
250                let canonical_mmer = min(self.mmer, self.rc_mmer);
251                self.queue.insert(&canonical_mmer);
252                self.is_rc.push_back(canonical_mmer == self.rc_mmer);
253            }
254            let pos = if self.queue.multiple_mins() {
255                let (pos, tie) = self.queue.get_inner_min_pos();
256                tie.map_or(pos, |alt| {
257                    if self.window_not_canonical() {
258                        alt
259                    } else {
260                        pos
261                    }
262                })
263            } else {
264                self.queue.get_min_pos()
265            };
266            self.min_pos = (pos, self.is_rc[pos])
267        } else {
268            let mut min_pos = self.min_pos;
269            while self.end < self.seq.len() && min_pos == self.min_pos {
270                self.mmer = ((self.mmer << 2) & self.mmer_mask)
271                    | (unsafe { self.encoding.get_unchecked(self.seq[self.end] as usize) }.as_());
272                self.rc_mmer = (self.rc_mmer >> 2)
273                    | (unsafe { self.rc_encoding.get_unchecked(self.seq[self.end] as usize) }
274                        .as_()
275                        << self.rc_mmer_shift);
276                let canonical_mmer = min(self.mmer, self.rc_mmer);
277                self.queue.insert(&canonical_mmer);
278                self.is_rc.pop_front();
279                self.is_rc.push_back(canonical_mmer == self.rc_mmer);
280                self.end += 1;
281                let pos = if self.queue.multiple_mins() {
282                    let (pos, tie) = self.queue.get_inner_min_pos();
283                    tie.map_or(pos, |alt| {
284                        if self.window_not_canonical() {
285                            alt
286                        } else {
287                            pos
288                        }
289                    })
290                } else {
291                    self.queue.get_min_pos()
292                };
293                min_pos = (self.end - self.base_width + pos, self.is_rc[pos]);
294            }
295            if min_pos == self.min_pos {
296                return None;
297            }
298            self.min_pos = min_pos;
299        }
300        Some(self.min_pos)
301    }
302}
303
304/// An iterator over the canonical minimizers of a sequence and their positions with a boolean indicating a reverse complement.
305/// It requires an odd width to break ties between multiple minimizers.
306pub struct CanonicalMinimizerIterator<
307    'a,
308    T: PrimInt + Hash = u64,
309    S: BuildHasher = DefaultHashBuilder,
310> {
311    pub(crate) seq: &'a [u8],
312    pub(crate) queue: MinimizerQueue<T, S>,
313    pub(crate) width: usize,
314    pub(crate) mmer: T,
315    pub(crate) rc_mmer: T,
316    pub(crate) mmer_mask: T,
317    pub(crate) rc_mmer_shift: usize,
318    pub(crate) is_rc: VecDeque<bool>,
319    pub(crate) encoding: [u8; 256],
320    pub(crate) rc_encoding: [u8; 256],
321    pub(crate) base_width: usize,
322    pub(crate) min_pos: (T, usize, bool),
323    pub(crate) end: usize,
324}
325
326impl<'a, T: PrimInt + Hash, S: BuildHasher> CanonicalMinimizerIterator<'a, T, S> {
327    pub fn new(
328        seq: &'a [u8],
329        minimizer_size: usize,
330        width: u16,
331        hasher: S,
332        encoding: [u8; 256],
333    ) -> Self {
334        let queue = MinimizerQueue::with_hasher(width, hasher);
335        let width = width as usize;
336        assert_eq!(
337            width % 2,
338            1,
339            "width must be odd to break ties between multiple minimizers"
340        );
341        let mut rc_encoding = encoding;
342        rc_encoding.swap(b'A' as usize, b'T' as usize);
343        rc_encoding.swap(b'a' as usize, b't' as usize);
344        rc_encoding.swap(b'C' as usize, b'G' as usize);
345        rc_encoding.swap(b'c' as usize, b'g' as usize);
346        Self {
347            seq,
348            queue,
349            width,
350            mmer: T::zero(),
351            rc_mmer: T::zero(),
352            mmer_mask: (T::one() << (2 * minimizer_size)) - T::one(),
353            rc_mmer_shift: 2 * (minimizer_size - 1),
354            is_rc: VecDeque::with_capacity(width),
355            encoding,
356            rc_encoding,
357            base_width: width + minimizer_size - 1,
358            end: width + minimizer_size - 1,
359            min_pos: (T::zero(), 0, false),
360        }
361    }
362
363    #[inline]
364    fn window_not_canonical(&self) -> bool {
365        self.is_rc[self.width / 2]
366    }
367}
368
369impl<'a, T: PrimInt + Hash + 'static, S: BuildHasher> Iterator
370    for CanonicalMinimizerIterator<'a, T, S>
371where
372    u8: AsPrimitive<T>,
373{
374    type Item = (T, usize, bool);
375
376    fn next(&mut self) -> Option<Self::Item> {
377        if self.queue.is_empty() {
378            if self.base_width > self.seq.len() {
379                return None;
380            }
381            for i in 0..(self.base_width - self.width) {
382                self.mmer = (self.mmer << 2)
383                    | (unsafe { self.encoding.get_unchecked(self.seq[i] as usize) }.as_());
384                self.rc_mmer = (self.rc_mmer >> 2)
385                    | (unsafe { self.rc_encoding.get_unchecked(self.seq[i] as usize) }.as_()
386                        << self.rc_mmer_shift);
387            }
388            for i in (self.base_width - self.width)..self.base_width {
389                self.mmer = ((self.mmer << 2) & self.mmer_mask)
390                    | (unsafe { self.encoding.get_unchecked(self.seq[i] as usize) }.as_());
391                self.rc_mmer = (self.rc_mmer >> 2)
392                    | (unsafe { self.rc_encoding.get_unchecked(self.seq[i] as usize) }.as_()
393                        << self.rc_mmer_shift);
394                let canonical_mmer = min(self.mmer, self.rc_mmer);
395                self.queue.insert(canonical_mmer);
396                self.is_rc.push_back(canonical_mmer == self.rc_mmer);
397            }
398            let _min_pos = if self.queue.multiple_mins() {
399                let (x, pos, tie) = self.queue.get_inner_min_pos();
400                tie.map_or((x, pos), |alt| {
401                    if self.window_not_canonical() {
402                        alt
403                    } else {
404                        (x, pos)
405                    }
406                })
407            } else {
408                self.queue.get_min_pos()
409            };
410            self.min_pos = (_min_pos.0, _min_pos.1, self.is_rc[_min_pos.1]);
411        } else {
412            let mut min_pos = self.min_pos;
413            while self.end < self.seq.len() && min_pos.1 == self.min_pos.1 {
414                self.mmer = ((self.mmer << 2) & self.mmer_mask)
415                    | (unsafe { self.encoding.get_unchecked(self.seq[self.end] as usize) }.as_());
416                self.rc_mmer = (self.rc_mmer >> 2)
417                    | (unsafe { self.rc_encoding.get_unchecked(self.seq[self.end] as usize) }
418                        .as_()
419                        << self.rc_mmer_shift);
420                let canonical_mmer = min(self.mmer, self.rc_mmer);
421                self.queue.insert(canonical_mmer);
422                self.is_rc.pop_front();
423                self.is_rc.push_back(canonical_mmer == self.rc_mmer);
424                self.end += 1;
425                let _min_pos = if self.queue.multiple_mins() {
426                    let (x, pos, tie) = self.queue.get_inner_min_pos();
427                    tie.map_or((x, pos), |alt| {
428                        if self.window_not_canonical() {
429                            alt
430                        } else {
431                            (x, pos)
432                        }
433                    })
434                } else {
435                    self.queue.get_min_pos()
436                };
437                min_pos = (
438                    _min_pos.0,
439                    self.end - self.base_width + _min_pos.1,
440                    self.is_rc[_min_pos.1],
441                );
442            }
443            if min_pos.1 == self.min_pos.1 {
444                return None;
445            }
446            self.min_pos = min_pos;
447        }
448        Some(self.min_pos)
449    }
450}