minimizer_iter/
builder.rs

1use crate::algorithm::{Minimizer, MinimizerAlgorithm, ModMinimizer};
2use crate::iterator::*;
3use core::hash::{BuildHasher, Hash};
4use core::marker::PhantomData;
5use minimizer_queue::DefaultHashBuilder;
6use num_traits::PrimInt;
7
/// A builder for iterators over minimizers.
///
/// The builder is configured through chained setters and consumed by `iter` /
/// `iter_pos`, whose concrete iterator type depends on the type parameters:
///
/// - `T`: integer type of the builder; `minimizer_size` may be at most half the
///   number of bits of `T`.
/// - `A`: algorithm marker ([`Minimizer`] or [`ModMinimizer`]).
/// - `S`: hasher builder used to compute minimizers.
/// - `CANONICAL`: when `true`, `iter` / `iter_pos` build the canonical iterators.
///
/// # Examples
///
/// ```
/// use minimizer_iter::MinimizerBuilder;
///
/// // Build an iterator over minimizers
/// // of size 3 with a window of size 4
/// // for the sequence "TGATTGCACAATC"
/// let min_iter = MinimizerBuilder::<u64>::new()
///     .minimizer_size(3)
///     .width(4)
///     .iter(b"TGATTGCACAATC");
///
/// for (minimizer, position) in min_iter {
///     // ...
/// }
/// ```
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct MinimizerBuilder<
    T: PrimInt = u64,
    A: MinimizerAlgorithm = Minimizer,
    S: BuildHasher = DefaultHashBuilder,
    const CANONICAL: bool = false,
> {
    /// Size of the minimizers (21 by default).
    minimizer_size: usize,
    /// Width of the window (11 by default).
    width: u16,
    /// Hasher builder used to compute minimizers.
    hasher: S,
    /// Binary encoding of the bases, indexed by byte value. Only the entries for
    /// `A`, `C`, `G`, `T` (upper and lower case) are ever set by this builder;
    /// every other entry stays 0.
    encoding: [u8; 256],
    /// Ties the otherwise unstored `T` and `A` parameters to the struct.
    _marker: PhantomData<(T, A)>,
}
40
41impl<T: PrimInt + Hash> MinimizerBuilder<T> {
42    /// Sets up the `MinimizerBuilder` with default values:
43    /// - minimizer_size = 21
44    /// - width = 11 (31 - 21 + 1)
45    /// - hasher = [`DefaultHashBuilder`]
46    /// - encoding: A = `00`, C = `01`, G = `10`, T = `11`
47    #[inline]
48    pub fn new() -> Self {
49        Self::_new()
50    }
51}
52
53impl<T: PrimInt + Hash> Default for MinimizerBuilder<T> {
54    #[inline]
55    fn default() -> Self {
56        Self::_new()
57    }
58}
59
60impl<T: PrimInt + Hash, S: BuildHasher> MinimizerBuilder<T, Minimizer, S, false> {
61    /// Builds an iterator over the minimizers and their positions in the given sequence.
62    #[inline]
63    pub fn iter(self, seq: &[u8]) -> MinimizerIterator<T, S> {
64        MinimizerIterator::new(
65            seq,
66            self.minimizer_size,
67            self.width,
68            self.hasher,
69            self.encoding,
70        )
71    }
72
73    /// Builds an iterator over the positions of the minimizers in the given sequence.
74    #[inline]
75    pub fn iter_pos(self, seq: &[u8]) -> MinimizerPosIterator<T, S> {
76        MinimizerPosIterator::new(
77            seq,
78            self.minimizer_size,
79            self.width,
80            self.hasher,
81            self.encoding,
82        )
83    }
84}
85
86impl<T: PrimInt + Hash, S: BuildHasher> MinimizerBuilder<T, Minimizer, S, true> {
87    /// Builds an iterator over the canonical minimizers and their positions in the given sequence with a boolean indicating a reverse complement.
88    /// It requires an odd width to break ties between multiple minimizers.
89    #[inline]
90    pub fn iter(self, seq: &[u8]) -> CanonicalMinimizerIterator<T, S> {
91        assert_eq!(
92            self.width % 2,
93            1,
94            "width must be odd to break ties between multiple minimizers"
95        );
96        CanonicalMinimizerIterator::new(
97            seq,
98            self.minimizer_size,
99            self.width,
100            self.hasher,
101            self.encoding,
102        )
103    }
104
105    /// Builds an iterator over the positions of the canonical minimizers in the given sequence with a boolean indicating a reverse complement.
106    /// It requires an odd width to break ties between multiple minimizers.
107    #[inline]
108    pub fn iter_pos(self, seq: &[u8]) -> CanonicalMinimizerPosIterator<T, S> {
109        assert_eq!(
110            self.width % 2,
111            1,
112            "width must be odd to break ties between multiple minimizers"
113        );
114        CanonicalMinimizerPosIterator::new(
115            seq,
116            self.minimizer_size,
117            self.width,
118            self.hasher,
119            self.encoding,
120        )
121    }
122}
123
/// Smallest `minimizer_size` accepted for mod-minimizers (the `r` of the assertion
/// messages). It is also the base term of `R + ((minimizer_size - R) % width)`, the
/// extra size argument passed to the mod-sampling iterators.
const R: usize = 4;
125
126impl<T: PrimInt + Hash> MinimizerBuilder<T, ModMinimizer> {
127    /// Sets up the `MinimizerBuilder` for mod-minimizers with default values:
128    /// - minimizer_size = 21
129    /// - width = 11 (31 - 21 + 1)
130    /// - hasher = [`DefaultHashBuilder`]
131    /// - encoding: A = `00`, C = `01`, G = `10`, T = `11`
132    #[inline]
133    pub fn new_mod() -> Self {
134        Self::_new()
135    }
136}
137
138impl<T: PrimInt + Hash, S: BuildHasher> MinimizerBuilder<T, ModMinimizer, S, false> {
139    /// Builds an iterator over the mod-minimizers and their positions in the given sequence.
140    #[inline]
141    pub fn iter(self, seq: &[u8]) -> ModSamplingIterator<T, S> {
142        assert!(
143            self.minimizer_size >= R,
144            "mod-minimizers require minimizer_size ≥ r={R}"
145        );
146        ModSamplingIterator::new(
147            seq,
148            self.minimizer_size,
149            self.width,
150            R + ((self.minimizer_size - R) % self.width as usize),
151            self.hasher,
152            self.encoding,
153        )
154    }
155
156    /// Builds an iterator over the positions of the mod-minimizers in the given sequence.
157    #[inline]
158    pub fn iter_pos(self, seq: &[u8]) -> ModSamplingPosIterator<T, S> {
159        assert!(
160            self.minimizer_size >= R,
161            "mod-minimizers require minimizer_size ≥ r={R}"
162        );
163        ModSamplingPosIterator::new(
164            seq,
165            self.minimizer_size,
166            self.width,
167            R + ((self.minimizer_size - R) % self.width as usize),
168            self.hasher,
169            self.encoding,
170        )
171    }
172}
173
174impl<T: PrimInt + Hash, S: BuildHasher> MinimizerBuilder<T, ModMinimizer, S, true> {
175    /// Builds an iterator over the canonical mod-minimizers and their positions in the given sequence with a boolean indicating a reverse complement.
176    /// It requires an odd width to break ties between multiple minimizers.
177    #[inline]
178    pub fn iter(self, seq: &[u8]) -> CanonicalModSamplingIterator<T, S> {
179        assert!(
180            self.minimizer_size >= R,
181            "mod-minimizers require minimizer_size ≥ r={R}"
182        );
183        assert_eq!(
184            self.width % 2,
185            1,
186            "width must be odd to break ties between multiple minimizers"
187        );
188        CanonicalModSamplingIterator::new(
189            seq,
190            self.minimizer_size,
191            self.width,
192            R + ((self.minimizer_size - R) % self.width as usize),
193            self.hasher,
194            self.encoding,
195        )
196    }
197
198    /// Builds an iterator over the positions of the canonical mod-minimizers in the given sequence with a boolean indicating a reverse complement.
199    /// It requires an odd width to break ties between multiple minimizers.
200    #[inline]
201    pub fn iter_pos(self, seq: &[u8]) -> CanonicalModSamplingPosIterator<T, S> {
202        assert!(
203            self.minimizer_size >= R,
204            "mod-minimizers require minimizer_size ≥ r={R}"
205        );
206        assert_eq!(
207            self.width % 2,
208            1,
209            "width must be odd to break ties between multiple minimizers"
210        );
211        CanonicalModSamplingPosIterator::new(
212            seq,
213            self.minimizer_size,
214            self.width,
215            R + ((self.minimizer_size - R) % self.width as usize),
216            self.hasher,
217            self.encoding,
218        )
219    }
220}
221
222impl<T: PrimInt + Hash, A: MinimizerAlgorithm> MinimizerBuilder<T, A, DefaultHashBuilder> {
223    fn _new() -> Self {
224        let mut encoding = [0u8; 256];
225        encoding[b'A' as usize] = 0b00;
226        encoding[b'a' as usize] = 0b00;
227        encoding[b'C' as usize] = 0b01;
228        encoding[b'c' as usize] = 0b01;
229        encoding[b'G' as usize] = 0b10;
230        encoding[b'g' as usize] = 0b10;
231        encoding[b'T' as usize] = 0b11;
232        encoding[b't' as usize] = 0b11;
233        Self {
234            minimizer_size: 21,
235            width: 31 - 21 + 1,
236            hasher: DefaultHashBuilder::default(),
237            encoding,
238            _marker: PhantomData,
239        }
240    }
241
242    /// Sets the seed of the default hasher.
243    pub fn seed(mut self, seed: u64) -> Self {
244        self.hasher = DefaultHashBuilder::with_seed(seed);
245        self
246    }
247}
248
249impl<T: PrimInt + Hash, A: MinimizerAlgorithm, S: BuildHasher, const CANONICAL: bool>
250    MinimizerBuilder<T, A, S, CANONICAL>
251{
252    /// Sets the size of the minimizers.
253    pub fn minimizer_size(mut self, minimizer_size: usize) -> Self {
254        let max_size = (T::zero().count_zeros() / 2) as usize;
255        assert!(
256            minimizer_size <= max_size,
257            "With this integer type, minimizer_size must be ≤ {max_size}. Please select a smaller size or a larger type."
258        );
259        self.minimizer_size = minimizer_size;
260        self
261    }
262
263    /// Sets the width of the window.
264    pub const fn width(mut self, width: u16) -> Self {
265        self.width = width;
266        self
267    }
268
269    /// Sets the hasher used to compute minimizers.
270    pub fn hasher<H: BuildHasher>(self, hasher: H) -> MinimizerBuilder<T, A, H, CANONICAL> {
271        MinimizerBuilder::<T, A, H, CANONICAL> {
272            minimizer_size: self.minimizer_size,
273            width: self.width,
274            hasher,
275            encoding: self.encoding,
276            _marker: self._marker,
277        }
278    }
279
280    /// Sets the binary encoding of the bases.
281    pub fn encoding(mut self, a: u8, c: u8, g: u8, t: u8) -> Self {
282        self.encoding[b'A' as usize] = a;
283        self.encoding[b'a' as usize] = a;
284        self.encoding[b'C' as usize] = c;
285        self.encoding[b'c' as usize] = c;
286        self.encoding[b'G' as usize] = g;
287        self.encoding[b'g' as usize] = g;
288        self.encoding[b'T' as usize] = t;
289        self.encoding[b't' as usize] = t;
290        self
291    }
292
293    /// Compute canonical minimizers.
294    pub fn canonical(self) -> MinimizerBuilder<T, A, S, true> {
295        MinimizerBuilder::<T, A, S, true> {
296            minimizer_size: self.minimizer_size,
297            width: self.width,
298            hasher: self.hasher,
299            encoding: self.encoding,
300            _marker: self._marker,
301        }
302    }
303
304    /// Compute non-canonical minimizers.
305    pub fn non_canonical(self) -> MinimizerBuilder<T, A, S, false> {
306        MinimizerBuilder::<T, A, S, false> {
307            minimizer_size: self.minimizer_size,
308            width: self.width,
309            hasher: self.hasher,
310            encoding: self.encoding,
311            _marker: self._marker,
312        }
313    }
314}