sshash_lib/minimizer.rs
1//! Minimizer extraction and iteration
2//!
3//! This module implements the core algorithm for extracting minimizers from k-mers.
4//! A minimizer is the smallest m-mer (subsequence of length m) within a k-mer,
5//! where "smallest" is determined by hashing.
6
7use crate::kmer::Kmer;
8use crate::hasher::DeterministicHasher;
9
10/// Information about a minimizer within a k-mer
11#[derive(Clone, Copy, Debug, PartialEq, Eq)]
12pub struct MinimizerInfo {
13 /// The minimizer value (encoded as bits)
14 pub value: u64,
15 /// The absolute position in the full string (not within the k-mer)
16 pub position: u64,
17 /// The position of the minimizer within the current k-mer (0 = rightmost)
18 pub pos_in_kmer: usize,
19}
20
21impl MinimizerInfo {
22 /// Create a new minimizer info
23 pub fn new(value: u64, position: u64, pos_in_kmer: usize) -> Self {
24 Self {
25 value,
26 position,
27 pos_in_kmer,
28 }
29 }
30}
31
32/// Iterator that extracts minimizers from k-mers using the "re-scan" method
33///
34/// This is an efficient minimizer extraction algorithm that:
35/// - Rescans the window when the minimum falls out
36/// - Performs single comparisons for sliding updates
37/// - Tracks the position of the minimum within the k-mer window
38///
39/// # Example
40/// ```
41/// use sshash_lib::kmer::Kmer;
42/// use sshash_lib::minimizer::MinimizerIterator;
43///
44/// // Create iterator for k-mer size 31, minimizer size 13
45/// let mut iter = MinimizerIterator::new(31, 13, 0);
46///
47/// // Process a k-mer and extract minimizer
48/// let kmer: Kmer<31> = Kmer::from_str("ACGTACGTACGTACGTACGTACGTACGTACG").unwrap();
49/// let mini = iter.next(kmer);
50/// println!("Minimizer at position {} in k-mer, global position {}",
51/// mini.pos_in_kmer, mini.position);
52/// ```
53pub struct MinimizerIterator {
54 k: usize,
55 m: usize,
56 position: u64,
57 min_pos_in_kmer: usize,
58 min_value: u64,
59 min_position: u64,
60 min_hash: u64,
61 hasher: DeterministicHasher,
62}
63
64impl MinimizerIterator {
65 /// Create a new minimizer iterator with a seeded hasher
66 ///
67 /// # Arguments
68 /// * `k` - k-mer size (must be >= m)
69 /// * `m` - minimizer size (must be <= k)
70 /// * `seed` - seed for the hash function (for deterministic results)
71 pub fn with_seed(k: usize, m: usize, seed: u64) -> Self {
72 assert!(k > 0 && m <= k, "k must be > 0 and m <= k");
73 let hasher = DeterministicHasher::new(seed);
74 let mut iter = Self {
75 k,
76 m,
77 position: 0,
78 min_pos_in_kmer: 0,
79 min_value: 0,
80 min_position: 0,
81 min_hash: u64::MAX,
82 hasher,
83 };
84 iter.reset();
85 iter
86 }
87
88 /// Create a new minimizer iterator (defaults to seed=1)
89 ///
90 /// # Arguments
91 /// * `k` - k-mer size (must be >= m)
92 /// * `m` - minimizer size (must be <= k)
93 /// * `_position` - (deprecated parameter for API compatibility)
94 pub fn new(k: usize, m: usize, _position: u64) -> Self {
95 Self::with_seed(k, m, 1)
96 }
97
98 /// Set a new starting position
99 pub fn set_position(&mut self, position: u64) {
100 self.position = position;
101 self.reset();
102 }
103
104 /// Reset internal state (called when position changes)
105 fn reset(&mut self) {
106 // Match C++ behavior: set min_pos_in_kmer to trigger rescan on first next()
107 // and set min_position to position - 1 (wrapping, matching C++ unsigned arithmetic)
108 // so that next() will compute begin = min_position + 1 = position
109 self.min_pos_in_kmer = 0;
110 self.min_position = self.position.wrapping_sub(1);
111 self.min_hash = u64::MAX;
112 }
113
114 /// Compute hash of a u64 value using the seeded hasher
115 fn hash_u64(&self, value: u64) -> u64 {
116 self.hasher.hash_u64(value)
117 }
118
119 /// Extract the next minimizer from a k-mer
120 ///
121 /// For sliding through a sequence, call this repeatedly with consecutive k-mers
122 /// to efficiently compute minimizers at each position.
123 pub fn next<const K: usize>(&mut self, kmer: Kmer<K>) -> MinimizerInfo
124 where
125 Kmer<K>: crate::kmer::KmerBits,
126 {
127 assert_eq!(K, self.k, "k-mer size must match iterator k");
128
129 if self.min_pos_in_kmer == 0 {
130 // Minimum fell out of window: rescan to find new minimum
131 // This matches C++ logic in minimizer_iterator.hpp line 35-36
132 // Use wrapping_add because min_position starts at u64::MAX after reset
133 self.position = self.min_position.wrapping_add(1);
134 self.rescan(kmer);
135 } else {
136 // Sliding: check new m-mer at the end of the k-mer
137 self.position += 1;
138
139 // Extract the rightmost m-mer
140 let mmer_value = self.extract_mmer(kmer, self.k - self.m);
141 let hash = self.hash_u64(mmer_value);
142
143 if hash < self.min_hash {
144 // Found a new minimum (always at position k-m in new k-mer)
145 self.min_hash = hash;
146 self.min_value = mmer_value;
147 self.min_position = self.position;
148 self.min_pos_in_kmer = self.k - self.m;
149 } else {
150 // Minimum is still in window, just shifted left
151 self.min_pos_in_kmer -= 1;
152 }
153 }
154
155 MinimizerInfo::new(self.min_value, self.min_position, self.min_pos_in_kmer)
156 }
157
158 /// Rescan the k-mer window to find all m-mers and their minimum
159 /// This matches the C++ implementation in minimizer_iterator.hpp lines 61-73
160 fn rescan<const K: usize>(&mut self, kmer: Kmer<K>)
161 where
162 Kmer<K>: crate::kmer::KmerBits,
163 {
164 self.min_hash = u64::MAX;
165 self.min_value = 0;
166 self.min_pos_in_kmer = 0;
167
168 let begin = self.position;
169
170 // Try each m-mer position in the k-mer window
171 for i in 0..=(self.k - self.m) {
172 let mmer_value = self.extract_mmer(kmer, i);
173 let hash = self.hash_u64(mmer_value);
174
175 if hash < self.min_hash {
176 // Leftmost minimum wins ties (matching C++ behavior)
177 self.min_hash = hash;
178 self.min_value = mmer_value;
179 self.min_pos_in_kmer = i;
180 }
181 }
182
183 // Set position to represent the position of the rightmost m-mer after rescan
184 // This matches C++ logic: m_position = begin + (k - m) at end of rescan
185 self.position = begin + (self.k - self.m) as u64;
186 self.min_position = begin + self.min_pos_in_kmer as u64;
187 }
188
189 /// Extract m bases starting at position `start_pos` from a k-mer
190 /// Returns the m-mer as a u64
191 #[inline]
192 fn extract_mmer<const K: usize>(&self, kmer: Kmer<K>, start_pos: usize) -> u64
193 where
194 Kmer<K>: crate::kmer::KmerBits,
195 {
196 // Extract m bases as a single shift+mask on the native storage type
197 let shift = start_pos * 2;
198 let mask = (1u64 << (self.m * 2)) - 1;
199 let bits_u128 = <Kmer<K> as crate::kmer::KmerBits>::to_u128(kmer.bits());
200 ((bits_u128 >> shift) as u64) & mask
201 }
202}
203
204#[cfg(test)]
205mod tests {
206 use super::*;
207 use crate::kmer::Kmer;
208
209 #[test]
210 fn test_minimizer_iterator_basic() {
211 // Create a simple k-mer: ACGTACGTACGTACGT (K=15, so we'll use K=15)
212 let kmer: Kmer<15> = Kmer::from_str("ACGTACGTACGTACG").unwrap();
213
214 let mut iter = MinimizerIterator::new(15, 7, 0);
215 let mini = iter.next(kmer);
216
217 // Verify it found a minimizer with valid properties
218 assert!(mini.pos_in_kmer <= 15 - 7); // pos_in_kmer is 0..=(k-m)
219 // position = pos_in_kmer for the first k-mer (min_position = begin + pos_in_kmer, begin=0)
220 assert_eq!(mini.position, mini.pos_in_kmer as u64);
221 }
222
223 #[test]
224 fn test_minimizer_iterator_position() {
225 // Note: This test verifies minimizer iteration works correctly.
226 // The exact position values depend on hash values of m-mers.
227 let kmer1: Kmer<9> = Kmer::from_str("ACGTACGTA").unwrap();
228 let kmer2: Kmer<9> = Kmer::from_str("CGTACGTAC").unwrap();
229
230 let mut iter = MinimizerIterator::new(9, 5, 0);
231 let mini1 = iter.next(kmer1);
232 let mini2 = iter.next(kmer2);
233
234 // Verify valid results were returned
235 assert!(mini1.pos_in_kmer < 9);
236 assert!(mini2.pos_in_kmer < 9);
237 }
238
239 #[test]
240 fn test_minimizer_info_creation() {
241 let mini = MinimizerInfo::new(42, 100, 5);
242 assert_eq!(mini.value, 42);
243 assert_eq!(mini.position, 100);
244 assert_eq!(mini.pos_in_kmer, 5);
245 }
246
247 #[test]
248 fn test_minimizer_consistency_fresh_vs_sequential() {
249 // This tests that a fresh iterator gives the same minimizer as
250 // processing k-mers sequentially through a string.
251 // Uses the failing k-mer from test_small.fa seq1 position 47.
252
253 let seq = "ATTTTCAGGATGTTTTCAGGTTCATCATCTCCCTTCTTTGCAGGATAGTAGATAAGATCGCTCATCAACGGATGTTGTGT";
254 let k = 31usize;
255 let m = 19usize;
256 let seed = 1u64;
257 let num_kmers = seq.len() - k + 1;
258
259 // Process ALL k-mers sequentially, like the build does
260 let mut seq_iter = MinimizerIterator::with_seed(k, m, seed);
261 seq_iter.set_position(0);
262
263 for kmer_pos in 0..num_kmers {
264 let kmer_str = &seq[kmer_pos..kmer_pos+k];
265 let kmer = Kmer::<31>::from_str(kmer_str).unwrap();
266
267 let seq_mini = seq_iter.next(kmer);
268
269 // Now compute with a FRESH iterator (like query does)
270 let mut fresh_iter = MinimizerIterator::with_seed(k, m, seed);
271 let fresh_mini = fresh_iter.next(kmer);
272
273 assert_eq!(
274 seq_mini.value, fresh_mini.value,
275 "Minimizer VALUE mismatch at kmer_pos={}: seq={} fresh={} kmer={}",
276 kmer_pos, seq_mini.value, fresh_mini.value, kmer_str
277 );
278 assert_eq!(
279 seq_mini.pos_in_kmer, fresh_mini.pos_in_kmer,
280 "Minimizer POS mismatch at kmer_pos={}: seq={} fresh={} kmer={}",
281 kmer_pos, seq_mini.pos_in_kmer, fresh_mini.pos_in_kmer, kmer_str
282 );
283 }
284 }
285}