Skip to main content

sshash_lib/
minimizer.rs

1//! Minimizer extraction and iteration
2//!
3//! This module implements the core algorithm for extracting minimizers from k-mers.
4//! A minimizer is the smallest m-mer (subsequence of length m) within a k-mer,
5//! where "smallest" is determined by hashing.
6
7use crate::kmer::Kmer;
8use crate::hasher::DeterministicHasher;
9
10/// Information about a minimizer within a k-mer
11#[derive(Clone, Copy, Debug, PartialEq, Eq)]
12pub struct MinimizerInfo {
13    /// The minimizer value (encoded as bits)
14    pub value: u64,
15    /// The absolute position in the full string (not within the k-mer)
16    pub position: u64,
17    /// The position of the minimizer within the current k-mer (0 = rightmost)
18    pub pos_in_kmer: usize,
19}
20
21impl MinimizerInfo {
22    /// Create a new minimizer info
23    pub fn new(value: u64, position: u64, pos_in_kmer: usize) -> Self {
24        Self {
25            value,
26            position,
27            pos_in_kmer,
28        }
29    }
30}
31
32/// Iterator that extracts minimizers from k-mers using the "re-scan" method
33///
34/// This is an efficient minimizer extraction algorithm that:
35/// - Rescans the window when the minimum falls out
36/// - Performs single comparisons for sliding updates
37/// - Tracks the position of the minimum within the k-mer window
38///
39/// # Example
40/// ```
41/// use sshash_lib::kmer::Kmer;
42/// use sshash_lib::minimizer::MinimizerIterator;
43///
44/// // Create iterator for k-mer size 31, minimizer size 13
45/// let mut iter = MinimizerIterator::new(31, 13, 0);
46///
47/// // Process a k-mer and extract minimizer
48/// let kmer: Kmer<31> = Kmer::from_str("ACGTACGTACGTACGTACGTACGTACGTACG").unwrap();
49/// let mini = iter.next(kmer);
50/// println!("Minimizer at position {} in k-mer, global position {}", 
51///     mini.pos_in_kmer, mini.position);
52/// ```
53pub struct MinimizerIterator {
54    k: usize,
55    m: usize,
56    position: u64,
57    min_pos_in_kmer: usize,
58    min_value: u64,
59    min_position: u64,
60    min_hash: u64,
61    hasher: DeterministicHasher,
62}
63
64impl MinimizerIterator {
65    /// Create a new minimizer iterator with a seeded hasher
66    ///
67    /// # Arguments
68    /// * `k` - k-mer size (must be >= m)
69    /// * `m` - minimizer size (must be <= k)
70    /// * `seed` - seed for the hash function (for deterministic results)
71    pub fn with_seed(k: usize, m: usize, seed: u64) -> Self {
72        assert!(k > 0 && m <= k, "k must be > 0 and m <= k");
73        let hasher = DeterministicHasher::new(seed);
74        let mut iter = Self {
75            k,
76            m,
77            position: 0,
78            min_pos_in_kmer: 0,
79            min_value: 0,
80            min_position: 0,
81            min_hash: u64::MAX,
82            hasher,
83        };
84        iter.reset();
85        iter
86    }
87
88    /// Create a new minimizer iterator (defaults to seed=1)
89    ///
90    /// # Arguments
91    /// * `k` - k-mer size (must be >= m)
92    /// * `m` - minimizer size (must be <= k)
93    /// * `_position` - (deprecated parameter for API compatibility)
94    pub fn new(k: usize, m: usize, _position: u64) -> Self {
95        Self::with_seed(k, m, 1)
96    }
97
98    /// Set a new starting position
99    pub fn set_position(&mut self, position: u64) {
100        self.position = position;
101        self.reset();
102    }
103
104    /// Reset internal state (called when position changes)
105    fn reset(&mut self) {
106        // Match C++ behavior: set min_pos_in_kmer to trigger rescan on first next()
107        // and set min_position to position - 1 (wrapping, matching C++ unsigned arithmetic)
108        // so that next() will compute begin = min_position + 1 = position
109        self.min_pos_in_kmer = 0;
110        self.min_position = self.position.wrapping_sub(1);
111        self.min_hash = u64::MAX;
112    }
113
114    /// Compute hash of a u64 value using the seeded hasher
115    fn hash_u64(&self, value: u64) -> u64 {
116        self.hasher.hash_u64(value)
117    }
118
119    /// Extract the next minimizer from a k-mer
120    ///
121    /// For sliding through a sequence, call this repeatedly with consecutive k-mers
122    /// to efficiently compute minimizers at each position.
123    pub fn next<const K: usize>(&mut self, kmer: Kmer<K>) -> MinimizerInfo
124    where
125        Kmer<K>: crate::kmer::KmerBits,
126    {
127        assert_eq!(K, self.k, "k-mer size must match iterator k");
128
129        if self.min_pos_in_kmer == 0 {
130            // Minimum fell out of window: rescan to find new minimum
131            // This matches C++ logic in minimizer_iterator.hpp line 35-36
132            // Use wrapping_add because min_position starts at u64::MAX after reset
133            self.position = self.min_position.wrapping_add(1);
134            self.rescan(kmer);
135        } else {
136            // Sliding: check new m-mer at the end of the k-mer
137            self.position += 1;
138
139            // Extract the rightmost m-mer
140            let mmer_value = self.extract_mmer(kmer, self.k - self.m);
141            let hash = self.hash_u64(mmer_value);
142
143            if hash < self.min_hash {
144                // Found a new minimum (always at position k-m in new k-mer)
145                self.min_hash = hash;
146                self.min_value = mmer_value;
147                self.min_position = self.position;
148                self.min_pos_in_kmer = self.k - self.m;
149            } else {
150                // Minimum is still in window, just shifted left
151                self.min_pos_in_kmer -= 1;
152            }
153        }
154
155        MinimizerInfo::new(self.min_value, self.min_position, self.min_pos_in_kmer)
156    }
157
158    /// Rescan the k-mer window to find all m-mers and their minimum
159    /// This matches the C++ implementation in minimizer_iterator.hpp lines 61-73
160    fn rescan<const K: usize>(&mut self, kmer: Kmer<K>)
161    where
162        Kmer<K>: crate::kmer::KmerBits,
163    {
164        self.min_hash = u64::MAX;
165        self.min_value = 0;
166        self.min_pos_in_kmer = 0;
167
168        let begin = self.position;
169        
170        // Try each m-mer position in the k-mer window
171        for i in 0..=(self.k - self.m) {
172            let mmer_value = self.extract_mmer(kmer, i);
173            let hash = self.hash_u64(mmer_value);
174
175            if hash < self.min_hash {
176                // Leftmost minimum wins ties (matching C++ behavior)
177                self.min_hash = hash;
178                self.min_value = mmer_value;
179                self.min_pos_in_kmer = i;
180            }
181        }
182        
183        // Set position to represent the position of the rightmost m-mer after rescan
184        // This matches C++ logic: m_position = begin + (k - m) at end of rescan
185        self.position = begin + (self.k - self.m) as u64;
186        self.min_position = begin + self.min_pos_in_kmer as u64;
187    }
188
189    /// Extract m bases starting at position `start_pos` from a k-mer
190    /// Returns the m-mer as a u64
191    #[inline]
192    fn extract_mmer<const K: usize>(&self, kmer: Kmer<K>, start_pos: usize) -> u64
193    where
194        Kmer<K>: crate::kmer::KmerBits,
195    {
196        // Extract m bases as a single shift+mask on the native storage type
197        let shift = start_pos * 2;
198        let mask = (1u64 << (self.m * 2)) - 1;
199        let bits_u128 = <Kmer<K> as crate::kmer::KmerBits>::to_u128(kmer.bits());
200        ((bits_u128 >> shift) as u64) & mask
201    }
202}
203
204#[cfg(test)]
205mod tests {
206    use super::*;
207    use crate::kmer::Kmer;
208
209    #[test]
210    fn test_minimizer_iterator_basic() {
211        // Create a simple k-mer: ACGTACGTACGTACGT (K=15, so we'll use K=15)
212        let kmer: Kmer<15> = Kmer::from_str("ACGTACGTACGTACG").unwrap();
213
214        let mut iter = MinimizerIterator::new(15, 7, 0);
215        let mini = iter.next(kmer);
216
217        // Verify it found a minimizer with valid properties
218        assert!(mini.pos_in_kmer <= 15 - 7); // pos_in_kmer is 0..=(k-m)
219        // position = pos_in_kmer for the first k-mer (min_position = begin + pos_in_kmer, begin=0)
220        assert_eq!(mini.position, mini.pos_in_kmer as u64);
221    }
222
223    #[test]
224    fn test_minimizer_iterator_position() {
225        // Note: This test verifies minimizer iteration works correctly.
226        // The exact position values depend on hash values of m-mers.
227        let kmer1: Kmer<9> = Kmer::from_str("ACGTACGTA").unwrap();
228        let kmer2: Kmer<9> = Kmer::from_str("CGTACGTAC").unwrap();
229
230        let mut iter = MinimizerIterator::new(9, 5, 0);
231        let mini1 = iter.next(kmer1);
232        let mini2 = iter.next(kmer2);
233
234        // Verify valid results were returned
235        assert!(mini1.pos_in_kmer < 9);
236        assert!(mini2.pos_in_kmer < 9);
237    }
238
239    #[test]
240    fn test_minimizer_info_creation() {
241        let mini = MinimizerInfo::new(42, 100, 5);
242        assert_eq!(mini.value, 42);
243        assert_eq!(mini.position, 100);
244        assert_eq!(mini.pos_in_kmer, 5);
245    }
246
247    #[test]
248    fn test_minimizer_consistency_fresh_vs_sequential() {
249        // This tests that a fresh iterator gives the same minimizer as
250        // processing k-mers sequentially through a string.
251        // Uses the failing k-mer from test_small.fa seq1 position 47.
252        
253        let seq = "ATTTTCAGGATGTTTTCAGGTTCATCATCTCCCTTCTTTGCAGGATAGTAGATAAGATCGCTCATCAACGGATGTTGTGT";
254        let k = 31usize;
255        let m = 19usize;
256        let seed = 1u64;
257        let num_kmers = seq.len() - k + 1;
258        
259        // Process ALL k-mers sequentially, like the build does
260        let mut seq_iter = MinimizerIterator::with_seed(k, m, seed);
261        seq_iter.set_position(0);
262        
263        for kmer_pos in 0..num_kmers {
264            let kmer_str = &seq[kmer_pos..kmer_pos+k];
265            let kmer = Kmer::<31>::from_str(kmer_str).unwrap();
266            
267            let seq_mini = seq_iter.next(kmer);
268            
269            // Now compute with a FRESH iterator (like query does)
270            let mut fresh_iter = MinimizerIterator::with_seed(k, m, seed);
271            let fresh_mini = fresh_iter.next(kmer);
272            
273            assert_eq!(
274                seq_mini.value, fresh_mini.value,
275                "Minimizer VALUE mismatch at kmer_pos={}: seq={} fresh={} kmer={}",
276                kmer_pos, seq_mini.value, fresh_mini.value, kmer_str
277            );
278            assert_eq!(
279                seq_mini.pos_in_kmer, fresh_mini.pos_in_kmer,
280                "Minimizer POS mismatch at kmer_pos={}: seq={} fresh={} kmer={}",
281                kmer_pos, seq_mini.pos_in_kmer, fresh_mini.pos_in_kmer, kmer_str
282            );
283        }
284    }
285}