Skip to main content

sshash_lib/
minimizers_control_map.rs

1//! Minimizers Control Map (MCM)
2//!
3//! Maps minimizer values to their control information using a minimal perfect hash function.
4//! This provides O(1) lookup of metadata associated with each unique minimizer.
5
6use rapidhash::HashMapExt;
7
8use crate::mphf_config::{Mphf, build_mphf_from_vec};
9use std::io;
10use tracing::info;
11
12/// Control information associated with a minimizer
13#[derive(Clone, Copy, Debug)]
14pub struct MinimizerControl {
15    /// Number of strings containing this minimizer
16    pub count: u32,
17    /// Bucket type (regular, sparse, or heavy-load)
18    pub bucket_type: BucketType,
19    /// Additional metadata
20    pub metadata: u64,
21}
22
23/// Classification of minimizer buckets by density
24#[derive(Clone, Copy, Debug, PartialEq, Eq)]
25#[derive(Default)]
26pub enum BucketType {
27    /// Regular bucket: few strings contain this minimizer
28    #[default]
29    Regular,
30    /// Sparse bucket: moderately populated
31    Sparse,
32    /// Heavy-load bucket: many strings contain this minimizer
33    HeavyLoad,
34}
35
36
37/// Builder for constructing a MinimizersControlMap
38///
39/// This two-phase builder collects minimizers during construction,
40/// then builds the MPHF once all minimizers are known.
41pub struct MinimizersControlMapBuilder {
42    /// Minimizers collected during building (stable order for MPHF)
43    minimizers: Vec<u64>,
44    /// Control information for each minimizer (parallel to minimizers vec)
45    controls: Vec<MinimizerControl>,
46    /// O(1) lookup: minimizer value → index in minimizers/controls vecs
47    index: rapidhash::RapidHashMap<u64, usize>,
48}
49
50impl MinimizersControlMapBuilder {
51    /// Create a new builder
52    pub fn new() -> Self {
53        Self {
54            minimizers: Vec::new(),
55            controls: Vec::new(),
56            index: rapidhash::RapidHashMap::new(),
57        }
58    }
59
60    /// Add a minimizer with initial control info
61    pub fn add_minimizer(&mut self, minimizer: u64) -> usize {
62        if let Some(&pos) = self.index.get(&minimizer) {
63            return pos;
64        }
65
66        let id = self.minimizers.len();
67        self.minimizers.push(minimizer);
68        self.controls.push(MinimizerControl {
69            count: 0,
70            bucket_type: BucketType::Regular,
71            metadata: 0,
72        });
73        self.index.insert(minimizer, id);
74        id
75    }
76
77    /// Increment count for a minimizer
78    pub fn increment_count(&mut self, minimizer: u64) {
79        let id = self.add_minimizer(minimizer);
80        self.controls[id].count = self.controls[id].count.saturating_add(1);
81    }
82
83    /// Set bucket type for a minimizer
84    pub fn set_bucket_type(&mut self, minimizer: u64, bucket_type: BucketType) {
85        let id = self.add_minimizer(minimizer);
86        self.controls[id].bucket_type = bucket_type;
87    }
88
89    /// Get mutable access to control info for a minimizer
90    pub fn get_control_mut(&mut self, minimizer: u64) -> Option<&mut MinimizerControl> {
91        self.index
92            .get(&minimizer)
93            .copied()
94            .map(|idx| &mut self.controls[idx])
95    }
96
97    /// Classify bucket types based on count thresholds
98    pub fn finalize_bucket_types(&mut self, threshold_sparse: u32, threshold_heavy: u32) {
99        for control in &mut self.controls {
100            control.bucket_type = if control.count > threshold_heavy {
101                BucketType::HeavyLoad
102            } else if control.count > threshold_sparse {
103                BucketType::Sparse
104            } else {
105                BucketType::Regular
106            };
107        }
108    }
109
110    /// Build the final MinimizersControlMap with MPHF
111    ///
112    /// Returns the control map AND a mapping from MPHF index to bucket_id.
113    /// This mapping is used to reorder control_codewords to MPHF order.
114    ///
115    /// # Arguments
116    /// * `c` - Relative level size (percentage, typically 100)
117    /// * `alpha` - Load factor (kept for C++ API parity, not used by ph)
118    pub fn build(self, _c: u16, _alpha: f64) -> io::Result<(MinimizersControlMap, Vec<usize>)> {
119        if self.minimizers.is_empty() {
120            return Ok((MinimizersControlMap {
121                mphf: None,
122                num_keys: 0,
123            }, Vec::new()));
124        }
125
126        let minimizers = self.minimizers;
127        let controls = self.controls;
128        let num_keys = minimizers.len() as u64;
129
130        info!("Building PHast MPHF for {} minimizers", num_keys);
131
132        // Build the MPHF using PHast
133        let mphf = build_mphf_from_vec(minimizers.clone());
134
135        // Build mapping: bucket_id_by_mphf_index[mphf_index] = bucket_id (= metadata)
136        let mut bucket_id_by_mphf_index = vec![0usize; controls.len()];
137        for (idx, minimizer) in minimizers.iter().enumerate() {
138            let pos = mphf.get(minimizer);
139            if pos < bucket_id_by_mphf_index.len() {
140                bucket_id_by_mphf_index[pos] = controls[idx].metadata as usize;
141            }
142        }
143
144        Ok((MinimizersControlMap {
145            mphf: Some(mphf),
146            num_keys,
147        }, bucket_id_by_mphf_index))
148    }
149
150    /// Get the number of unique minimizers
151    pub fn num_minimizers(&self) -> usize {
152        self.minimizers.len()
153    }
154}
155
156impl Default for MinimizersControlMapBuilder {
157    fn default() -> Self {
158        Self::new()
159    }
160}
161
162/// Minimizers Control Map
163///
164/// Associates each unique minimizer with a bucket ID using
165/// a minimal perfect hash function for O(1) lookups.
166/// After reordering, the MPHF index IS the index into control_codewords.
167pub struct MinimizersControlMap {
168    /// MPHF for minimizer lookup (PHast)
169    mphf: Option<Mphf>,
170    /// Number of keys in the MPHF
171    num_keys: u64,
172}
173
174impl MinimizersControlMap {
175    /// Create a MinimizersControlMap from existing parts (for testing)
176    pub fn from_parts(
177        _controls: Vec<MinimizerControl>,
178        minimizers: Vec<u64>,
179        num_keys: u64,
180    ) -> Self {
181        // Build MPHF if we have minimizers
182        let mphf = if !minimizers.is_empty() {
183            Some(build_mphf_from_vec(minimizers))
184        } else {
185            None
186        };
187        
188        Self {
189            mphf,
190            num_keys,
191        }
192    }
193    
194    /// Look up the MPHF index for a minimizer
195    ///
196    /// Returns the MPHF hash (bucket index into control_codewords), or None
197    /// if the minimizer is not in the map.
198    pub fn lookup(&self, minimizer: u64) -> Option<usize> {
199        if let Some(ref mphf) = self.mphf {
200            let id = mphf.get(&minimizer);
201            if id < self.num_keys as usize {
202                Some(id)
203            } else {
204                None
205            }
206        } else {
207            None
208        }
209    }
210
211    /// Get a reference to the MPHF (for serialization)
212    pub fn mphf_ref(&self) -> Option<&Mphf> {
213        self.mphf.as_ref()
214    }
215
216    /// Set the MPHF (for deserialization)
217    pub fn set_mphf(&mut self, mphf: Option<Mphf>) {
218        self.mphf = mphf;
219    }
220
221    /// Get the number of unique minimizers
222    pub fn num_minimizers(&self) -> u64 {
223        self.num_keys
224    }
225
226    /// Get memory usage in bits (approximation)
227    pub fn num_bits(&self) -> u64 {
228        
229        
230        if let Some(ref _mphf) = self.mphf {
231            // Estimate: fmph uses ~3.5-4 bits per key typically
232            (self.num_keys as f64 * 4.0) as u64
233        } else {
234            0
235        }
236    }
237
238    /// Exact serialized byte size of the main MPHF
239    pub fn mphf_serialized_bytes(&self) -> usize {
240        match &self.mphf {
241            Some(mphf) => mphf.write_bytes(),
242            None => 0,
243        }
244    }
245
246    /// Serialize control map without MPHF
247    ///
248    /// Only stores num_keys since control_codewords are now in the index.
249    pub fn serialize_without_mphf<W: io::Write>(
250        &self,
251        writer: &mut W,
252    ) -> io::Result<()> {
253        // Write number of keys
254        writer.write_all(&self.num_keys.to_le_bytes())?;
255        
256        Ok(())
257    }
258
259    /// Deserialize control map without MPHF
260    ///
261    /// Returns MinimizersControlMap with empty MPHF (must be filled by caller)
262    pub fn deserialize_without_mphf<R: io::Read>(
263        reader: &mut R,
264    ) -> io::Result<Self> {
265        // Read number of keys
266        let mut num_keys_bytes = [0u8; 8];
267        reader.read_exact(&mut num_keys_bytes)?;
268        let num_keys = u64::from_le_bytes(num_keys_bytes);
269        
270        Ok(Self {
271            mphf: None,
272            num_keys,
273        })
274    }
275}
276
277#[cfg(test)]
278mod tests {
279    use super::*;
280
281    #[test]
282    fn test_minimizers_control_map_builder_creation() {
283        let builder = MinimizersControlMapBuilder::new();
284        assert_eq!(builder.num_minimizers(), 0);
285    }
286
287    #[test]
288    fn test_minimizers_control_map_builder_add() {
289        let mut builder = MinimizersControlMapBuilder::new();
290        
291        let id1 = builder.add_minimizer(100);
292        let id2 = builder.add_minimizer(200);
293        let id3 = builder.add_minimizer(100); // Duplicate
294        
295        assert_eq!(id1, 0);
296        assert_eq!(id2, 1);
297        assert_eq!(id3, 0); // Same as first
298        assert_eq!(builder.num_minimizers(), 2);
299    }
300
301    #[test]
302    fn test_minimizers_control_map_builder_increment() {
303        let mut builder = MinimizersControlMapBuilder::new();
304        
305        builder.increment_count(100);
306        builder.increment_count(100);
307        builder.increment_count(200);
308        
309        assert_eq!(builder.num_minimizers(), 2);
310        assert_eq!(builder.controls[0].count, 2);
311        assert_eq!(builder.controls[1].count, 1);
312    }
313
314    #[test]
315    fn test_minimizers_control_map_builder_bucket_type() {
316        let mut builder = MinimizersControlMapBuilder::new();
317        
318        builder.add_minimizer(100);
319        builder.set_bucket_type(100, BucketType::Sparse);
320        
321        assert_eq!(builder.controls[0].bucket_type, BucketType::Sparse);
322    }
323
324    #[test]
325    fn test_minimizers_control_map_builder_finalize() {
326        let mut builder = MinimizersControlMapBuilder::new();
327        
328        builder.add_minimizer(100);
329        builder.controls[0].count = 5;
330        
331        builder.add_minimizer(200);
332        builder.controls[1].count = 15;
333        
334        builder.add_minimizer(300);
335        builder.controls[2].count = 150;
336        
337        builder.finalize_bucket_types(10, 100);
338        
339        assert_eq!(builder.controls[0].bucket_type, BucketType::Regular);
340        assert_eq!(builder.controls[1].bucket_type, BucketType::Sparse);
341        assert_eq!(builder.controls[2].bucket_type, BucketType::HeavyLoad);
342    }
343
344    #[test]
345    fn test_minimizers_control_map_build_empty() {
346        let builder = MinimizersControlMapBuilder::new();
347        let (mcm, mapping) = builder.build(100, 0.94).unwrap();
348        
349        assert_eq!(mcm.num_minimizers(), 0);
350        assert!(mcm.mphf.is_none());
351        assert!(mapping.is_empty());
352    }
353
354    #[test]
355    fn test_minimizers_control_map_build_and_lookup() {
356        let mut builder = MinimizersControlMapBuilder::new();
357        
358        builder.increment_count(100);
359        builder.increment_count(100);
360        builder.increment_count(200);
361        builder.set_bucket_type(100, BucketType::Sparse);
362        
363        let (mcm, _mapping) = builder.build(100, 0.94).unwrap();
364        
365        assert_eq!(mcm.num_minimizers(), 2);
366        
367        // lookup now returns Option<usize> (MPHF index)
368        let idx_100 = mcm.lookup(100).unwrap();
369        assert!(idx_100 < 2); // Valid MPHF index
370        
371        let idx_200 = mcm.lookup(200).unwrap();
372        assert!(idx_200 < 2); // Valid MPHF index
373        assert_ne!(idx_100, idx_200); // Different minimizers get different indices
374    }
375
376    #[test]
377    fn test_minimizers_control_map_lookup_missing() {
378        let mut builder = MinimizersControlMapBuilder::new();
379        builder.increment_count(100);
380        
381        let (mcm, _) = builder.build(100, 0.94).unwrap();
382        
383        // 300 was never added
384        let _result = mcm.lookup(300);
385        // Note: MPHF might return a value even for keys not in the original set,
386        // but it should be out of bounds or not match expected semantics
387        // For a true membership test, we'd need a separate bloom filter or similar
388    }
389
390    #[test]
391    fn test_bucket_type_default() {
392        let bucket_type = BucketType::default();
393        assert_eq!(bucket_type, BucketType::Regular);
394    }
395
396    #[test]
397    fn test_minimizer_control_default() {
398        let control = MinimizerControl {
399            count: 5,
400            bucket_type: BucketType::Regular,
401            metadata: 0,
402        };
403        assert_eq!(control.count, 5);
404        assert_eq!(control.bucket_type, BucketType::Regular);
405    }
406}