Skip to main content

sshash_lib/
minimizers_control_map.rs

1//! Minimizers Control Map (MCM)
2//!
3//! Maps minimizer values to their control information using a minimal perfect hash function.
4//! This provides O(1) lookup of metadata associated with each unique minimizer.
5
6use rapidhash::HashMapExt;
7
8use crate::partitioned_mphf::PartitionedMphf;
9use std::io;
10use tracing::info;
11
12/// Control information associated with a minimizer
13#[derive(Clone, Copy, Debug)]
14pub struct MinimizerControl {
15    /// Number of strings containing this minimizer
16    pub count: u32,
17    /// Bucket type (regular, sparse, or heavy-load)
18    pub bucket_type: BucketType,
19    /// Additional metadata
20    pub metadata: u64,
21}
22
23/// Classification of minimizer buckets by density
24#[derive(Clone, Copy, Debug, PartialEq, Eq)]
25#[derive(Default)]
26pub enum BucketType {
27    /// Regular bucket: few strings contain this minimizer
28    #[default]
29    Regular,
30    /// Sparse bucket: moderately populated
31    Sparse,
32    /// Heavy-load bucket: many strings contain this minimizer
33    HeavyLoad,
34}
35
36
37/// Builder for constructing a MinimizersControlMap
38///
39/// This two-phase builder collects minimizers during construction,
40/// then builds the MPHF once all minimizers are known.
41pub struct MinimizersControlMapBuilder {
42    /// Minimizers collected during building (stable order for MPHF)
43    minimizers: Vec<u64>,
44    /// Control information for each minimizer (parallel to minimizers vec)
45    controls: Vec<MinimizerControl>,
46    /// O(1) lookup: minimizer value → index in minimizers/controls vecs
47    index: rapidhash::RapidHashMap<u64, usize>,
48}
49
50impl MinimizersControlMapBuilder {
51    /// Create a new builder
52    pub fn new() -> Self {
53        Self {
54            minimizers: Vec::new(),
55            controls: Vec::new(),
56            index: rapidhash::RapidHashMap::new(),
57        }
58    }
59
60    /// Add a minimizer with initial control info
61    pub fn add_minimizer(&mut self, minimizer: u64) -> usize {
62        if let Some(&pos) = self.index.get(&minimizer) {
63            return pos;
64        }
65
66        let id = self.minimizers.len();
67        self.minimizers.push(minimizer);
68        self.controls.push(MinimizerControl {
69            count: 0,
70            bucket_type: BucketType::Regular,
71            metadata: 0,
72        });
73        self.index.insert(minimizer, id);
74        id
75    }
76
77    /// Increment count for a minimizer
78    pub fn increment_count(&mut self, minimizer: u64) {
79        let id = self.add_minimizer(minimizer);
80        self.controls[id].count = self.controls[id].count.saturating_add(1);
81    }
82
83    /// Set bucket type for a minimizer
84    pub fn set_bucket_type(&mut self, minimizer: u64, bucket_type: BucketType) {
85        let id = self.add_minimizer(minimizer);
86        self.controls[id].bucket_type = bucket_type;
87    }
88
89    /// Get mutable access to control info for a minimizer
90    pub fn get_control_mut(&mut self, minimizer: u64) -> Option<&mut MinimizerControl> {
91        self.index
92            .get(&minimizer)
93            .copied()
94            .map(|idx| &mut self.controls[idx])
95    }
96
97    /// Classify bucket types based on count thresholds
98    pub fn finalize_bucket_types(&mut self, threshold_sparse: u32, threshold_heavy: u32) {
99        for control in &mut self.controls {
100            control.bucket_type = if control.count > threshold_heavy {
101                BucketType::HeavyLoad
102            } else if control.count > threshold_sparse {
103                BucketType::Sparse
104            } else {
105                BucketType::Regular
106            };
107        }
108    }
109
110    /// Build the final MinimizersControlMap with MPHF
111    ///
112    /// Returns the control map AND a mapping from MPHF index to bucket_id.
113    /// This mapping is used to reorder control_codewords to MPHF order.
114    ///
115    /// # Arguments
116    /// * `c` - Relative level size (percentage, typically 100)
117    /// * `alpha` - Load factor (kept for C++ API parity, not used by ph)
118    /// * `partitioned` - Whether to use partitioned MPHF (parallel build)
119    pub fn build(self, _c: u16, _alpha: f64, partitioned: bool) -> io::Result<(MinimizersControlMap, Vec<usize>)> {
120        if self.minimizers.is_empty() {
121            return Ok((MinimizersControlMap {
122                mphf: None,
123                num_keys: 0,
124            }, Vec::new()));
125        }
126
127        let minimizers = self.minimizers;
128        let controls = self.controls;
129        let num_keys = minimizers.len() as u64;
130
131        info!("Building PHast MPHF for {} minimizers (partitioned={})", num_keys, partitioned);
132
133        // Build the MPHF using PartitionedMphf
134        let mphf = PartitionedMphf::build_from_vec(minimizers.clone(), partitioned);
135
136        // Build mapping: bucket_id_by_mphf_index[mphf_index] = bucket_id (= metadata)
137        let mut bucket_id_by_mphf_index = vec![0usize; controls.len()];
138        for (idx, minimizer) in minimizers.iter().enumerate() {
139            let pos = mphf.get(minimizer);
140            if pos < bucket_id_by_mphf_index.len() {
141                bucket_id_by_mphf_index[pos] = controls[idx].metadata as usize;
142            }
143        }
144
145        Ok((MinimizersControlMap {
146            mphf: Some(mphf),
147            num_keys,
148        }, bucket_id_by_mphf_index))
149    }
150
151    /// Get the number of unique minimizers
152    pub fn num_minimizers(&self) -> usize {
153        self.minimizers.len()
154    }
155}
156
157impl Default for MinimizersControlMapBuilder {
158    fn default() -> Self {
159        Self::new()
160    }
161}
162
163/// Minimizers Control Map
164///
165/// Associates each unique minimizer with a bucket ID using
166/// a minimal perfect hash function for O(1) lookups.
167/// After reordering, the MPHF index IS the index into control_codewords.
168pub struct MinimizersControlMap {
169    /// MPHF for minimizer lookup (partitioned PHast)
170    mphf: Option<PartitionedMphf>,
171    /// Number of keys in the MPHF
172    num_keys: u64,
173}
174
175impl MinimizersControlMap {
176    /// Create a MinimizersControlMap from existing parts (for testing)
177    pub fn from_parts(
178        _controls: Vec<MinimizerControl>,
179        minimizers: Vec<u64>,
180        num_keys: u64,
181    ) -> Self {
182        // Build MPHF if we have minimizers
183        let mphf = if !minimizers.is_empty() {
184            Some(PartitionedMphf::build_from_vec(minimizers, false))
185        } else {
186            None
187        };
188
189        Self {
190            mphf,
191            num_keys,
192        }
193    }
194
195    /// Create a MinimizersControlMap from a pre-built MPHF.
196    ///
197    /// Used by the external sort build path which builds the MPHF directly
198    /// without going through MinimizersControlMapBuilder.
199    pub fn from_mphf(mphf: PartitionedMphf, num_keys: u64) -> Self {
200        Self {
201            mphf: Some(mphf),
202            num_keys,
203        }
204    }
205    
206    /// Look up the MPHF index for a minimizer
207    ///
208    /// Returns the MPHF hash (bucket index into control_codewords), or None
209    /// if the minimizer is not in the map.
210    pub fn lookup(&self, minimizer: u64) -> Option<usize> {
211        if let Some(ref mphf) = self.mphf {
212            let id = mphf.get(&minimizer);
213            if id < self.num_keys as usize {
214                Some(id)
215            } else {
216                None
217            }
218        } else {
219            None
220        }
221    }
222
223    /// Get a reference to the MPHF (for serialization)
224    pub fn mphf_ref(&self) -> Option<&PartitionedMphf> {
225        self.mphf.as_ref()
226    }
227
228    /// Set the MPHF (for deserialization)
229    pub fn set_mphf(&mut self, mphf: Option<PartitionedMphf>) {
230        self.mphf = mphf;
231    }
232
233    /// Get the number of unique minimizers
234    pub fn num_minimizers(&self) -> u64 {
235        self.num_keys
236    }
237
238    /// Get memory usage in bits (approximation)
239    pub fn num_bits(&self) -> u64 {
240        
241        
242        if let Some(ref _mphf) = self.mphf {
243            // Estimate: fmph uses ~3.5-4 bits per key typically
244            (self.num_keys as f64 * 4.0) as u64
245        } else {
246            0
247        }
248    }
249
250    /// Exact serialized byte size of the main MPHF
251    pub fn mphf_serialized_bytes(&self) -> usize {
252        match &self.mphf {
253            Some(pmphf) => pmphf.write_bytes(),
254            None => 0,
255        }
256    }
257
258    /// Serialize control map without MPHF
259    ///
260    /// Only stores num_keys since control_codewords are now in the index.
261    pub fn serialize_without_mphf<W: io::Write>(
262        &self,
263        writer: &mut W,
264    ) -> io::Result<()> {
265        // Write number of keys
266        writer.write_all(&self.num_keys.to_le_bytes())?;
267        
268        Ok(())
269    }
270
271    /// Deserialize control map without MPHF
272    ///
273    /// Returns MinimizersControlMap with empty MPHF (must be filled by caller)
274    pub fn deserialize_without_mphf<R: io::Read>(
275        reader: &mut R,
276    ) -> io::Result<Self> {
277        // Read number of keys
278        let mut num_keys_bytes = [0u8; 8];
279        reader.read_exact(&mut num_keys_bytes)?;
280        let num_keys = u64::from_le_bytes(num_keys_bytes);
281        
282        Ok(Self {
283            mphf: None,
284            num_keys,
285        })
286    }
287}
288
289#[cfg(test)]
290mod tests {
291    use super::*;
292
293    #[test]
294    fn test_minimizers_control_map_builder_creation() {
295        let builder = MinimizersControlMapBuilder::new();
296        assert_eq!(builder.num_minimizers(), 0);
297    }
298
299    #[test]
300    fn test_minimizers_control_map_builder_add() {
301        let mut builder = MinimizersControlMapBuilder::new();
302        
303        let id1 = builder.add_minimizer(100);
304        let id2 = builder.add_minimizer(200);
305        let id3 = builder.add_minimizer(100); // Duplicate
306        
307        assert_eq!(id1, 0);
308        assert_eq!(id2, 1);
309        assert_eq!(id3, 0); // Same as first
310        assert_eq!(builder.num_minimizers(), 2);
311    }
312
313    #[test]
314    fn test_minimizers_control_map_builder_increment() {
315        let mut builder = MinimizersControlMapBuilder::new();
316        
317        builder.increment_count(100);
318        builder.increment_count(100);
319        builder.increment_count(200);
320        
321        assert_eq!(builder.num_minimizers(), 2);
322        assert_eq!(builder.controls[0].count, 2);
323        assert_eq!(builder.controls[1].count, 1);
324    }
325
326    #[test]
327    fn test_minimizers_control_map_builder_bucket_type() {
328        let mut builder = MinimizersControlMapBuilder::new();
329        
330        builder.add_minimizer(100);
331        builder.set_bucket_type(100, BucketType::Sparse);
332        
333        assert_eq!(builder.controls[0].bucket_type, BucketType::Sparse);
334    }
335
336    #[test]
337    fn test_minimizers_control_map_builder_finalize() {
338        let mut builder = MinimizersControlMapBuilder::new();
339        
340        builder.add_minimizer(100);
341        builder.controls[0].count = 5;
342        
343        builder.add_minimizer(200);
344        builder.controls[1].count = 15;
345        
346        builder.add_minimizer(300);
347        builder.controls[2].count = 150;
348        
349        builder.finalize_bucket_types(10, 100);
350        
351        assert_eq!(builder.controls[0].bucket_type, BucketType::Regular);
352        assert_eq!(builder.controls[1].bucket_type, BucketType::Sparse);
353        assert_eq!(builder.controls[2].bucket_type, BucketType::HeavyLoad);
354    }
355
356    #[test]
357    fn test_minimizers_control_map_build_empty() {
358        let builder = MinimizersControlMapBuilder::new();
359        let (mcm, mapping) = builder.build(100, 0.94, false).unwrap();
360        
361        assert_eq!(mcm.num_minimizers(), 0);
362        assert!(mcm.mphf.is_none());
363        assert!(mapping.is_empty());
364    }
365
366    #[test]
367    fn test_minimizers_control_map_build_and_lookup() {
368        let mut builder = MinimizersControlMapBuilder::new();
369        
370        builder.increment_count(100);
371        builder.increment_count(100);
372        builder.increment_count(200);
373        builder.set_bucket_type(100, BucketType::Sparse);
374        
375        let (mcm, _mapping) = builder.build(100, 0.94, false).unwrap();
376        
377        assert_eq!(mcm.num_minimizers(), 2);
378        
379        // lookup now returns Option<usize> (MPHF index)
380        let idx_100 = mcm.lookup(100).unwrap();
381        assert!(idx_100 < 2); // Valid MPHF index
382        
383        let idx_200 = mcm.lookup(200).unwrap();
384        assert!(idx_200 < 2); // Valid MPHF index
385        assert_ne!(idx_100, idx_200); // Different minimizers get different indices
386    }
387
388    #[test]
389    fn test_minimizers_control_map_lookup_missing() {
390        let mut builder = MinimizersControlMapBuilder::new();
391        builder.increment_count(100);
392        
393        let (mcm, _) = builder.build(100, 0.94, false).unwrap();
394        
395        // 300 was never added
396        let _result = mcm.lookup(300);
397        // Note: MPHF might return a value even for keys not in the original set,
398        // but it should be out of bounds or not match expected semantics
399        // For a true membership test, we'd need a separate bloom filter or similar
400    }
401
402    #[test]
403    fn test_bucket_type_default() {
404        let bucket_type = BucketType::default();
405        assert_eq!(bucket_type, BucketType::Regular);
406    }
407
408    #[test]
409    fn test_minimizer_control_default() {
410        let control = MinimizerControl {
411            count: 5,
412            bucket_type: BucketType::Regular,
413            metadata: 0,
414        };
415        assert_eq!(control.count, 5);
416        assert_eq!(control.bucket_type, BucketType::Regular);
417    }
418}