1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
//! Minimizers Control Map (MCM)
//!
//! Maps minimizer values to their control information using a minimal perfect hash function.
//! This provides O(1) lookup of metadata associated with each unique minimizer.
use rapidhash::HashMapExt;
use crate::partitioned_mphf::PartitionedMphf;
use std::io;
use tracing::info;
/// Control information associated with a minimizer
#[derive(Clone, Copy, Debug)]
pub struct MinimizerControl {
/// Number of strings containing this minimizer
pub count: u32,
/// Bucket type (regular, sparse, or heavy-load)
pub bucket_type: BucketType,
/// Additional metadata
pub metadata: u64,
}
/// Classification of minimizer buckets by density
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
#[derive(Default)]
pub enum BucketType {
/// Regular bucket: few strings contain this minimizer
#[default]
Regular,
/// Sparse bucket: moderately populated
Sparse,
/// Heavy-load bucket: many strings contain this minimizer
HeavyLoad,
}
/// Builder for constructing a MinimizersControlMap
///
/// This two-phase builder collects minimizers during construction,
/// then builds the MPHF once all minimizers are known.
pub struct MinimizersControlMapBuilder {
/// Minimizers collected during building (stable order for MPHF)
minimizers: Vec<u64>,
/// Control information for each minimizer (parallel to minimizers vec)
controls: Vec<MinimizerControl>,
/// O(1) lookup: minimizer value → index in minimizers/controls vecs
index: rapidhash::RapidHashMap<u64, usize>,
}
impl MinimizersControlMapBuilder {
/// Create a new builder
pub fn new() -> Self {
Self {
minimizers: Vec::new(),
controls: Vec::new(),
index: rapidhash::RapidHashMap::new(),
}
}
/// Add a minimizer with initial control info
pub fn add_minimizer(&mut self, minimizer: u64) -> usize {
if let Some(&pos) = self.index.get(&minimizer) {
return pos;
}
let id = self.minimizers.len();
self.minimizers.push(minimizer);
self.controls.push(MinimizerControl {
count: 0,
bucket_type: BucketType::Regular,
metadata: 0,
});
self.index.insert(minimizer, id);
id
}
/// Increment count for a minimizer
pub fn increment_count(&mut self, minimizer: u64) {
let id = self.add_minimizer(minimizer);
self.controls[id].count = self.controls[id].count.saturating_add(1);
}
/// Set bucket type for a minimizer
pub fn set_bucket_type(&mut self, minimizer: u64, bucket_type: BucketType) {
let id = self.add_minimizer(minimizer);
self.controls[id].bucket_type = bucket_type;
}
/// Get mutable access to control info for a minimizer
pub fn get_control_mut(&mut self, minimizer: u64) -> Option<&mut MinimizerControl> {
self.index
.get(&minimizer)
.copied()
.map(|idx| &mut self.controls[idx])
}
/// Classify bucket types based on count thresholds
pub fn finalize_bucket_types(&mut self, threshold_sparse: u32, threshold_heavy: u32) {
for control in &mut self.controls {
control.bucket_type = if control.count > threshold_heavy {
BucketType::HeavyLoad
} else if control.count > threshold_sparse {
BucketType::Sparse
} else {
BucketType::Regular
};
}
}
/// Build the final MinimizersControlMap with MPHF
///
/// Returns the control map AND a mapping from MPHF index to bucket_id.
/// This mapping is used to reorder control_codewords to MPHF order.
///
/// # Arguments
/// * `c` - Relative level size (percentage, typically 100)
/// * `alpha` - Load factor (kept for C++ API parity, not used by ph)
/// * `partitioned` - Whether to use partitioned MPHF (parallel build)
pub fn build(self, _c: u16, _alpha: f64, partitioned: bool) -> io::Result<(MinimizersControlMap, Vec<usize>)> {
if self.minimizers.is_empty() {
return Ok((MinimizersControlMap {
mphf: None,
num_keys: 0,
}, Vec::new()));
}
let minimizers = self.minimizers;
let controls = self.controls;
let num_keys = minimizers.len() as u64;
info!("Building PHast MPHF for {} minimizers (partitioned={})", num_keys, partitioned);
// Build the MPHF using PartitionedMphf
let mphf = PartitionedMphf::build_from_vec(minimizers.clone(), partitioned);
// Build mapping: bucket_id_by_mphf_index[mphf_index] = bucket_id (= metadata)
let mut bucket_id_by_mphf_index = vec![0usize; controls.len()];
for (idx, minimizer) in minimizers.iter().enumerate() {
let pos = mphf.get(minimizer);
if pos < bucket_id_by_mphf_index.len() {
bucket_id_by_mphf_index[pos] = controls[idx].metadata as usize;
}
}
Ok((MinimizersControlMap {
mphf: Some(mphf),
num_keys,
}, bucket_id_by_mphf_index))
}
/// Get the number of unique minimizers
pub fn num_minimizers(&self) -> usize {
self.minimizers.len()
}
}
impl Default for MinimizersControlMapBuilder {
fn default() -> Self {
Self::new()
}
}
/// Minimizers Control Map
///
/// Associates each unique minimizer with a bucket ID using
/// a minimal perfect hash function for O(1) lookups.
/// After reordering, the MPHF index IS the index into control_codewords.
pub struct MinimizersControlMap {
/// MPHF for minimizer lookup (partitioned PHast)
mphf: Option<PartitionedMphf>,
/// Number of keys in the MPHF
num_keys: u64,
}
impl MinimizersControlMap {
/// Create a MinimizersControlMap from existing parts (for testing)
pub fn from_parts(
_controls: Vec<MinimizerControl>,
minimizers: Vec<u64>,
num_keys: u64,
) -> Self {
// Build MPHF if we have minimizers
let mphf = if !minimizers.is_empty() {
Some(PartitionedMphf::build_from_vec(minimizers, false))
} else {
None
};
Self {
mphf,
num_keys,
}
}
/// Create a MinimizersControlMap from a pre-built MPHF.
///
/// Used by the external sort build path which builds the MPHF directly
/// without going through MinimizersControlMapBuilder.
pub fn from_mphf(mphf: PartitionedMphf, num_keys: u64) -> Self {
Self {
mphf: Some(mphf),
num_keys,
}
}
/// Look up the MPHF index for a minimizer
///
/// Returns the MPHF hash (bucket index into control_codewords), or None
/// if the minimizer is not in the map.
pub fn lookup(&self, minimizer: u64) -> Option<usize> {
if let Some(ref mphf) = self.mphf {
let id = mphf.get(&minimizer);
if id < self.num_keys as usize {
Some(id)
} else {
None
}
} else {
None
}
}
/// Get a reference to the MPHF (for serialization)
pub fn mphf_ref(&self) -> Option<&PartitionedMphf> {
self.mphf.as_ref()
}
/// Set the MPHF (for deserialization)
pub fn set_mphf(&mut self, mphf: Option<PartitionedMphf>) {
self.mphf = mphf;
}
/// Get the number of unique minimizers
pub fn num_minimizers(&self) -> u64 {
self.num_keys
}
/// Get memory usage in bits (approximation)
pub fn num_bits(&self) -> u64 {
if let Some(ref _mphf) = self.mphf {
// Estimate: fmph uses ~3.5-4 bits per key typically
(self.num_keys as f64 * 4.0) as u64
} else {
0
}
}
/// Exact serialized byte size of the main MPHF
pub fn mphf_serialized_bytes(&self) -> usize {
match &self.mphf {
Some(pmphf) => pmphf.write_bytes(),
None => 0,
}
}
/// Serialize control map without MPHF
///
/// Only stores num_keys since control_codewords are now in the index.
pub fn serialize_without_mphf<W: io::Write>(
&self,
writer: &mut W,
) -> io::Result<()> {
// Write number of keys
writer.write_all(&self.num_keys.to_le_bytes())?;
Ok(())
}
/// Deserialize control map without MPHF
///
/// Returns MinimizersControlMap with empty MPHF (must be filled by caller)
pub fn deserialize_without_mphf<R: io::Read>(
reader: &mut R,
) -> io::Result<Self> {
// Read number of keys
let mut num_keys_bytes = [0u8; 8];
reader.read_exact(&mut num_keys_bytes)?;
let num_keys = u64::from_le_bytes(num_keys_bytes);
Ok(Self {
mphf: None,
num_keys,
})
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_minimizers_control_map_builder_creation() {
let builder = MinimizersControlMapBuilder::new();
assert_eq!(builder.num_minimizers(), 0);
}
#[test]
fn test_minimizers_control_map_builder_add() {
let mut builder = MinimizersControlMapBuilder::new();
let id1 = builder.add_minimizer(100);
let id2 = builder.add_minimizer(200);
let id3 = builder.add_minimizer(100); // Duplicate
assert_eq!(id1, 0);
assert_eq!(id2, 1);
assert_eq!(id3, 0); // Same as first
assert_eq!(builder.num_minimizers(), 2);
}
#[test]
fn test_minimizers_control_map_builder_increment() {
let mut builder = MinimizersControlMapBuilder::new();
builder.increment_count(100);
builder.increment_count(100);
builder.increment_count(200);
assert_eq!(builder.num_minimizers(), 2);
assert_eq!(builder.controls[0].count, 2);
assert_eq!(builder.controls[1].count, 1);
}
#[test]
fn test_minimizers_control_map_builder_bucket_type() {
let mut builder = MinimizersControlMapBuilder::new();
builder.add_minimizer(100);
builder.set_bucket_type(100, BucketType::Sparse);
assert_eq!(builder.controls[0].bucket_type, BucketType::Sparse);
}
#[test]
fn test_minimizers_control_map_builder_finalize() {
let mut builder = MinimizersControlMapBuilder::new();
builder.add_minimizer(100);
builder.controls[0].count = 5;
builder.add_minimizer(200);
builder.controls[1].count = 15;
builder.add_minimizer(300);
builder.controls[2].count = 150;
builder.finalize_bucket_types(10, 100);
assert_eq!(builder.controls[0].bucket_type, BucketType::Regular);
assert_eq!(builder.controls[1].bucket_type, BucketType::Sparse);
assert_eq!(builder.controls[2].bucket_type, BucketType::HeavyLoad);
}
#[test]
fn test_minimizers_control_map_build_empty() {
let builder = MinimizersControlMapBuilder::new();
let (mcm, mapping) = builder.build(100, 0.94, false).unwrap();
assert_eq!(mcm.num_minimizers(), 0);
assert!(mcm.mphf.is_none());
assert!(mapping.is_empty());
}
#[test]
fn test_minimizers_control_map_build_and_lookup() {
let mut builder = MinimizersControlMapBuilder::new();
builder.increment_count(100);
builder.increment_count(100);
builder.increment_count(200);
builder.set_bucket_type(100, BucketType::Sparse);
let (mcm, _mapping) = builder.build(100, 0.94, false).unwrap();
assert_eq!(mcm.num_minimizers(), 2);
// lookup now returns Option<usize> (MPHF index)
let idx_100 = mcm.lookup(100).unwrap();
assert!(idx_100 < 2); // Valid MPHF index
let idx_200 = mcm.lookup(200).unwrap();
assert!(idx_200 < 2); // Valid MPHF index
assert_ne!(idx_100, idx_200); // Different minimizers get different indices
}
#[test]
fn test_minimizers_control_map_lookup_missing() {
let mut builder = MinimizersControlMapBuilder::new();
builder.increment_count(100);
let (mcm, _) = builder.build(100, 0.94, false).unwrap();
// 300 was never added
let _result = mcm.lookup(300);
// Note: MPHF might return a value even for keys not in the original set,
// but it should be out of bounds or not match expected semantics
// For a true membership test, we'd need a separate bloom filter or similar
}
#[test]
fn test_bucket_type_default() {
let bucket_type = BucketType::default();
assert_eq!(bucket_type, BucketType::Regular);
}
#[test]
fn test_minimizer_control_default() {
let control = MinimizerControl {
count: 5,
bucket_type: BucketType::Regular,
metadata: 0,
};
assert_eq!(control.count, 5);
assert_eq!(control.bucket_type, BucketType::Regular);
}
}