// sshash_lib/builder/buckets.rs
1//! Bucket classification and statistics
2//!
3//! This module classifies minimizer buckets into singleton, light (sparse),
4//! and heavy (skew) categories based on bucket size.
5
6use crate::builder::minimizer_tuples::MinimizerTuple;
7use crate::constants::{INVALID_UINT64, MIN_L};
8use tracing::info;
9
/// Bucket size threshold between light and heavy buckets.
///
/// Buckets with at most this many unique super-k-mer positions are "light"
/// (sparse index); larger buckets are "heavy" (skew index).
pub const MIN_BUCKET_SIZE: usize = 1 << MIN_L; // 2^MIN_L — 64 assuming MIN_L = 6 (defined in crate::constants; confirm)
12
/// Statistics about bucket sizes and distribution.
///
/// Populated incrementally via [`BucketStatistics::add_bucket`]; a "bucket"
/// is the set of minimizer tuples sharing one minimizer value, and its size
/// is the number of distinct `pos_in_seq` values it contains.
#[derive(Debug, Clone)]
pub struct BucketStatistics {
    /// Total number of buckets (unique minimizers)
    pub num_buckets: u64,

    /// Total number of k-mers (sum of `num_kmers_in_super_kmer` over all tuples)
    pub num_kmers: u64,

    /// Total number of minimizer positions (sum of bucket sizes)
    pub num_positions: u64,

    /// Number of singleton buckets (size == 1)
    pub num_singleton_buckets: u64,

    /// Number of light buckets (1 < size <= MIN_BUCKET_SIZE)
    pub num_light_buckets: u64,

    /// Number of heavy buckets (size > MIN_BUCKET_SIZE)
    pub num_heavy_buckets: u64,

    /// Maximum observed bucket size
    pub max_bucket_size: usize,

    /// Total positions in light buckets
    pub num_positions_in_light: u64,

    /// Total positions in heavy buckets
    pub num_positions_in_heavy: u64,

    /// Total super-k-mers (tuples) in buckets larger than 1; singletons excluded
    pub num_super_kmers_in_non_singleton: u64,
}
46
47impl BucketStatistics {
48    /// Create a new statistics tracker
49    pub fn new() -> Self {
50        Self {
51            num_buckets: 0,
52            num_kmers: 0,
53            num_positions: 0,
54            num_singleton_buckets: 0,
55            num_light_buckets: 0,
56            num_heavy_buckets: 0,
57            max_bucket_size: 0,
58            num_positions_in_light: 0,
59            num_positions_in_heavy: 0,
60            num_super_kmers_in_non_singleton: 0,
61        }
62    }
63    
64    /// Record statistics for a bucket
65    pub fn add_bucket(&mut self, bucket: &[MinimizerTuple]) {
66        self.num_buckets += 1;
67        let mut bucket_size: usize = 0;
68        let mut prev_pos_in_seq = INVALID_UINT64;
69        for tuple in bucket {
70            if tuple.pos_in_seq != prev_pos_in_seq {
71                bucket_size += 1;
72                prev_pos_in_seq = tuple.pos_in_seq;
73            }
74        }
75        
76        if bucket_size > self.max_bucket_size {
77            self.max_bucket_size = bucket_size;
78        }
79        
80        // Count total k-mers in this bucket
81        let kmers_in_bucket: u64 = bucket.iter()
82            .map(|mt| mt.num_kmers_in_super_kmer as u64)
83            .sum();
84        self.num_kmers += kmers_in_bucket;
85        
86        // Classify bucket
87        match bucket_size {
88            1 => {
89                self.num_singleton_buckets += 1;
90                self.num_positions += 1;
91            }
92            2..=MIN_BUCKET_SIZE => {
93                self.num_light_buckets += 1;
94                self.num_positions_in_light += bucket_size as u64;
95                self.num_positions += bucket_size as u64;
96                self.num_super_kmers_in_non_singleton += bucket.len() as u64;
97            }
98            _ => {
99                self.num_heavy_buckets += 1;
100                self.num_positions_in_heavy += bucket_size as u64;
101                self.num_positions += bucket_size as u64;
102                self.num_super_kmers_in_non_singleton += bucket.len() as u64;
103            }
104        }
105    }
106    
107    /// Log statistics summary via tracing
108    pub fn print_summary(&self) {
109        info!("Bucket Statistics:");
110        info!("  Total buckets: {}", self.num_buckets);
111        info!("  Total k-mers: {}", self.num_kmers);
112        info!("  Total positions: {}", self.num_positions);
113        
114        info!("  Singleton buckets: {} ({:.2}%)", 
115                 self.num_singleton_buckets,
116                 (self.num_singleton_buckets as f64 * 100.0) / self.num_buckets as f64);
117        
118        info!("  Light buckets (2-{}): {} ({:.2}%)", 
119                 MIN_BUCKET_SIZE,
120                 self.num_light_buckets,
121                 (self.num_light_buckets as f64 * 100.0) / self.num_buckets as f64);
122        
123        info!("  Heavy buckets (>{}): {} ({:.2}%)", 
124                 MIN_BUCKET_SIZE,
125                 self.num_heavy_buckets,
126                 (self.num_heavy_buckets as f64 * 100.0) / self.num_buckets as f64);
127        
128        info!("  Max bucket size: {}", self.max_bucket_size);
129        info!("  Positions in light buckets: {} ({:.2}%)",
130                 self.num_positions_in_light,
131                 (self.num_positions_in_light as f64 * 100.0) / self.num_positions as f64);
132        info!("  Positions in heavy buckets: {} ({:.2}%)",
133                 self.num_positions_in_heavy,
134                 (self.num_positions_in_heavy as f64 * 100.0) / self.num_positions as f64);
135    }
136}
137
138impl Default for BucketStatistics {
139    fn default() -> Self {
140        Self::new()
141    }
142}
143
/// Bucket type classification.
///
/// The category is determined solely by the bucket's number of unique
/// super-k-mer positions; see [`BucketType::from_bucket_size`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum BucketType {
    /// Singleton bucket (size == 1)
    Singleton,
    /// Light bucket (1 < size <= MIN_BUCKET_SIZE) - goes into sparse index
    Light,
    /// Heavy bucket (size > MIN_BUCKET_SIZE) - goes into skew index
    Heavy,
}
154
155impl BucketType {
156    /// Classify a bucket by its size
157    pub fn from_bucket_size(size: usize) -> Self {
158        match size {
159            1 => BucketType::Singleton,
160            2..=MIN_BUCKET_SIZE => BucketType::Light,
161            _ => BucketType::Heavy,
162        }
163    }
164}
165
/// A bucket of minimizer tuples sharing the same minimizer value.
///
/// Owns its tuples; for a zero-copy alternative see [`BucketRef`] /
/// [`ClassifiedBuckets`].
#[derive(Debug, Clone)]
pub struct Bucket {
    /// The minimizer value for this bucket
    pub minimizer: u64,

    /// All tuples with this minimizer
    pub tuples: Vec<MinimizerTuple>,

    /// Cached number of unique super-kmers (distinct pos_in_seq values),
    /// computed once in [`Bucket::new`]
    pub cached_size: usize,

    /// Bucket type classification derived from `cached_size`
    pub bucket_type: BucketType,
}
181
182impl Bucket {
183    /// Create a new bucket
184    pub fn new(minimizer: u64, tuples: Vec<MinimizerTuple>) -> Self {
185        let mut bucket_size: usize = 0;
186        let mut prev_pos_in_seq = INVALID_UINT64;
187        for tuple in &tuples {
188            if tuple.pos_in_seq != prev_pos_in_seq {
189                bucket_size += 1;
190                prev_pos_in_seq = tuple.pos_in_seq;
191            }
192        }
193        let bucket_type = BucketType::from_bucket_size(bucket_size);
194        Self {
195            minimizer,
196            tuples,
197            cached_size: bucket_size,
198            bucket_type,
199        }
200    }
201
202    /// Get the bucket size (number of unique super-kmers)
203    #[inline]
204    pub fn size(&self) -> usize {
205        self.cached_size
206    }
207
208    /// Get the bucket type
209    #[inline]
210    pub fn bucket_type(&self) -> BucketType {
211        self.bucket_type
212    }
213}
214
215/// Classify sorted minimizer tuples into buckets by minimizer value
216///
217/// Takes a sorted vector of MinimizerTuples and groups them by minimizer,
218/// creating one bucket per unique minimizer value.
219///
220/// # Arguments
221/// * `tuples` - Sorted minimizer tuples (sorted by minimizer, then pos_in_seq)
222///
223/// # Returns
224/// A vector of buckets, one per unique minimizer
225pub fn classify_into_buckets(tuples: Vec<MinimizerTuple>) -> Vec<Bucket> {
226    if tuples.is_empty() {
227        return Vec::new();
228    }
229
230    let mut buckets = Vec::new();
231    let mut current_minimizer = tuples[0].minimizer;
232    let mut current_bucket_tuples = Vec::new();
233
234    for tuple in tuples {
235        if tuple.minimizer != current_minimizer {
236            // Start new bucket
237            buckets.push(Bucket::new(current_minimizer, current_bucket_tuples));
238            current_minimizer = tuple.minimizer;
239            current_bucket_tuples = Vec::new();
240        }
241        current_bucket_tuples.push(tuple);
242    }
243
244    // Push the last bucket
245    if !current_bucket_tuples.is_empty() {
246        buckets.push(Bucket::new(current_minimizer, current_bucket_tuples));
247    }
248
249    buckets
250}
251
252// ---------------------------------------------------------------------------
253// In-place (zero-copy) bucket classification
254// ---------------------------------------------------------------------------
255
/// Lightweight bucket descriptor referencing a range in a sorted tuples array.
///
/// Unlike [`Bucket`], this does not own the tuples — it references them by
/// index into the parent [`ClassifiedBuckets::tuples`] vector. The described
/// range is `tuples[start..start + len]`.
#[derive(Debug, Clone, Copy)]
pub struct BucketRef {
    /// The minimizer value for this bucket.
    pub minimizer: u64,
    /// Start index (inclusive) in the tuples array.
    pub start: usize,
    /// Number of tuples in this bucket.
    pub len: usize,
    /// Number of unique super-kmers (distinct `pos_in_seq` values).
    pub cached_size: usize,
    /// Bucket type classification derived from `cached_size`.
    pub bucket_type: BucketType,
}
273
/// In-place classified buckets: owns a sorted tuple array plus lightweight
/// bucket descriptors.
///
/// This avoids the memory duplication of [`classify_into_buckets`] which
/// moves every tuple into per-bucket `Vec`s (briefly doubling memory for
/// the tuple payload). Built by [`classify_into_buckets_inplace`].
pub struct ClassifiedBuckets {
    /// The original sorted tuples — never moved or copied.
    pub tuples: Vec<MinimizerTuple>,
    /// One descriptor per unique minimizer, in the same order as the tuples.
    pub bucket_refs: Vec<BucketRef>,
}
286
287impl ClassifiedBuckets {
288    /// Number of buckets (unique minimizers).
289    #[inline]
290    pub fn num_buckets(&self) -> usize {
291        self.bucket_refs.len()
292    }
293
294    /// Get the tuple slice for bucket at index `idx`.
295    #[inline]
296    pub fn bucket_tuples(&self, idx: usize) -> &[MinimizerTuple] {
297        let bref = &self.bucket_refs[idx];
298        &self.tuples[bref.start..bref.start + bref.len]
299    }
300}
301
302/// Classify sorted minimizer tuples into buckets **in place**, without
303/// copying or moving the tuple data.
304///
305/// Returns a [`ClassifiedBuckets`] that owns the original tuples vector
306/// and provides lightweight [`BucketRef`] descriptors.
307pub fn classify_into_buckets_inplace(tuples: Vec<MinimizerTuple>) -> ClassifiedBuckets {
308    if tuples.is_empty() {
309        return ClassifiedBuckets {
310            tuples,
311            bucket_refs: Vec::new(),
312        };
313    }
314
315    let mut bucket_refs = Vec::new();
316    let mut start = 0usize;
317    let mut current_minimizer = tuples[0].minimizer;
318
319    for i in 1..tuples.len() {
320        if tuples[i].minimizer != current_minimizer {
321            let len = i - start;
322            let (cached_size, bucket_type) =
323                compute_bucket_size_from_slice(&tuples[start..i]);
324            bucket_refs.push(BucketRef {
325                minimizer: current_minimizer,
326                start,
327                len,
328                cached_size,
329                bucket_type,
330            });
331            current_minimizer = tuples[i].minimizer;
332            start = i;
333        }
334    }
335
336    // Last bucket
337    let len = tuples.len() - start;
338    let (cached_size, bucket_type) =
339        compute_bucket_size_from_slice(&tuples[start..]);
340    bucket_refs.push(BucketRef {
341        minimizer: current_minimizer,
342        start,
343        len,
344        cached_size,
345        bucket_type,
346    });
347
348    ClassifiedBuckets {
349        tuples,
350        bucket_refs,
351    }
352}
353
354/// Count unique super-kmers and classify a bucket from a tuple slice.
355fn compute_bucket_size_from_slice(tuples: &[MinimizerTuple]) -> (usize, BucketType) {
356    let mut bucket_size: usize = 0;
357    let mut prev_pos_in_seq = INVALID_UINT64;
358    for tuple in tuples {
359        if tuple.pos_in_seq != prev_pos_in_seq {
360            bucket_size += 1;
361            prev_pos_in_seq = tuple.pos_in_seq;
362        }
363    }
364    (bucket_size, BucketType::from_bucket_size(bucket_size))
365}
366
#[cfg(test)]
mod tests {
    use super::*;

    // Classification thresholds: exactly 1 is singleton, 2..=MIN_BUCKET_SIZE
    // (64, assuming MIN_L = 6) is light, anything above is heavy.
    #[test]
    fn test_bucket_type_classification() {
        assert_eq!(BucketType::from_bucket_size(1), BucketType::Singleton);
        assert_eq!(BucketType::from_bucket_size(2), BucketType::Light);
        assert_eq!(BucketType::from_bucket_size(64), BucketType::Light);
        assert_eq!(BucketType::from_bucket_size(65), BucketType::Heavy);
        assert_eq!(BucketType::from_bucket_size(1000), BucketType::Heavy);
    }

    // An empty tuple list must produce no buckets rather than panicking.
    #[test]
    fn test_classify_into_buckets_empty() {
        let tuples = Vec::new();
        let buckets = classify_into_buckets(tuples);
        assert_eq!(buckets.len(), 0);
    }

    // One tuple -> one singleton bucket carrying its minimizer.
    #[test]
    fn test_classify_into_buckets_single() {
        let tuples = vec![
            MinimizerTuple::new(100, 50, 0),
        ];
        let buckets = classify_into_buckets(tuples);
        assert_eq!(buckets.len(), 1);
        assert_eq!(buckets[0].minimizer, 100);
        assert_eq!(buckets[0].size(), 1);
        assert_eq!(buckets[0].bucket_type(), BucketType::Singleton);
    }

    // Three minimizer runs (sizes 2, 1, 3) -> three buckets in input order;
    // size counts distinct pos_in_seq values, all distinct here.
    #[test]
    fn test_classify_into_buckets_multiple() {
        let tuples = vec![
            MinimizerTuple::new(100, 50, 0),
            MinimizerTuple::new(100, 51, 0),
            MinimizerTuple::new(200, 100, 0),
            MinimizerTuple::new(300, 150, 0),
            MinimizerTuple::new(300, 151, 0),
            MinimizerTuple::new(300, 152, 0),
        ];

        let buckets = classify_into_buckets(tuples);
        assert_eq!(buckets.len(), 3);

        assert_eq!(buckets[0].minimizer, 100);
        assert_eq!(buckets[0].size(), 2);
        assert_eq!(buckets[0].bucket_type(), BucketType::Light);

        assert_eq!(buckets[1].minimizer, 200);
        assert_eq!(buckets[1].size(), 1);
        assert_eq!(buckets[1].bucket_type(), BucketType::Singleton);

        assert_eq!(buckets[2].minimizer, 300);
        assert_eq!(buckets[2].size(), 3);
        assert_eq!(buckets[2].bucket_type(), BucketType::Light);
    }

    // One bucket of each category; max_bucket_size tracks the heavy one.
    #[test]
    fn test_bucket_statistics() {
        let mut stats = BucketStatistics::new();

        // Add a singleton bucket
        let bucket1 = vec![MinimizerTuple::new(100, 50, 0)];
        stats.add_bucket(&bucket1);

        // Add a light bucket
        let bucket2 = vec![
            MinimizerTuple::new(200, 100, 0),
            MinimizerTuple::new(200, 101, 0),
        ];
        stats.add_bucket(&bucket2);

        // Add a heavy bucket (65 tuples, 65 distinct positions > MIN_BUCKET_SIZE)
        let mut bucket3 = Vec::new();
        for i in 0..65 {
            bucket3.push(MinimizerTuple::new(300, 200 + i, 0));
        }
        stats.add_bucket(&bucket3);

        assert_eq!(stats.num_buckets, 3);
        assert_eq!(stats.num_singleton_buckets, 1);
        assert_eq!(stats.num_light_buckets, 1);
        assert_eq!(stats.num_heavy_buckets, 1);
        assert_eq!(stats.max_bucket_size, 65);
    }
}
454}