1use crate::builder::minimizer_tuples::MinimizerTuple;
7use crate::constants::{INVALID_UINT64, MIN_L};
8use tracing::info;
9
10pub const MIN_BUCKET_SIZE: usize = 1 << MIN_L; #[derive(Debug, Clone)]
15pub struct BucketStatistics {
16 pub num_buckets: u64,
18
19 pub num_kmers: u64,
21
22 pub num_positions: u64,
24
25 pub num_singleton_buckets: u64,
27
28 pub num_light_buckets: u64,
30
31 pub num_heavy_buckets: u64,
33
34 pub max_bucket_size: usize,
36
37 pub num_positions_in_light: u64,
39
40 pub num_positions_in_heavy: u64,
42
43 pub num_super_kmers_in_non_singleton: u64,
45}
46
47impl BucketStatistics {
48 pub fn new() -> Self {
50 Self {
51 num_buckets: 0,
52 num_kmers: 0,
53 num_positions: 0,
54 num_singleton_buckets: 0,
55 num_light_buckets: 0,
56 num_heavy_buckets: 0,
57 max_bucket_size: 0,
58 num_positions_in_light: 0,
59 num_positions_in_heavy: 0,
60 num_super_kmers_in_non_singleton: 0,
61 }
62 }
63
64 pub fn add_bucket(&mut self, bucket: &[MinimizerTuple]) {
66 self.num_buckets += 1;
67 let mut bucket_size: usize = 0;
68 let mut prev_pos_in_seq = INVALID_UINT64;
69 for tuple in bucket {
70 if tuple.pos_in_seq != prev_pos_in_seq {
71 bucket_size += 1;
72 prev_pos_in_seq = tuple.pos_in_seq;
73 }
74 }
75
76 if bucket_size > self.max_bucket_size {
77 self.max_bucket_size = bucket_size;
78 }
79
80 let kmers_in_bucket: u64 = bucket.iter()
82 .map(|mt| mt.num_kmers_in_super_kmer as u64)
83 .sum();
84 self.num_kmers += kmers_in_bucket;
85
86 match bucket_size {
88 1 => {
89 self.num_singleton_buckets += 1;
90 self.num_positions += 1;
91 }
92 2..=MIN_BUCKET_SIZE => {
93 self.num_light_buckets += 1;
94 self.num_positions_in_light += bucket_size as u64;
95 self.num_positions += bucket_size as u64;
96 self.num_super_kmers_in_non_singleton += bucket.len() as u64;
97 }
98 _ => {
99 self.num_heavy_buckets += 1;
100 self.num_positions_in_heavy += bucket_size as u64;
101 self.num_positions += bucket_size as u64;
102 self.num_super_kmers_in_non_singleton += bucket.len() as u64;
103 }
104 }
105 }
106
107 pub fn print_summary(&self) {
109 info!("Bucket Statistics:");
110 info!(" Total buckets: {}", self.num_buckets);
111 info!(" Total k-mers: {}", self.num_kmers);
112 info!(" Total positions: {}", self.num_positions);
113
114 info!(" Singleton buckets: {} ({:.2}%)",
115 self.num_singleton_buckets,
116 (self.num_singleton_buckets as f64 * 100.0) / self.num_buckets as f64);
117
118 info!(" Light buckets (2-{}): {} ({:.2}%)",
119 MIN_BUCKET_SIZE,
120 self.num_light_buckets,
121 (self.num_light_buckets as f64 * 100.0) / self.num_buckets as f64);
122
123 info!(" Heavy buckets (>{}): {} ({:.2}%)",
124 MIN_BUCKET_SIZE,
125 self.num_heavy_buckets,
126 (self.num_heavy_buckets as f64 * 100.0) / self.num_buckets as f64);
127
128 info!(" Max bucket size: {}", self.max_bucket_size);
129 info!(" Positions in light buckets: {} ({:.2}%)",
130 self.num_positions_in_light,
131 (self.num_positions_in_light as f64 * 100.0) / self.num_positions as f64);
132 info!(" Positions in heavy buckets: {} ({:.2}%)",
133 self.num_positions_in_heavy,
134 (self.num_positions_in_heavy as f64 * 100.0) / self.num_positions as f64);
135 }
136}
137
138impl Default for BucketStatistics {
139 fn default() -> Self {
140 Self::new()
141 }
142}
143
144#[derive(Debug, Clone, Copy, PartialEq, Eq)]
146pub enum BucketType {
147 Singleton,
149 Light,
151 Heavy,
153}
154
155impl BucketType {
156 pub fn from_bucket_size(size: usize) -> Self {
158 match size {
159 1 => BucketType::Singleton,
160 2..=MIN_BUCKET_SIZE => BucketType::Light,
161 _ => BucketType::Heavy,
162 }
163 }
164}
165
166#[derive(Debug, Clone)]
168pub struct Bucket {
169 pub minimizer: u64,
171
172 pub tuples: Vec<MinimizerTuple>,
174
175 pub cached_size: usize,
177
178 pub bucket_type: BucketType,
180}
181
182impl Bucket {
183 pub fn new(minimizer: u64, tuples: Vec<MinimizerTuple>) -> Self {
185 let mut bucket_size: usize = 0;
186 let mut prev_pos_in_seq = INVALID_UINT64;
187 for tuple in &tuples {
188 if tuple.pos_in_seq != prev_pos_in_seq {
189 bucket_size += 1;
190 prev_pos_in_seq = tuple.pos_in_seq;
191 }
192 }
193 let bucket_type = BucketType::from_bucket_size(bucket_size);
194 Self {
195 minimizer,
196 tuples,
197 cached_size: bucket_size,
198 bucket_type,
199 }
200 }
201
202 #[inline]
204 pub fn size(&self) -> usize {
205 self.cached_size
206 }
207
208 #[inline]
210 pub fn bucket_type(&self) -> BucketType {
211 self.bucket_type
212 }
213}
214
215pub fn classify_into_buckets(tuples: Vec<MinimizerTuple>) -> Vec<Bucket> {
226 if tuples.is_empty() {
227 return Vec::new();
228 }
229
230 let mut buckets = Vec::new();
231 let mut current_minimizer = tuples[0].minimizer;
232 let mut current_bucket_tuples = Vec::new();
233
234 for tuple in tuples {
235 if tuple.minimizer != current_minimizer {
236 buckets.push(Bucket::new(current_minimizer, current_bucket_tuples));
238 current_minimizer = tuple.minimizer;
239 current_bucket_tuples = Vec::new();
240 }
241 current_bucket_tuples.push(tuple);
242 }
243
244 if !current_bucket_tuples.is_empty() {
246 buckets.push(Bucket::new(current_minimizer, current_bucket_tuples));
247 }
248
249 buckets
250}
251
252#[derive(Debug, Clone, Copy)]
261pub struct BucketRef {
262 pub minimizer: u64,
264 pub start: usize,
266 pub len: usize,
268 pub cached_size: usize,
270 pub bucket_type: BucketType,
272}
273
274pub struct ClassifiedBuckets {
281 pub tuples: Vec<MinimizerTuple>,
283 pub bucket_refs: Vec<BucketRef>,
285}
286
287impl ClassifiedBuckets {
288 #[inline]
290 pub fn num_buckets(&self) -> usize {
291 self.bucket_refs.len()
292 }
293
294 #[inline]
296 pub fn bucket_tuples(&self, idx: usize) -> &[MinimizerTuple] {
297 let bref = &self.bucket_refs[idx];
298 &self.tuples[bref.start..bref.start + bref.len]
299 }
300}
301
302pub fn classify_into_buckets_inplace(tuples: Vec<MinimizerTuple>) -> ClassifiedBuckets {
308 if tuples.is_empty() {
309 return ClassifiedBuckets {
310 tuples,
311 bucket_refs: Vec::new(),
312 };
313 }
314
315 let mut bucket_refs = Vec::new();
316 let mut start = 0usize;
317 let mut current_minimizer = tuples[0].minimizer;
318
319 for i in 1..tuples.len() {
320 if tuples[i].minimizer != current_minimizer {
321 let len = i - start;
322 let (cached_size, bucket_type) =
323 compute_bucket_size_from_slice(&tuples[start..i]);
324 bucket_refs.push(BucketRef {
325 minimizer: current_minimizer,
326 start,
327 len,
328 cached_size,
329 bucket_type,
330 });
331 current_minimizer = tuples[i].minimizer;
332 start = i;
333 }
334 }
335
336 let len = tuples.len() - start;
338 let (cached_size, bucket_type) =
339 compute_bucket_size_from_slice(&tuples[start..]);
340 bucket_refs.push(BucketRef {
341 minimizer: current_minimizer,
342 start,
343 len,
344 cached_size,
345 bucket_type,
346 });
347
348 ClassifiedBuckets {
349 tuples,
350 bucket_refs,
351 }
352}
353
354fn compute_bucket_size_from_slice(tuples: &[MinimizerTuple]) -> (usize, BucketType) {
356 let mut bucket_size: usize = 0;
357 let mut prev_pos_in_seq = INVALID_UINT64;
358 for tuple in tuples {
359 if tuple.pos_in_seq != prev_pos_in_seq {
360 bucket_size += 1;
361 prev_pos_in_seq = tuple.pos_in_seq;
362 }
363 }
364 (bucket_size, BucketType::from_bucket_size(bucket_size))
365}
366
367#[cfg(test)]
368mod tests {
369 use super::*;
370
371 #[test]
372 fn test_bucket_type_classification() {
373 assert_eq!(BucketType::from_bucket_size(1), BucketType::Singleton);
374 assert_eq!(BucketType::from_bucket_size(2), BucketType::Light);
375 assert_eq!(BucketType::from_bucket_size(64), BucketType::Light);
376 assert_eq!(BucketType::from_bucket_size(65), BucketType::Heavy);
377 assert_eq!(BucketType::from_bucket_size(1000), BucketType::Heavy);
378 }
379
380 #[test]
381 fn test_classify_into_buckets_empty() {
382 let tuples = Vec::new();
383 let buckets = classify_into_buckets(tuples);
384 assert_eq!(buckets.len(), 0);
385 }
386
387 #[test]
388 fn test_classify_into_buckets_single() {
389 let tuples = vec![
390 MinimizerTuple::new(100, 50, 0),
391 ];
392 let buckets = classify_into_buckets(tuples);
393 assert_eq!(buckets.len(), 1);
394 assert_eq!(buckets[0].minimizer, 100);
395 assert_eq!(buckets[0].size(), 1);
396 assert_eq!(buckets[0].bucket_type(), BucketType::Singleton);
397 }
398
399 #[test]
400 fn test_classify_into_buckets_multiple() {
401 let tuples = vec![
402 MinimizerTuple::new(100, 50, 0),
403 MinimizerTuple::new(100, 51, 0),
404 MinimizerTuple::new(200, 100, 0),
405 MinimizerTuple::new(300, 150, 0),
406 MinimizerTuple::new(300, 151, 0),
407 MinimizerTuple::new(300, 152, 0),
408 ];
409
410 let buckets = classify_into_buckets(tuples);
411 assert_eq!(buckets.len(), 3);
412
413 assert_eq!(buckets[0].minimizer, 100);
414 assert_eq!(buckets[0].size(), 2);
415 assert_eq!(buckets[0].bucket_type(), BucketType::Light);
416
417 assert_eq!(buckets[1].minimizer, 200);
418 assert_eq!(buckets[1].size(), 1);
419 assert_eq!(buckets[1].bucket_type(), BucketType::Singleton);
420
421 assert_eq!(buckets[2].minimizer, 300);
422 assert_eq!(buckets[2].size(), 3);
423 assert_eq!(buckets[2].bucket_type(), BucketType::Light);
424 }
425
426 #[test]
427 fn test_bucket_statistics() {
428 let mut stats = BucketStatistics::new();
429
430 let bucket1 = vec![MinimizerTuple::new(100, 50, 0)];
432 stats.add_bucket(&bucket1);
433
434 let bucket2 = vec![
436 MinimizerTuple::new(200, 100, 0),
437 MinimizerTuple::new(200, 101, 0),
438 ];
439 stats.add_bucket(&bucket2);
440
441 let mut bucket3 = Vec::new();
443 for i in 0..65 {
444 bucket3.push(MinimizerTuple::new(300, 200 + i, 0));
445 }
446 stats.add_bucket(&bucket3);
447
448 assert_eq!(stats.num_buckets, 3);
449 assert_eq!(stats.num_singleton_buckets, 1);
450 assert_eq!(stats.num_light_buckets, 1);
451 assert_eq!(stats.num_heavy_buckets, 1);
452 assert_eq!(stats.max_bucket_size, 65);
453 }
454}