use crate::builder::minimizer_tuples::MinimizerTuple;
use crate::constants::{INVALID_UINT64, MIN_L};
use tracing::info;
/// Size threshold separating light buckets from heavy ones, computed as
/// `1 << MIN_L`.
///
/// Despite the name, this is the *largest* size still classified as light:
/// sizes in `2..=MIN_BUCKET_SIZE` are light and anything above is heavy.
/// NOTE(review): the unit tests in this file hard-code 64/65 as the boundary,
/// i.e. they assume `MIN_L == 6` — confirm against `crate::constants`.
pub const MIN_BUCKET_SIZE: usize = 1 << MIN_L;
/// Running totals describing the size distribution of minimizer buckets.
///
/// A bucket's *size* is the number of runs of equal `pos_in_seq` values among
/// its tuples, which may be smaller than the number of tuples it holds.
#[derive(Clone, Debug)]
pub struct BucketStatistics {
    /// Number of buckets recorded so far.
    pub num_buckets: u64,
    /// Sum of `num_kmers_in_super_kmer` over every tuple of every recorded bucket.
    pub num_kmers: u64,
    /// Sum of the sizes of all recorded buckets.
    pub num_positions: u64,
    /// Buckets whose size is exactly 1.
    pub num_singleton_buckets: u64,
    /// Buckets whose size lies in `2..=MIN_BUCKET_SIZE`.
    pub num_light_buckets: u64,
    /// Buckets that are neither singleton nor light.
    pub num_heavy_buckets: u64,
    /// Largest bucket size observed.
    pub max_bucket_size: usize,
    /// Sum of the sizes of all light buckets.
    pub num_positions_in_light: u64,
    /// Sum of the sizes of all heavy buckets.
    pub num_positions_in_heavy: u64,
    /// Total number of tuples held by light and heavy buckets.
    pub num_super_kmers_in_non_singleton: u64,
}
impl BucketStatistics {
    /// Creates an accumulator with every counter set to zero.
    pub fn new() -> Self {
        Self {
            num_buckets: 0,
            num_kmers: 0,
            num_positions: 0,
            num_singleton_buckets: 0,
            num_light_buckets: 0,
            num_heavy_buckets: 0,
            max_bucket_size: 0,
            num_positions_in_light: 0,
            num_positions_in_heavy: 0,
            num_super_kmers_in_non_singleton: 0,
        }
    }

    /// Folds one bucket into the running totals.
    ///
    /// The bucket's size is the number of runs of equal `pos_in_seq` values in
    /// `bucket`; only *adjacent* duplicates are collapsed.
    /// NOTE(review): this presumably relies on the tuples being ordered by
    /// `pos_in_seq` — confirm against the caller.
    ///
    /// An empty slice is ignored entirely. Previously it bumped `num_buckets`
    /// and was tallied as a heavy bucket, skewing the reported percentages.
    pub fn add_bucket(&mut self, bucket: &[MinimizerTuple]) {
        if bucket.is_empty() {
            return;
        }
        self.num_buckets += 1;

        // Count runs of identical positions; the sentinel guarantees the
        // first real position opens a run.
        let mut bucket_size: usize = 0;
        let mut prev_pos_in_seq = INVALID_UINT64;
        for tuple in bucket {
            if tuple.pos_in_seq != prev_pos_in_seq {
                bucket_size += 1;
                prev_pos_in_seq = tuple.pos_in_seq;
            }
        }
        self.max_bucket_size = self.max_bucket_size.max(bucket_size);

        let kmers_in_bucket: u64 = bucket
            .iter()
            .map(|mt| mt.num_kmers_in_super_kmer as u64)
            .sum();
        self.num_kmers += kmers_in_bucket;

        // Every class contributes its size to the global position count;
        // only non-singleton buckets contribute their tuple count.
        let positions = bucket_size as u64;
        self.num_positions += positions;
        if bucket_size != 1 {
            self.num_super_kmers_in_non_singleton += bucket.len() as u64;
        }

        match bucket_size {
            1 => self.num_singleton_buckets += 1,
            2..=MIN_BUCKET_SIZE => {
                self.num_light_buckets += 1;
                self.num_positions_in_light += positions;
            }
            _ => {
                self.num_heavy_buckets += 1;
                self.num_positions_in_heavy += positions;
            }
        }
    }

    /// Logs a human-readable summary of all counters through `tracing::info!`.
    ///
    /// Percentages are reported as `0.00` when their denominator is zero
    /// (no buckets or no positions recorded) instead of `NaN`.
    pub fn print_summary(&self) {
        /// `part` as a percentage of `total`, or 0.0 when `total` is zero.
        fn percent(part: u64, total: u64) -> f64 {
            if total == 0 {
                0.0
            } else {
                (part as f64 * 100.0) / total as f64
            }
        }

        info!("Bucket Statistics:");
        info!(" Total buckets: {}", self.num_buckets);
        info!(" Total k-mers: {}", self.num_kmers);
        info!(" Total positions: {}", self.num_positions);
        info!(
            " Singleton buckets: {} ({:.2}%)",
            self.num_singleton_buckets,
            percent(self.num_singleton_buckets, self.num_buckets)
        );
        info!(
            " Light buckets (2-{}): {} ({:.2}%)",
            MIN_BUCKET_SIZE,
            self.num_light_buckets,
            percent(self.num_light_buckets, self.num_buckets)
        );
        info!(
            " Heavy buckets (>{}): {} ({:.2}%)",
            MIN_BUCKET_SIZE,
            self.num_heavy_buckets,
            percent(self.num_heavy_buckets, self.num_buckets)
        );
        info!(" Max bucket size: {}", self.max_bucket_size);
        info!(
            " Positions in light buckets: {} ({:.2}%)",
            self.num_positions_in_light,
            percent(self.num_positions_in_light, self.num_positions)
        );
        info!(
            " Positions in heavy buckets: {} ({:.2}%)",
            self.num_positions_in_heavy,
            percent(self.num_positions_in_heavy, self.num_positions)
        );
    }
}
impl Default for BucketStatistics {
fn default() -> Self {
Self::new()
}
}
/// Size class of a minimizer bucket.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum BucketType {
    /// Bucket of size exactly 1.
    Singleton,
    /// Bucket whose size lies in `2..=MIN_BUCKET_SIZE`.
    Light,
    /// Any other size.
    Heavy,
}

impl BucketType {
    /// Maps a bucket size to its class.
    ///
    /// Size 1 is [`BucketType::Singleton`], sizes `2..=MIN_BUCKET_SIZE` are
    /// [`BucketType::Light`], and everything else is [`BucketType::Heavy`].
    /// Note that a size of 0 therefore also maps to `Heavy`.
    pub fn from_bucket_size(size: usize) -> Self {
        if size == 1 {
            return BucketType::Singleton;
        }
        if (2..=MIN_BUCKET_SIZE).contains(&size) {
            BucketType::Light
        } else {
            BucketType::Heavy
        }
    }
}
/// An owned bucket: all tuples sharing one minimizer, plus its cached size class.
#[derive(Clone, Debug)]
pub struct Bucket {
    /// Minimizer value this bucket was created for.
    pub minimizer: u64,
    /// The tuples belonging to the bucket.
    pub tuples: Vec<MinimizerTuple>,
    /// Bucket size as computed by `Bucket::new`: the number of runs of equal
    /// `pos_in_seq` values in `tuples`.
    pub cached_size: usize,
    /// Size class derived from `cached_size` by `Bucket::new`.
    pub bucket_type: BucketType,
}
impl Bucket {
    /// Builds a bucket from `tuples`, computing and caching its size and class.
    ///
    /// The size is the number of runs of equal `pos_in_seq` values; a tuple
    /// opens a new run whenever its position differs from the previous
    /// tuple's (the first tuple is compared against `INVALID_UINT64`).
    pub fn new(minimizer: u64, tuples: Vec<MinimizerTuple>) -> Self {
        let (cached_size, _last_pos) = tuples.iter().fold(
            (0usize, INVALID_UINT64),
            |(runs, prev_pos), tuple| {
                if tuple.pos_in_seq == prev_pos {
                    (runs, prev_pos)
                } else {
                    (runs + 1, tuple.pos_in_seq)
                }
            },
        );

        Self {
            minimizer,
            tuples,
            cached_size,
            bucket_type: BucketType::from_bucket_size(cached_size),
        }
    }

    /// Returns the cached bucket size.
    #[inline]
    pub fn size(&self) -> usize {
        self.cached_size
    }

    /// Returns the cached size class.
    #[inline]
    pub fn bucket_type(&self) -> BucketType {
        self.bucket_type
    }
}
/// Splits `tuples` into owned [`Bucket`]s, one per run of consecutive tuples
/// that share a minimizer.
///
/// Only adjacent tuples are grouped, so a minimizer that reappears later in
/// the input starts a new bucket. Returns an empty vector for empty input.
pub fn classify_into_buckets(tuples: Vec<MinimizerTuple>) -> Vec<Bucket> {
    let mut buckets = Vec::new();
    let mut run: Vec<MinimizerTuple> = Vec::new();
    // `None` until the first tuple has been seen.
    let mut run_minimizer = None;

    for tuple in tuples {
        match run_minimizer {
            // Minimizer changed: the accumulated run is a finished bucket.
            Some(previous) if previous != tuple.minimizer => {
                buckets.push(Bucket::new(previous, std::mem::take(&mut run)));
            }
            _ => {}
        }
        run_minimizer = Some(tuple.minimizer);
        run.push(tuple);
    }

    // Flush the trailing run, if the input was non-empty.
    if let Some(last) = run_minimizer {
        buckets.push(Bucket::new(last, run));
    }
    buckets
}
/// Lightweight descriptor of one bucket stored inside a shared tuple vector.
#[derive(Clone, Copy, Debug)]
pub struct BucketRef {
    /// Minimizer value of the bucket.
    pub minimizer: u64,
    /// Index of the bucket's first tuple in the backing vector.
    pub start: usize,
    /// Number of tuples in the bucket.
    pub len: usize,
    /// Bucket size (runs of equal `pos_in_seq`) cached at classification time.
    pub cached_size: usize,
    /// Size class cached at classification time.
    pub bucket_type: BucketType,
}
/// Tuples classified into buckets without copying them: the original vector
/// plus one [`BucketRef`] per bucket describing a sub-range of it.
///
/// Derives `Debug` like every other public type in this module, so values can
/// be logged and used in assertion messages.
#[derive(Debug)]
pub struct ClassifiedBuckets {
    /// Backing storage for all buckets.
    pub tuples: Vec<MinimizerTuple>,
    /// One descriptor per bucket, each addressing a range of `tuples`.
    pub bucket_refs: Vec<BucketRef>,
}
impl ClassifiedBuckets {
    /// Number of buckets, i.e. the number of stored bucket descriptors.
    #[inline]
    pub fn num_buckets(&self) -> usize {
        self.bucket_refs.len()
    }

    /// Returns the tuples of the bucket at index `idx`.
    ///
    /// # Panics
    ///
    /// Panics if `idx` is out of bounds for `bucket_refs`, or if the
    /// descriptor's range does not fit inside `tuples`.
    #[inline]
    pub fn bucket_tuples(&self, idx: usize) -> &[MinimizerTuple] {
        let descriptor = &self.bucket_refs[idx];
        let (first, count) = (descriptor.start, descriptor.len);
        let end = first + count;
        &self.tuples[first..end]
    }
}
/// Classifies `tuples` into buckets without moving them.
///
/// Each run of consecutive tuples sharing a minimizer becomes one
/// [`BucketRef`] (start index, tuple count, cached size and class). The input
/// vector is returned unchanged inside the result. Empty input yields no
/// descriptors.
pub fn classify_into_buckets_inplace(tuples: Vec<MinimizerTuple>) -> ClassifiedBuckets {
    let mut bucket_refs = Vec::new();
    let mut start = 0usize;

    while start < tuples.len() {
        let minimizer = tuples[start].minimizer;
        // Length of the run beginning at `start`; always at least 1 because
        // the first tuple matches its own minimizer.
        let len = tuples[start..]
            .iter()
            .take_while(|tuple| tuple.minimizer == minimizer)
            .count();
        let (cached_size, bucket_type) =
            compute_bucket_size_from_slice(&tuples[start..start + len]);
        bucket_refs.push(BucketRef {
            minimizer,
            start,
            len,
            cached_size,
            bucket_type,
        });
        start += len;
    }

    ClassifiedBuckets {
        tuples,
        bucket_refs,
    }
}
/// Computes the size and class of the bucket formed by `tuples`.
///
/// The size is the number of tuples whose `pos_in_seq` differs from that of
/// the tuple before them; the first tuple is compared against
/// `INVALID_UINT64`.
fn compute_bucket_size_from_slice(tuples: &[MinimizerTuple]) -> (usize, BucketType) {
    let mut last_pos = INVALID_UINT64;
    let distinct_runs = tuples
        .iter()
        .filter(|tuple| {
            let opens_new_run = tuple.pos_in_seq != last_pos;
            last_pos = tuple.pos_in_seq;
            opens_new_run
        })
        .count();
    (distinct_runs, BucketType::from_bucket_size(distinct_runs))
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Boundary checks for the size-to-class mapping.
    ///
    /// The literals 64/65 assume `MIN_BUCKET_SIZE == 64`.
    #[test]
    fn test_bucket_type_classification() {
        assert_eq!(BucketType::from_bucket_size(1), BucketType::Singleton);
        assert_eq!(BucketType::from_bucket_size(2), BucketType::Light);
        assert_eq!(BucketType::from_bucket_size(64), BucketType::Light);
        assert_eq!(BucketType::from_bucket_size(65), BucketType::Heavy);
        assert_eq!(BucketType::from_bucket_size(1000), BucketType::Heavy);
    }

    /// Empty input produces no buckets.
    #[test]
    fn test_classify_into_buckets_empty() {
        let tuples = Vec::new();
        let buckets = classify_into_buckets(tuples);
        assert_eq!(buckets.len(), 0);
    }

    /// A single tuple forms one singleton bucket.
    #[test]
    fn test_classify_into_buckets_single() {
        let tuples = vec![
            MinimizerTuple::new(100, 50, 0),
        ];
        let buckets = classify_into_buckets(tuples);
        assert_eq!(buckets.len(), 1);
        assert_eq!(buckets[0].minimizer, 100);
        assert_eq!(buckets[0].size(), 1);
        assert_eq!(buckets[0].bucket_type(), BucketType::Singleton);
    }

    /// Consecutive tuples are grouped by minimizer, in input order.
    #[test]
    fn test_classify_into_buckets_multiple() {
        let tuples = vec![
            MinimizerTuple::new(100, 50, 0),
            MinimizerTuple::new(100, 51, 0),
            MinimizerTuple::new(200, 100, 0),
            MinimizerTuple::new(300, 150, 0),
            MinimizerTuple::new(300, 151, 0),
            MinimizerTuple::new(300, 152, 0),
        ];
        let buckets = classify_into_buckets(tuples);
        assert_eq!(buckets.len(), 3);
        assert_eq!(buckets[0].minimizer, 100);
        assert_eq!(buckets[0].size(), 2);
        assert_eq!(buckets[0].bucket_type(), BucketType::Light);
        assert_eq!(buckets[1].minimizer, 200);
        assert_eq!(buckets[1].size(), 1);
        assert_eq!(buckets[1].bucket_type(), BucketType::Singleton);
        assert_eq!(buckets[2].minimizer, 300);
        assert_eq!(buckets[2].size(), 3);
        assert_eq!(buckets[2].bucket_type(), BucketType::Light);
    }

    /// Adjacent tuples with the same position count once towards the bucket
    /// size, while all tuples are still kept in the bucket.
    #[test]
    fn test_classify_into_buckets_repeated_positions() {
        let tuples = vec![
            MinimizerTuple::new(100, 50, 0),
            MinimizerTuple::new(100, 50, 0),
            MinimizerTuple::new(100, 51, 0),
        ];
        let buckets = classify_into_buckets(tuples);
        assert_eq!(buckets.len(), 1);
        assert_eq!(buckets[0].tuples.len(), 3);
        assert_eq!(buckets[0].size(), 2);
        assert_eq!(buckets[0].bucket_type(), BucketType::Light);
    }

    /// One singleton, one light and one heavy bucket; checks the per-class
    /// bucket counts as well as the position and super-k-mer totals.
    #[test]
    fn test_bucket_statistics() {
        let mut stats = BucketStatistics::new();
        let bucket1 = vec![MinimizerTuple::new(100, 50, 0)];
        stats.add_bucket(&bucket1);
        let bucket2 = vec![
            MinimizerTuple::new(200, 100, 0),
            MinimizerTuple::new(200, 101, 0),
        ];
        stats.add_bucket(&bucket2);
        // 65 distinct positions: one more than the light/heavy boundary
        // assumed above.
        let mut bucket3 = Vec::new();
        for i in 0..65 {
            bucket3.push(MinimizerTuple::new(300, 200 + i, 0));
        }
        stats.add_bucket(&bucket3);
        assert_eq!(stats.num_buckets, 3);
        assert_eq!(stats.num_singleton_buckets, 1);
        assert_eq!(stats.num_light_buckets, 1);
        assert_eq!(stats.num_heavy_buckets, 1);
        assert_eq!(stats.max_bucket_size, 65);
        // 1 (singleton) + 2 (light) + 65 (heavy) positions overall.
        assert_eq!(stats.num_positions, 68);
        assert_eq!(stats.num_positions_in_light, 2);
        assert_eq!(stats.num_positions_in_heavy, 65);
        // Tuples of the two non-singleton buckets: 2 + 65.
        assert_eq!(stats.num_super_kmers_in_non_singleton, 67);
    }
}