use std::collections::{BTreeMap, HashMap};
pub type KmerHistogram = BTreeMap<u64, u64>;
#[derive(Debug, Clone, PartialEq)]
pub struct HistogramStats {
pub total_kmers: u64,
pub distinct_kmers: u64,
pub mode_count: u64,
pub mode_frequency: u64,
pub mean_count: f64,
}
#[must_use]
#[allow(clippy::implicit_hasher)]
pub fn compute_histogram(counts: &HashMap<String, u64>) -> KmerHistogram {
let mut histogram = BTreeMap::new();
for &count in counts.values() {
*histogram.entry(count).or_insert(0) += 1;
}
histogram
}
#[must_use]
#[allow(clippy::implicit_hasher)]
pub fn compute_histogram_packed(counts: &HashMap<u64, u64>) -> KmerHistogram {
let mut histogram = BTreeMap::new();
for &count in counts.values() {
*histogram.entry(count).or_insert(0) += 1;
}
histogram
}
#[must_use]
pub fn histogram_stats(histogram: &KmerHistogram) -> HistogramStats {
let distinct: u64 = histogram.values().sum();
let total: u64 = histogram.iter().map(|(c, f)| c * f).sum();
let (mode_count, mode_frequency) = histogram
.iter()
.max_by_key(|(_, f)| *f)
.map_or((0, 0), |(&c, &f)| (c, f));
HistogramStats {
total_kmers: total,
distinct_kmers: distinct,
mode_count,
mode_frequency,
#[allow(clippy::cast_precision_loss)]
mean_count: if distinct > 0 {
total as f64 / distinct as f64
} else {
0.0
},
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn histogram_basic() {
let counts: HashMap<String, u64> = [
("ACG".to_string(), 1),
("CGT".to_string(), 1),
("GTA".to_string(), 2),
("TAC".to_string(), 2),
]
.into();
let hist = compute_histogram(&counts);
assert_eq!(hist.get(&1), Some(&2));
assert_eq!(hist.get(&2), Some(&2));
assert_eq!(hist.get(&3), None);
}
#[test]
fn histogram_single_kmer() {
let counts: HashMap<String, u64> = [("ACGT".to_string(), 100)].into();
let hist = compute_histogram(&counts);
assert_eq!(hist.len(), 1);
assert_eq!(hist.get(&100), Some(&1));
}
#[test]
fn histogram_empty() {
let counts: HashMap<String, u64> = HashMap::new();
let hist = compute_histogram(&counts);
assert!(hist.is_empty());
}
#[test]
fn histogram_packed() {
let counts: HashMap<u64, u64> = [
(0b0001, 5), (0b0010, 5), (0b0011, 10), ]
.into();
let hist = compute_histogram_packed(&counts);
assert_eq!(hist.get(&5), Some(&2));
assert_eq!(hist.get(&10), Some(&1));
}
#[test]
fn histogram_stats_basic() {
let counts: HashMap<String, u64> = [
("ACG".to_string(), 1),
("CGT".to_string(), 1),
("GTA".to_string(), 2),
("TAC".to_string(), 2),
]
.into();
let hist = compute_histogram(&counts);
let stats = histogram_stats(&hist);
assert_eq!(stats.distinct_kmers, 4);
assert_eq!(stats.total_kmers, 6); assert!(stats.mode_frequency == 2);
assert!((stats.mean_count - 1.5).abs() < f64::EPSILON);
}
#[test]
fn histogram_stats_empty() {
let hist = KmerHistogram::new();
let stats = histogram_stats(&hist);
assert_eq!(stats.distinct_kmers, 0);
assert_eq!(stats.total_kmers, 0);
assert_eq!(stats.mode_count, 0);
assert_eq!(stats.mode_frequency, 0);
assert!((stats.mean_count - 0.0).abs() < f64::EPSILON);
}
#[test]
fn histogram_stats_single_kmer() {
let counts: HashMap<String, u64> = [("ACGT".to_string(), 42)].into();
let hist = compute_histogram(&counts);
let stats = histogram_stats(&hist);
assert_eq!(stats.distinct_kmers, 1);
assert_eq!(stats.total_kmers, 42);
assert_eq!(stats.mode_count, 42);
assert_eq!(stats.mode_frequency, 1);
assert!((stats.mean_count - 42.0).abs() < f64::EPSILON);
}
#[test]
fn histogram_sorted_keys() {
let counts: HashMap<String, u64> = [
("A".to_string(), 100),
("B".to_string(), 1),
("C".to_string(), 50),
]
.into();
let hist = compute_histogram(&counts);
let keys: Vec<_> = hist.keys().collect();
assert_eq!(keys, vec![&1, &50, &100]);
}
}