use rustc_hash::FxHashMap;
use serde::{Deserialize, Serialize};
/// Serde adapters for `FxHashMap<u64, u64>`, used via `#[serde(with = ...)]`.
///
/// The previous version copied every entry into a temporary
/// `std::collections::HashMap` on both paths; serde can serialize the map
/// directly with `collect_map` and deserialize into any hasher that is
/// `BuildHasher + Default`, so both copies are avoided.
mod fx_hashmap_serde {
    use rustc_hash::FxHashMap;
    use serde::{Deserialize, Deserializer, Serializer};

    /// Serialize the map as a plain serde map, entry by entry, without an
    /// intermediate allocation.
    pub fn serialize<S>(map: &FxHashMap<u64, u64>, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: Serializer,
    {
        serializer.collect_map(map.iter())
    }

    /// Deserialize directly into an `FxHashMap`; serde's `HashMap` impl is
    /// generic over the hasher (`S: BuildHasher + Default`), which
    /// `FxHashMap`'s `FxBuildHasher` satisfies.
    pub fn deserialize<'de, D>(deserializer: D) -> Result<FxHashMap<u64, u64>, D::Error>
    where
        D: Deserializer<'de>,
    {
        FxHashMap::deserialize(deserializer)
    }
}
/// Default cap on how many reads are sampled for duplication tracking.
const DEFAULT_SAMPLE_SIZE: u64 = 100_000;

/// FNV-1a hash of a sequence, case-insensitive.
///
/// Each byte is upper-cased before being folded in, so `b"atgc"` and
/// `b"ATGC"` hash identically. An empty slice hashes to the FNV offset
/// basis.
fn hash_sequence(seq: &[u8]) -> u64 {
    const FNV_OFFSET: u64 = 0xcbf29ce484222325;
    const FNV_PRIME: u64 = 0x100000001b3;
    seq.iter().fold(FNV_OFFSET, |acc, &byte| {
        (acc ^ u64::from(byte.to_ascii_uppercase())).wrapping_mul(FNV_PRIME)
    })
}
/// Streaming tracker of sequence duplication over a bounded sample of reads.
///
/// Sequences are stored only as FNV-1a hashes of their upper-cased bytes
/// (see `hash_sequence`), so memory use is bounded by the number of
/// distinct sequences in the first `sample_size` reads.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DuplicationStats {
    /// Occurrence count per sequence hash.
    #[serde(with = "fx_hashmap_serde")]
    sequence_counts: FxHashMap<u64, u64>,
    /// Maximum read ordinal that is still sampled; reads past it are ignored.
    sample_size: u64,
    /// Number of non-empty reads actually counted into `sequence_counts`.
    reads_processed: u64,
    /// Latched to false once a read past `sample_size` is seen.
    sampling_active: bool,
    /// Reads binned by duplication level: index i = reads whose sequence
    /// occurred i+1 times (last bin aggregates 10+). Lazily rebuilt.
    duplication_histogram: [u64; 10],
    /// True when `duplication_histogram` is stale w.r.t. `sequence_counts`.
    histogram_dirty: bool,
    /// When false (see `disabled()`), `update` is a no-op.
    enabled: bool,
}
impl Default for DuplicationStats {
fn default() -> Self {
Self::new()
}
}
impl DuplicationStats {
    /// New tracker with the default sample size (`DEFAULT_SAMPLE_SIZE`).
    pub fn new() -> Self {
        Self::with_sample_size(DEFAULT_SAMPLE_SIZE)
    }

    /// New tracker that samples at most the first `sample_size` reads.
    pub fn with_sample_size(sample_size: u64) -> Self {
        Self {
            sequence_counts: FxHashMap::default(),
            sample_size,
            reads_processed: 0,
            sampling_active: true,
            duplication_histogram: [0u64; 10],
            histogram_dirty: true,
            enabled: true,
        }
    }

    /// Tracker that records nothing: `update` and `merge` are no-ops and
    /// every statistic stays at its zero value.
    pub fn disabled() -> Self {
        Self {
            sequence_counts: FxHashMap::default(),
            sample_size: 0,
            reads_processed: 0,
            sampling_active: false,
            duplication_histogram: [0u64; 10],
            histogram_dirty: false,
            enabled: false,
        }
    }

    /// Record one read.
    ///
    /// `read_count` is the ordinal of this read in the input stream; once
    /// it exceeds `sample_size`, sampling latches off permanently so memory
    /// stays bounded. Empty sequences are skipped. Hashing is
    /// case-insensitive (see `hash_sequence`).
    #[inline]
    pub fn update(&mut self, seq: &[u8], read_count: u64) {
        if !self.enabled {
            return;
        }
        if !self.sampling_active || read_count > self.sample_size {
            // Past the sampling window: latch sampling off for good.
            self.sampling_active = false;
            return;
        }
        if seq.is_empty() {
            return;
        }
        let hash = hash_sequence(seq);
        *self.sequence_counts.entry(hash).or_default() += 1;
        self.reads_processed += 1;
        self.histogram_dirty = true;
    }

    /// Percentage of sampled reads that were duplicates (0.0 when no reads
    /// were processed).
    pub fn duplication_rate(&self) -> f64 {
        if self.reads_processed == 0 {
            return 0.0;
        }
        let unique_sequences = self.sequence_counts.len() as u64;
        // saturating_sub guards against inconsistent state arriving via
        // deserialization; in normal operation reads >= unique always.
        let duplicates = self.reads_processed.saturating_sub(unique_sequences);
        (duplicates as f64 / self.reads_processed as f64) * 100.0
    }

    /// Number of distinct sequences (by hash) seen in the sample.
    pub fn unique_count(&self) -> u64 {
        self.sequence_counts.len() as u64
    }

    /// Number of non-empty reads actually counted.
    pub fn reads_processed(&self) -> u64 {
        self.reads_processed
    }

    /// Percentage of sampled reads that were unique (100 - duplication rate).
    pub fn unique_rate(&self) -> f64 {
        100.0 - self.duplication_rate()
    }

    /// Histogram of reads by duplication level: bin i holds the number of
    /// READS whose sequence occurred i+1 times (bin 9 aggregates 10+).
    /// Lazily recomputed when counts changed since the last call.
    pub fn duplication_histogram(&mut self) -> &[u64; 10] {
        if self.histogram_dirty {
            self.recalculate_histogram();
        }
        &self.duplication_histogram
    }

    /// Like `duplication_histogram` but without `&mut self`: returns a
    /// freshly computed copy when the cache is stale, the cached array
    /// otherwise.
    pub fn histogram_snapshot(&self) -> [u64; 10] {
        if self.histogram_dirty {
            let mut hist = [0u64; 10];
            for &count in self.sequence_counts.values() {
                let idx = (count as usize).saturating_sub(1).min(9);
                hist[idx] += count;
            }
            hist
        } else {
            self.duplication_histogram
        }
    }

    /// Rebuild the cached histogram from `sequence_counts`.
    fn recalculate_histogram(&mut self) {
        self.duplication_histogram = [0u64; 10];
        for &count in self.sequence_counts.values() {
            // count == times seen; bin index is count-1, capped at 9.
            let idx = (count as usize).saturating_sub(1).min(9);
            self.duplication_histogram[idx] += count;
        }
        self.histogram_dirty = false;
    }

    /// Histogram expressed as percentages of all sampled reads (sums to
    /// ~100 when any reads were processed, all zeros otherwise).
    pub fn duplication_percentages(&mut self) -> [f64; 10] {
        if self.reads_processed == 0 {
            return [0.0; 10];
        }
        let total = self.reads_processed as f64;
        let hist = self.duplication_histogram();
        let mut percentages = [0.0f64; 10];
        for (i, &count) in hist.iter().enumerate() {
            percentages[i] = (count as f64 / total) * 100.0;
        }
        percentages
    }

    /// Fold the counts of another tracker (e.g. a parallel shard) into
    /// this one.
    ///
    /// Fixes vs. the previous version: a disabled tracker no longer
    /// silently accumulates counts, merging a disabled (always-empty)
    /// tracker is a no-op, and the merged tracker stops claiming
    /// `is_sampling()` when `other` had already hit its sampling cap.
    pub fn merge(&mut self, other: &DuplicationStats) {
        if !self.enabled || !other.enabled {
            return;
        }
        for (&hash, &count) in &other.sequence_counts {
            *self.sequence_counts.entry(hash).or_default() += count;
        }
        self.reads_processed += other.reads_processed;
        // If either side stopped sampling, the merged result is also only
        // a partial sample.
        self.sampling_active = self.sampling_active && other.sampling_active;
        self.histogram_dirty = true;
    }

    /// True while reads are still being added to the sample.
    pub fn is_sampling(&self) -> bool {
        self.sampling_active
    }

    /// Configured sampling cap.
    pub fn sample_size(&self) -> u64 {
        self.sample_size
    }

    /// Top `n` (sequence hash, occurrence count) pairs, highest count
    /// first. Tie order is unspecified (it follows hash-map iteration).
    pub fn most_duplicated(&self, n: usize) -> Vec<(u64, u64)> {
        let mut sorted: Vec<(u64, u64)> =
            self.sequence_counts.iter().map(|(&k, &v)| (k, v)).collect();
        // Unstable sort is faster and allocation-free; stability bought
        // nothing since hash-map iteration order is arbitrary anyway.
        sorted.sort_unstable_by_key(|&(_, count)| std::cmp::Reverse(count));
        sorted.truncate(n);
        sorted
    }

    /// Mean occurrences per distinct sequence (0.0 when nothing sampled).
    pub fn mean_duplication(&self) -> f64 {
        let unique = self.sequence_counts.len() as u64;
        if unique == 0 {
            0.0
        } else {
            self.reads_processed as f64 / unique as f64
        }
    }

    /// Rebuild a tracker from a raw hash->count map (e.g. loaded from
    /// storage). `reads_processed` is derived as the sum of the counts;
    /// sampling is marked inactive since collection has already ended.
    pub fn from_raw(sequence_counts: FxHashMap<u64, u64>, sample_size: u64) -> Self {
        let reads_processed: u64 = sequence_counts.values().sum();
        Self {
            sequence_counts,
            sample_size,
            reads_processed,
            sampling_active: false,
            duplication_histogram: [0u64; 10],
            histogram_dirty: true,
            enabled: true,
        }
    }

    /// False for trackers built with `disabled()`.
    pub fn is_enabled(&self) -> bool {
        self.enabled
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_duplication_stats_new() {
        let stats = DuplicationStats::new();
        assert_eq!(stats.reads_processed(), 0);
        assert_eq!(stats.unique_count(), 0);
        assert!(stats.is_sampling());
    }

    #[test]
    fn test_duplication_stats_update_unique() {
        let mut stats = DuplicationStats::new();
        for (i, seq) in [b"ATGC", b"GCTA", b"TTAA"].iter().enumerate() {
            stats.update(*seq, (i + 1) as u64);
        }
        assert_eq!(stats.reads_processed(), 3);
        assert_eq!(stats.unique_count(), 3);
        assert!((stats.duplication_rate() - 0.0).abs() < 0.001);
    }

    #[test]
    fn test_duplication_stats_update_duplicates() {
        let mut stats = DuplicationStats::new();
        for read_idx in 1..=3u64 {
            stats.update(b"ATGC", read_idx);
        }
        assert_eq!(stats.reads_processed(), 3);
        assert_eq!(stats.unique_count(), 1);
        // 2 of 3 reads are duplicates => ~66.67%.
        assert!((stats.duplication_rate() - 66.666).abs() < 0.1);
    }

    #[test]
    fn test_duplication_stats_case_insensitive() {
        let mut stats = DuplicationStats::new();
        stats.update(b"ATGC", 1);
        stats.update(b"atgc", 2);
        // Lower- and upper-case spellings hash identically.
        assert_eq!(stats.unique_count(), 1);
        assert_eq!(stats.reads_processed(), 2);
    }

    #[test]
    fn test_duplication_stats_sampling_limit() {
        let mut stats = DuplicationStats::with_sample_size(5);
        for i in 1..=10u64 {
            let seq = format!("SEQ{}", i);
            stats.update(seq.as_bytes(), i);
        }
        // Only the first five reads fall inside the sampling window.
        assert_eq!(stats.reads_processed(), 5);
        assert!(!stats.is_sampling());
    }

    #[test]
    fn test_duplication_stats_unique_rate() {
        let mut stats = DuplicationStats::new();
        stats.update(b"ATGC", 1);
        stats.update(b"ATGC", 2);
        stats.update(b"GCTA", 3);
        stats.update(b"TTAA", 4);
        // 3 unique sequences over 4 reads => 75% unique.
        assert!((stats.unique_rate() - 75.0).abs() < 0.1);
    }

    #[test]
    fn test_duplication_stats_histogram() {
        let mut stats = DuplicationStats::new();
        stats.update(b"UNIQUE", 1);
        stats.update(b"DOUBLE", 2);
        stats.update(b"DOUBLE", 3);
        stats.update(b"TRIPLE", 4);
        stats.update(b"TRIPLE", 5);
        stats.update(b"TRIPLE", 6);
        let hist = stats.duplication_histogram();
        // Bins hold READ counts: 1 singleton read, 2 doubled, 3 tripled.
        assert_eq!(hist[0], 1);
        assert_eq!(hist[1], 2);
        assert_eq!(hist[2], 3);
    }

    #[test]
    fn test_duplication_stats_merge() {
        let mut left = DuplicationStats::new();
        left.update(b"ATGC", 1);
        let mut right = DuplicationStats::new();
        right.update(b"ATGC", 1);
        right.update(b"GCTA", 2);
        left.merge(&right);
        assert_eq!(left.reads_processed(), 3);
        assert_eq!(left.unique_count(), 2);
    }

    #[test]
    fn test_duplication_stats_empty() {
        let mut stats = DuplicationStats::new();
        stats.update(b"", 1);
        // Empty sequences are skipped entirely.
        assert_eq!(stats.reads_processed(), 0);
        assert_eq!(stats.duplication_rate(), 0.0);
    }

    #[test]
    fn test_duplication_stats_most_duplicated() {
        let mut stats = DuplicationStats::new();
        for i in 1..=5u64 {
            stats.update(b"COMMON", i);
        }
        for i in 1..=2u64 {
            stats.update(b"LESS", i + 5);
        }
        stats.update(b"RARE", 8);
        let top = stats.most_duplicated(2);
        assert_eq!(top.len(), 2);
        assert_eq!(top[0].1, 5);
        assert_eq!(top[1].1, 2);
    }

    #[test]
    fn test_duplication_stats_mean_duplication() {
        let mut stats = DuplicationStats::new();
        stats.update(b"SEQ1", 1);
        stats.update(b"SEQ1", 2);
        stats.update(b"SEQ2", 3);
        stats.update(b"SEQ2", 4);
        // 4 reads over 2 distinct sequences => mean of 2.0.
        assert!((stats.mean_duplication() - 2.0).abs() < 0.001);
    }

    #[test]
    fn test_duplication_stats_serialize() {
        let mut stats = DuplicationStats::new();
        stats.update(b"ATGC", 1);
        stats.update(b"ATGC", 2);
        let json = serde_json::to_string(&stats).unwrap();
        let restored: DuplicationStats = serde_json::from_str(&json).unwrap();
        assert_eq!(stats.reads_processed(), restored.reads_processed());
        assert_eq!(stats.unique_count(), restored.unique_count());
    }

    #[test]
    fn test_hash_sequence_consistency() {
        // Deterministic and case-insensitive.
        assert_eq!(hash_sequence(b"ATGC"), hash_sequence(b"ATGC"));
        assert_eq!(hash_sequence(b"ATGC"), hash_sequence(b"atgc"));
    }

    #[test]
    fn test_duplication_percentages() {
        let mut stats = DuplicationStats::new();
        for i in 1..=50u64 {
            stats.update(format!("UNIQUE{}", i).as_bytes(), i);
        }
        for i in 1..=25u64 {
            stats.update(format!("DUP{}", i).as_bytes(), 50 + (i * 2) - 1);
            stats.update(format!("DUP{}", i).as_bytes(), 50 + (i * 2));
        }
        let percentages = stats.duplication_percentages();
        // 50 singleton reads and 50 doubled reads out of 100 total.
        assert!((percentages[0] - 50.0).abs() < 0.1);
        assert!((percentages[1] - 50.0).abs() < 0.1);
    }
}