use bloom::{BloomFilter, ASMS};
/// Bloom-filter-backed read deduplicator with running counters.
///
/// Sequences are fed in through `is_duplicate`, which updates exactly one of
/// the two public counters on every call.
pub struct DedupConfig {
    /// Bloom filter recording the sequences seen so far.
    filter: BloomFilter,
    /// Number of `is_duplicate` calls that reported a duplicate.
    pub duplicates_found: u64,
    /// Number of `is_duplicate` calls that reported a new sequence.
    pub unique_reads: u64,
}
impl DedupConfig {
    /// Creates a deduplicator whose Bloom filter is sized from `accuracy`.
    ///
    /// Levels 1 through 6 size the filter for 10, 20, 40, 80, 160 and 240
    /// million expected items respectively. Any other value silently falls
    /// back to 40 million, the same as level 3. The filter is always built
    /// with a rate of `0.01`, and both counters start at zero.
    pub fn new(accuracy: u8) -> Self {
        let capacity = match accuracy {
            1 => 10_000_000,
            2 => 20_000_000,
            3 => 40_000_000,
            4 => 80_000_000,
            5 => 160_000_000,
            6 => 240_000_000,
            // Unknown levels use the level-3 sizing instead of failing.
            _ => 40_000_000,
        };
        Self {
            filter: BloomFilter::with_rate(0.01, capacity),
            duplicates_found: 0,
            unique_reads: 0,
        }
    }

    /// Records `seq` and reports whether it was already present in the filter.
    ///
    /// Returns `true` and increments `duplicates_found` when the filter
    /// already contains `seq`; otherwise returns `false` and increments
    /// `unique_reads`. Because the backing store is a Bloom filter, a
    /// sequence that was never seen may occasionally be reported as a
    /// duplicate (false positive); a sequence that was seen is never
    /// reported as new.
    pub fn is_duplicate(&mut self, seq: &[u8]) -> bool {
        // `insert` returns `true` only when the item was not already present,
        // so a single call both tests and records the sequence. This avoids
        // hashing every new read twice (once for `contains`, once for
        // `insert`). `&seq` is passed because the filter API takes `&T` with a
        // sized `T`; a `&[u8]` hashes the same as the slice it points to.
        if self.filter.insert(&seq) {
            self.unique_reads += 1;
            false
        } else {
            self.duplicates_found += 1;
            true
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Checks both public counters of `dedup` against the expected values.
    fn assert_counts(dedup: &DedupConfig, unique: u64, duplicates: u64) {
        assert_eq!(dedup.unique_reads, unique);
        assert_eq!(dedup.duplicates_found, duplicates);
    }

    /// A freshly built deduplicator starts with both counters at zero.
    #[test]
    fn test_dedup_new() {
        let dedup = DedupConfig::new(3);
        assert_counts(&dedup, 0, 0);
    }

    /// Two distinct sequences are each counted as unique.
    #[test]
    fn test_dedup_unique_reads() {
        let mut dedup = DedupConfig::new(3);
        let first = b"ACGTACGTACGT";
        let second = b"TGCATGCATGCA";

        assert!(!dedup.is_duplicate(first));
        assert_counts(&dedup, 1, 0);

        assert!(!dedup.is_duplicate(second));
        assert_counts(&dedup, 2, 0);
    }

    /// Re-submitting a sequence bumps only the duplicate counter.
    #[test]
    fn test_dedup_duplicate_reads() {
        let mut dedup = DedupConfig::new(3);
        let read = b"ACGTACGTACGT";

        // First sighting is unique.
        assert!(!dedup.is_duplicate(read));
        assert_counts(&dedup, 1, 0);

        // Each further sighting is a duplicate; the unique count stays at 1.
        for expected_duplicates in 1..=2u64 {
            assert!(dedup.is_duplicate(read));
            assert_counts(&dedup, 1, expected_duplicates);
        }
    }

    /// Every documented accuracy level constructs with zeroed counters.
    #[test]
    fn test_dedup_accuracy_levels() {
        for level in 1..=6u8 {
            let dedup = DedupConfig::new(level);
            assert_counts(&dedup, 0, 0);
        }
    }
}