fastars 0.1.0

Ultra-fast QC and trimming for short and long reads
Documentation
//! Read deduplication using a Bloom filter.

use bloom::{BloomFilter, ASMS};

/// Stateful read-deduplication filter.
///
/// Despite the `Config` suffix, this type owns the Bloom filter itself plus
/// two running counters. It is built by `DedupConfig::new` and updated on
/// every call to `DedupConfig::is_duplicate`.
pub struct DedupConfig {
    /// Bloom filter recording the sequences seen so far; `new` creates it
    /// with a 1% target false-positive rate.
    filter: BloomFilter,
    /// Number of `is_duplicate` calls that reported a duplicate. Because the
    /// underlying filter is probabilistic, this may include false positives.
    pub duplicates_found: u64,
    /// Number of `is_duplicate` calls that reported a previously unseen read.
    pub unique_reads: u64,
}

impl DedupConfig {
    /// Create a new dedup filter for the given accuracy level (1-6).
    ///
    /// The level selects how many distinct reads the Bloom filter is sized
    /// for; the target false-positive rate is always 1% at that item count.
    /// A higher level therefore allocates a larger filter, which keeps the
    /// effective false-positive rate lower for the same number of reads.
    ///
    /// Any level outside 1-6 silently falls back to the level-3 (default)
    /// sizing rather than failing.
    pub fn new(accuracy: u8) -> Self {
        // Expected number of distinct reads per accuracy level. These are item
        // counts handed to `BloomFilter::with_rate`, not byte budgets.
        // NOTE(review): a textbook Bloom filter at 1% FPR needs ~9.6 bits per
        // item (roughly 12 MB per 10M items), so real memory use is presumably
        // far below GB scale -- confirm against the `bloom` crate.
        let capacity = match accuracy {
            1 => 10_000_000,
            2 => 20_000_000,
            3 => 40_000_000, // default level
            4 => 80_000_000,
            5 => 160_000_000,
            6 => 240_000_000,
            // Out-of-range levels use the default (level 3) sizing.
            _ => 40_000_000,
        };

        // Target false-positive rate of 0.01 (1%) at `capacity` items.
        let filter = BloomFilter::with_rate(0.01, capacity);

        Self {
            filter,
            duplicates_found: 0,
            unique_reads: 0,
        }
    }

    /// Check whether `seq` has been seen before, recording it if not.
    ///
    /// Returns `true` if the read is reported as a duplicate (and should be
    /// filtered), `false` if it is new. Exactly one of `duplicates_found` or
    /// `unique_reads` is incremented per call.
    ///
    /// The underlying Bloom filter is probabilistic: a never-seen read may
    /// occasionally be reported as a duplicate (false positive), but a read
    /// that was inserted earlier is always reported as a duplicate.
    pub fn is_duplicate(&mut self, seq: &[u8]) -> bool {
        // `insert` returns `true` when the item was not already present, so a
        // single call both tests and records the read. This hashes `seq` once
        // instead of the separate `contains` + `insert` round trip.
        let newly_inserted = self.filter.insert(&seq);

        if newly_inserted {
            self.unique_reads += 1;
        } else {
            self.duplicates_found += 1;
        }

        !newly_inserted
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Assert both public counters of `dedup` in one step.
    fn assert_counts(dedup: &DedupConfig, unique: u64, duplicates: u64) {
        assert_eq!(dedup.unique_reads, unique);
        assert_eq!(dedup.duplicates_found, duplicates);
    }

    /// A freshly built filter starts with both counters at zero.
    #[test]
    fn test_dedup_new() {
        let dedup = DedupConfig::new(3);
        assert_counts(&dedup, 0, 0);
    }

    /// Two distinct sequences are both reported as unique.
    #[test]
    fn test_dedup_unique_reads() {
        let mut dedup = DedupConfig::new(3);
        let reads: [&[u8]; 2] = [b"ACGTACGTACGT", b"TGCATGCATGCA"];

        // Each new read bumps the unique counter and leaves duplicates at zero.
        for (expected_unique, read) in (1u64..).zip(reads.iter()) {
            assert!(!dedup.is_duplicate(read));
            assert_counts(&dedup, expected_unique, 0);
        }
    }

    /// The same sequence is unique once and a duplicate on every later call.
    #[test]
    fn test_dedup_duplicate_reads() {
        let mut dedup = DedupConfig::new(3);
        let read = b"ACGTACGTACGT";

        // First occurrence is unique.
        assert!(!dedup.is_duplicate(read));
        assert_counts(&dedup, 1, 0);

        // Second and third occurrences are duplicates; the unique count is unchanged.
        for expected_duplicates in 1u64..=2 {
            assert!(dedup.is_duplicate(read));
            assert_counts(&dedup, 1, expected_duplicates);
        }
    }

    /// Every supported accuracy level can be instantiated.
    #[test]
    fn test_dedup_accuracy_levels() {
        for level in 1u8..=6 {
            let dedup = DedupConfig::new(level);
            assert_counts(&dedup, 0, 0);
        }
    }
}