fastars 0.1.0

Ultra-fast QC and trimming for short and long reads
//! Duplication estimation.
//!
//! This module estimates sequence duplication levels by sampling a fixed
//! number of reads and counting occurrences of each sequence hash.
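//!
//! A minimal usage sketch, assuming `DuplicationStats` is reachable from the
//! crate root (marked `ignore` because the exact re-export path may differ):
//!
//! ```ignore
//! let mut dup = DuplicationStats::new();
//! // `reads` stands in for any iterator over byte-slice sequences.
//! for (i, seq) in reads.iter().enumerate() {
//!     // The second argument is the running 1-based read count.
//!     dup.update(seq, (i + 1) as u64);
//! }
//! println!("duplication: {:.2}%", dup.duplication_rate());
//! ```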

use rustc_hash::FxHashMap;
use serde::{Deserialize, Serialize};

/// Serde support for `FxHashMap<u64, u64>`.
mod fx_hashmap_serde {
    use rustc_hash::FxHashMap;
    use serde::{Deserialize, Deserializer, Serialize, Serializer};
    use std::collections::HashMap;

    pub fn serialize<S>(map: &FxHashMap<u64, u64>, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: Serializer,
    {
        // Convert to std HashMap for serialization
        let std_map: HashMap<u64, u64> = map.iter().map(|(&k, &v)| (k, v)).collect();
        std_map.serialize(serializer)
    }

    pub fn deserialize<'de, D>(deserializer: D) -> Result<FxHashMap<u64, u64>, D::Error>
    where
        D: Deserializer<'de>,
    {
        let std_map = HashMap::<u64, u64>::deserialize(deserializer)?;
        Ok(std_map.into_iter().collect())
    }
}

/// Default number of reads to sample for duplication analysis.
const DEFAULT_SAMPLE_SIZE: u64 = 100_000;

/// Hash a sequence with FNV-1a.
///
/// Bytes are upper-cased before hashing, so the hash is case-insensitive.
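///
/// For example, case folding means both spellings collide (sketch; the
/// function is private, so this example is illustrative only):
///
/// ```ignore
/// assert_eq!(hash_sequence(b"acgt"), hash_sequence(b"ACGT"));
/// ```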
fn hash_sequence(seq: &[u8]) -> u64 {
    // FNV-1a hash
    const FNV_OFFSET: u64 = 0xcbf29ce484222325;
    const FNV_PRIME: u64 = 0x100000001b3;

    let mut hash = FNV_OFFSET;
    for &byte in seq {
        hash ^= byte.to_ascii_uppercase() as u64;
        hash = hash.wrapping_mul(FNV_PRIME);
    }
    hash
}

/// Duplication statistics.
///
/// Samples the first N reads and estimates duplication rate
/// by tracking sequence hashes.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DuplicationStats {
    /// Sequence hash -> count mapping
    #[serde(with = "fx_hashmap_serde")]
    sequence_counts: FxHashMap<u64, u64>,
    /// Maximum number of reads to sample
    sample_size: u64,
    /// Number of reads processed so far
    reads_processed: u64,
    /// Whether we're still in sampling mode
    sampling_active: bool,
    /// Duplication-level histogram (levels 1 through 10+).
    /// Index 0 = reads whose sequence occurs once (unique);
    /// index 9 = reads whose sequence occurs 10 or more times.
    duplication_histogram: [u64; 10],
    /// Flag indicating histogram needs recalculation
    histogram_dirty: bool,
    /// Whether duplication evaluation is enabled
    enabled: bool,
}

impl Default for DuplicationStats {
    fn default() -> Self {
        Self::new()
    }
}

impl DuplicationStats {
    /// Create a new duplication statistics container with default settings.
    ///
    /// Samples the first 100,000 reads.
    pub fn new() -> Self {
        Self::with_sample_size(DEFAULT_SAMPLE_SIZE)
    }

    /// Create a new duplication statistics container with custom sample size.
    pub fn with_sample_size(sample_size: u64) -> Self {
        Self {
            sequence_counts: FxHashMap::default(),
            sample_size,
            reads_processed: 0,
            sampling_active: true,
            duplication_histogram: [0u64; 10],
            histogram_dirty: true,
            enabled: true,
        }
    }

    /// Create a disabled duplication statistics container.
    ///
    /// This container will not collect any statistics and uses minimal memory.
    pub fn disabled() -> Self {
        Self {
            sequence_counts: FxHashMap::default(),
            sample_size: 0,
            reads_processed: 0,
            sampling_active: false,
            duplication_histogram: [0u64; 10],
            histogram_dirty: false,
            enabled: false,
        }
    }

    /// Update statistics with a sequence.
    ///
    /// `read_count` is the running 1-based count of reads seen in the
    /// stream; once it exceeds `sample_size`, sampling stops and further
    /// reads are ignored. Empty sequences are skipped.
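    ///
    /// A small sketch of the cutoff behavior (`ignore`: illustrative only):
    ///
    /// ```ignore
    /// let mut dup = DuplicationStats::with_sample_size(2);
    /// dup.update(b"ACGT", 1); // sampled
    /// dup.update(b"ACGT", 2); // sampled
    /// dup.update(b"ACGT", 3); // read_count > sample_size: ignored
    /// assert_eq!(dup.reads_processed(), 2);
    /// ```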
    #[inline]
    pub fn update(&mut self, seq: &[u8], read_count: u64) {
        // Skip if disabled
        if !self.enabled {
            return;
        }

        // Check if we should still be sampling
        if !self.sampling_active || read_count > self.sample_size {
            self.sampling_active = false;
            return;
        }

        if seq.is_empty() {
            return;
        }

        let hash = hash_sequence(seq);
        *self.sequence_counts.entry(hash).or_insert(0) += 1;
        self.reads_processed += 1;
        self.histogram_dirty = true;
    }

    /// Calculate the duplication rate.
    ///
    /// Returns the percentage of reads that are duplicates (0.0 to 100.0).
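    ///
    /// A sketch of the arithmetic (`ignore`: illustrative only):
    ///
    /// ```ignore
    /// let mut dup = DuplicationStats::new();
    /// dup.update(b"ACGT", 1);
    /// dup.update(b"ACGT", 2);
    /// dup.update(b"TTAA", 3);
    /// // 3 reads, 2 unique hashes -> 1 duplicate -> ~33.3%
    /// assert!((dup.duplication_rate() - 33.333).abs() < 0.01);
    /// ```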
    pub fn duplication_rate(&self) -> f64 {
        if self.reads_processed == 0 {
            return 0.0;
        }

        let unique_sequences = self.sequence_counts.len() as u64;
        let duplicates = self.reads_processed - unique_sequences;

        (duplicates as f64 / self.reads_processed as f64) * 100.0
    }

    /// Get the number of unique sequences.
    pub fn unique_count(&self) -> u64 {
        self.sequence_counts.len() as u64
    }

    /// Get the total reads processed (sampled).
    pub fn reads_processed(&self) -> u64 {
        self.reads_processed
    }

    /// Get the estimated unique rate.
    ///
    /// Returns the percentage of reads that are unique (0.0 to 100.0).
    pub fn unique_rate(&self) -> f64 {
        100.0 - self.duplication_rate()
    }

    /// Calculate and return the duplication histogram.
    ///
    /// Each bucket counts *reads*, not distinct sequences:
    /// Index 0 = reads whose sequence was seen exactly once (unique)
    /// Index 1 = reads whose sequence was seen exactly twice
    /// ...
    /// Index 9 = reads whose sequence was seen 10+ times
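    ///
    /// A sketch of the read-level counting (`ignore`: illustrative only):
    ///
    /// ```ignore
    /// let mut dup = DuplicationStats::new();
    /// dup.update(b"CCCC", 1); // one sequence seen once
    /// dup.update(b"AAAA", 2);
    /// dup.update(b"AAAA", 3); // one sequence seen twice
    /// let hist = dup.duplication_histogram();
    /// assert_eq!(hist[0], 1); // 1 read at level 1
    /// assert_eq!(hist[1], 2); // 2 reads at level 2
    /// ```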
    pub fn duplication_histogram(&mut self) -> &[u64; 10] {
        if self.histogram_dirty {
            self.recalculate_histogram();
        }
        &self.duplication_histogram
    }

    /// Get a copy of the duplication histogram without mutating the cache.
    ///
    /// Recomputes from the raw counts if the cached histogram is stale.
    pub fn histogram_snapshot(&self) -> [u64; 10] {
        if self.histogram_dirty {
            let mut hist = [0u64; 10];
            for &count in self.sequence_counts.values() {
                let idx = (count as usize).saturating_sub(1).min(9);
                hist[idx] += count; // Each sequence contributes 'count' reads
            }
            hist
        } else {
            self.duplication_histogram
        }
    }

    fn recalculate_histogram(&mut self) {
        self.duplication_histogram = [0u64; 10];
        for &count in self.sequence_counts.values() {
            let idx = (count as usize).saturating_sub(1).min(9);
            self.duplication_histogram[idx] += count;
        }
        self.histogram_dirty = false;
    }

    /// Get the percentage of reads in each duplication level.
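    ///
    /// A sketch (`ignore`: illustrative only); two copies of one sequence
    /// put 100% of sampled reads at level 2:
    ///
    /// ```ignore
    /// let mut dup = DuplicationStats::new();
    /// dup.update(b"ACGT", 1);
    /// dup.update(b"ACGT", 2);
    /// let pct = dup.duplication_percentages();
    /// assert!((pct[1] - 100.0).abs() < 0.001);
    /// ```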
    pub fn duplication_percentages(&mut self) -> [f64; 10] {
        if self.reads_processed == 0 {
            return [0.0; 10];
        }

        let total = self.reads_processed as f64;
        let hist = self.duplication_histogram();
        let mut percentages = [0.0f64; 10];

        for (i, &count) in hist.iter().enumerate() {
            percentages[i] = (count as f64 / total) * 100.0;
        }

        percentages
    }

    /// Merge statistics from another DuplicationStats instance.
    ///
    /// Counts combine correctly because identical sequences hash to the
    /// same value in every worker; the merged result is still an estimate,
    /// since workers sample independently and distinct sequences can
    /// collide on the same 64-bit hash.
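    ///
    /// A sketch of combining per-worker stats (`ignore`: illustrative only):
    ///
    /// ```ignore
    /// let mut total = DuplicationStats::new();
    /// total.update(b"ACGT", 1);
    ///
    /// let mut worker = DuplicationStats::new();
    /// worker.update(b"ACGT", 1);
    ///
    /// total.merge(&worker);
    /// assert_eq!(total.reads_processed(), 2);
    /// assert_eq!(total.unique_count(), 1); // identical hashes merge
    /// ```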
    pub fn merge(&mut self, other: &DuplicationStats) {
        for (&hash, &count) in &other.sequence_counts {
            *self.sequence_counts.entry(hash).or_insert(0) += count;
        }
        self.reads_processed += other.reads_processed;
        self.histogram_dirty = true;
    }

    /// Check if sampling is still active.
    pub fn is_sampling(&self) -> bool {
        self.sampling_active
    }

    /// Get the sample size limit.
    pub fn sample_size(&self) -> u64 {
        self.sample_size
    }

    /// Get sequences with highest duplication.
    ///
    /// Returns hashes and their counts, sorted by count descending.
    pub fn most_duplicated(&self, n: usize) -> Vec<(u64, u64)> {
        let mut sorted: Vec<(u64, u64)> = self
            .sequence_counts
            .iter()
            .map(|(&k, &v)| (k, v))
            .collect();

        sorted.sort_by(|a, b| b.1.cmp(&a.1));
        sorted.truncate(n);
        sorted
    }

    /// Get the mean duplication level.
    ///
    /// Returns the average number of times each unique sequence appears.
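    ///
    /// A sketch of the arithmetic (`ignore`: illustrative only):
    ///
    /// ```ignore
    /// let mut dup = DuplicationStats::new();
    /// dup.update(b"ACGT", 1);
    /// dup.update(b"ACGT", 2);
    /// dup.update(b"TTAA", 3);
    /// // 3 sampled reads over 2 unique sequences
    /// assert!((dup.mean_duplication() - 1.5).abs() < 0.001);
    /// ```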
    pub fn mean_duplication(&self) -> f64 {
        let unique = self.sequence_counts.len() as u64;
        if unique == 0 {
            0.0
        } else {
            self.reads_processed as f64 / unique as f64
        }
    }

    /// Create a DuplicationStats from raw hash counts.
    ///
    /// This is used for converting from FastQcStats to QcStats.
    pub fn from_raw(sequence_counts: FxHashMap<u64, u64>, sample_size: u64) -> Self {
        let reads_processed: u64 = sequence_counts.values().sum();
        Self {
            sequence_counts,
            sample_size,
            reads_processed,
            sampling_active: false,
            duplication_histogram: [0u64; 10],
            histogram_dirty: true,
            enabled: true,
        }
    }

    /// Check if duplication evaluation is enabled.
    pub fn is_enabled(&self) -> bool {
        self.enabled
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_duplication_stats_new() {
        let ds = DuplicationStats::new();
        assert_eq!(ds.reads_processed(), 0);
        assert_eq!(ds.unique_count(), 0);
        assert!(ds.is_sampling());
    }

    #[test]
    fn test_duplication_stats_update_unique() {
        let mut ds = DuplicationStats::new();
        ds.update(b"ATGC", 1);
        ds.update(b"GCTA", 2);
        ds.update(b"TTAA", 3);

        assert_eq!(ds.reads_processed(), 3);
        assert_eq!(ds.unique_count(), 3);
        assert!((ds.duplication_rate() - 0.0).abs() < 0.001);
    }

    #[test]
    fn test_duplication_stats_update_duplicates() {
        let mut ds = DuplicationStats::new();
        ds.update(b"ATGC", 1);
        ds.update(b"ATGC", 2);
        ds.update(b"ATGC", 3);

        assert_eq!(ds.reads_processed(), 3);
        assert_eq!(ds.unique_count(), 1);
        // 2 out of 3 are duplicates = 66.67%
        assert!((ds.duplication_rate() - 66.666).abs() < 0.1);
    }

    #[test]
    fn test_duplication_stats_case_insensitive() {
        let mut ds = DuplicationStats::new();
        ds.update(b"ATGC", 1);
        ds.update(b"atgc", 2);

        assert_eq!(ds.unique_count(), 1);
        assert_eq!(ds.reads_processed(), 2);
    }

    #[test]
    fn test_duplication_stats_sampling_limit() {
        let mut ds = DuplicationStats::with_sample_size(5);

        for i in 1..=10 {
            ds.update(format!("SEQ{}", i).as_bytes(), i);
        }

        assert_eq!(ds.reads_processed(), 5);
        assert!(!ds.is_sampling());
    }

    #[test]
    fn test_duplication_stats_unique_rate() {
        let mut ds = DuplicationStats::new();
        ds.update(b"ATGC", 1);
        ds.update(b"ATGC", 2);
        ds.update(b"GCTA", 3);
        ds.update(b"TTAA", 4);

        // 3 unique out of 4 = 75% unique
        assert!((ds.unique_rate() - 75.0).abs() < 0.1);
    }

    #[test]
    fn test_duplication_stats_histogram() {
        let mut ds = DuplicationStats::new();
        // 1 unique sequence
        ds.update(b"UNIQUE", 1);
        // 1 sequence seen twice
        ds.update(b"DOUBLE", 2);
        ds.update(b"DOUBLE", 3);
        // 1 sequence seen 3 times
        ds.update(b"TRIPLE", 4);
        ds.update(b"TRIPLE", 5);
        ds.update(b"TRIPLE", 6);

        let hist = ds.duplication_histogram();
        assert_eq!(hist[0], 1); // 1 read with duplication level 1
        assert_eq!(hist[1], 2); // 2 reads with duplication level 2
        assert_eq!(hist[2], 3); // 3 reads with duplication level 3
    }

    #[test]
    fn test_duplication_stats_merge() {
        let mut ds1 = DuplicationStats::new();
        ds1.update(b"ATGC", 1);

        let mut ds2 = DuplicationStats::new();
        ds2.update(b"ATGC", 1);
        ds2.update(b"GCTA", 2);

        ds1.merge(&ds2);

        assert_eq!(ds1.reads_processed(), 3);
        // ATGC appears in both, so 1 unique with count 2
        // GCTA appears once
        // 2 unique sequences, 3 reads -> 1 duplicate
        assert_eq!(ds1.unique_count(), 2);
    }

    #[test]
    fn test_duplication_stats_empty() {
        let mut ds = DuplicationStats::new();
        ds.update(b"", 1);

        assert_eq!(ds.reads_processed(), 0);
        assert_eq!(ds.duplication_rate(), 0.0);
    }

    #[test]
    fn test_duplication_stats_most_duplicated() {
        let mut ds = DuplicationStats::new();
        for i in 1..=5 {
            ds.update(b"COMMON", i);
        }
        for i in 1..=2 {
            ds.update(b"LESS", i + 5);
        }
        ds.update(b"RARE", 8);

        let top = ds.most_duplicated(2);
        assert_eq!(top.len(), 2);
        assert_eq!(top[0].1, 5); // COMMON with 5 counts
        assert_eq!(top[1].1, 2); // LESS with 2 counts
    }

    #[test]
    fn test_duplication_stats_mean_duplication() {
        let mut ds = DuplicationStats::new();
        // 2 sequences, each seen 2 times = 4 reads
        ds.update(b"SEQ1", 1);
        ds.update(b"SEQ1", 2);
        ds.update(b"SEQ2", 3);
        ds.update(b"SEQ2", 4);

        // Mean duplication = 4 reads / 2 unique = 2.0
        assert!((ds.mean_duplication() - 2.0).abs() < 0.001);
    }

    #[test]
    fn test_duplication_stats_serialize() {
        let mut ds = DuplicationStats::new();
        ds.update(b"ATGC", 1);
        ds.update(b"ATGC", 2);

        let json = serde_json::to_string(&ds).unwrap();
        let ds2: DuplicationStats = serde_json::from_str(&json).unwrap();

        assert_eq!(ds.reads_processed(), ds2.reads_processed());
        assert_eq!(ds.unique_count(), ds2.unique_count());
    }

    #[test]
    fn test_hash_sequence_consistency() {
        let hash1 = hash_sequence(b"ATGC");
        let hash2 = hash_sequence(b"ATGC");
        let hash3 = hash_sequence(b"atgc");

        assert_eq!(hash1, hash2);
        assert_eq!(hash1, hash3); // Case insensitive
    }

    #[test]
    fn test_duplication_percentages() {
        let mut ds = DuplicationStats::new();
        // 50 unique reads
        for i in 1..=50 {
            ds.update(format!("UNIQUE{}", i).as_bytes(), i);
        }
        // 50 reads that are duplicates (25 sequences x 2)
        for i in 1..=25 {
            ds.update(format!("DUP{}", i).as_bytes(), 50 + (i * 2) - 1);
            ds.update(format!("DUP{}", i).as_bytes(), 50 + (i * 2));
        }

        let percentages = ds.duplication_percentages();
        // 50% unique (index 0)
        assert!((percentages[0] - 50.0).abs() < 0.1);
        // 50% duplicated at level 2 (index 1)
        assert!((percentages[1] - 50.0).abs() < 0.1);
    }
}