fastars 0.1.0

Ultra-fast QC and trimming for short and long reads
Documentation
//! Quality score analysis.
//!
//! This module provides analysis of Phred quality scores
//! at each position and across reads.

use serde::{Deserialize, Serialize};

/// Quality score statistics.
///
/// Tracks quality scores at each position and provides histogram data.
/// Quality scores are expected to be Phred+33 encoded (ASCII 33-126).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct QualityStats {
    /// Per-position quality: (sum of quality scores, count of bases)
    position_stats: Vec<(u64, u64)>,
    /// Quality histogram Q0-Q93 (94 bins)
    histogram: Vec<u64>,
    /// Total quality score sum (for global mean calculation)
    total_quality_sum: u64,
    /// Total bases counted
    total_bases: u64,
}

impl Default for QualityStats {
    fn default() -> Self {
        Self::new()
    }
}

impl QualityStats {
    /// Create a new quality statistics container with default capacity.
    pub fn new() -> Self {
        Self::with_capacity(300)
    }

    /// Create a new quality statistics container with specified capacity.
    pub fn with_capacity(capacity: usize) -> Self {
        Self {
            position_stats: Vec::with_capacity(capacity),
            histogram: vec![0u64; 94],
            total_quality_sum: 0,
            total_bases: 0,
        }
    }

    /// Update statistics with quality scores from a read.
    ///
    /// Quality scores should be Phred+33 encoded (raw ASCII values).
    /// This is a hot path - optimized for minimal branching.
    #[inline]
    pub fn update(&mut self, qual: &[u8]) {
        // Extend position stats if quality string is longer than current
        if qual.len() > self.position_stats.len() {
            self.position_stats.resize(qual.len(), (0, 0));
        }

        for (pos, &q) in qual.iter().enumerate() {
            // Convert from Phred+33 to quality score
            let score = q.saturating_sub(33) as usize;
            let clamped_score = score.min(93); // Clamp to valid range

            // Update position-based stats
            self.position_stats[pos].0 += clamped_score as u64;
            self.position_stats[pos].1 += 1;

            // Update histogram
            self.histogram[clamped_score] += 1;

            // Update totals
            self.total_quality_sum += clamped_score as u64;
            self.total_bases += 1;
        }
    }

    /// Get mean quality score at a specific position.
    pub fn mean_quality_at(&self, position: usize) -> Option<f64> {
        self.position_stats.get(position).and_then(|&(sum, count)| {
            if count == 0 {
                None
            } else {
                Some(sum as f64 / count as f64)
            }
        })
    }

    /// Get global mean quality score across all bases.
    pub fn mean_quality(&self) -> f64 {
        if self.total_bases == 0 {
            0.0
        } else {
            self.total_quality_sum as f64 / self.total_bases as f64
        }
    }

    /// Get median quality score across all bases.
    ///
    /// Uses the histogram for efficient O(1) median calculation.
    pub fn median_quality(&self) -> u8 {
        if self.total_bases == 0 {
            return 0;
        }

        // For median, we need the element at position (n+1)/2 (1-indexed)
        // or equivalently, cumulative > n/2 gives us the right element
        let half = self.total_bases / 2;
        let mut cumulative = 0u64;

        for (score, &count) in self.histogram.iter().enumerate() {
            cumulative += count;
            if cumulative > half {
                return score as u8;
            }
        }

        0
    }

    /// Get the quality histogram (Q0-Q93).
    pub fn histogram(&self) -> &[u64] {
        &self.histogram
    }

    /// Get position-based statistics.
    pub fn position_stats(&self) -> &[(u64, u64)] {
        &self.position_stats
    }

    /// Get the number of positions tracked.
    pub fn len(&self) -> usize {
        self.position_stats.len()
    }

    /// Returns true if no positions have been recorded.
    pub fn is_empty(&self) -> bool {
        self.position_stats.is_empty()
    }

    /// Get total bases counted.
    pub fn total_bases(&self) -> u64 {
        self.total_bases
    }

    /// Get Q20 percentage (% of bases with Q >= 20).
    pub fn q20_percent(&self) -> f64 {
        if self.total_bases == 0 {
            return 0.0;
        }

        let q20_plus: u64 = self.histogram[20..].iter().sum();
        (q20_plus as f64 / self.total_bases as f64) * 100.0
    }

    /// Get Q30 percentage (% of bases with Q >= 30).
    pub fn q30_percent(&self) -> f64 {
        if self.total_bases == 0 {
            return 0.0;
        }

        let q30_plus: u64 = self.histogram[30..].iter().sum();
        (q30_plus as f64 / self.total_bases as f64) * 100.0
    }

    /// Merge statistics from another QualityStats instance.
    ///
    /// Used for combining results from multiple workers.
    pub fn merge(&mut self, other: &QualityStats) {
        // Extend position stats to match the larger size
        if other.position_stats.len() > self.position_stats.len() {
            self.position_stats
                .resize(other.position_stats.len(), (0, 0));
        }

        // Add position stats element-wise
        for (pos, &(sum, count)) in other.position_stats.iter().enumerate() {
            self.position_stats[pos].0 += sum;
            self.position_stats[pos].1 += count;
        }

        // Add histogram counts
        for (i, &count) in other.histogram.iter().enumerate() {
            self.histogram[i] += count;
        }

        // Add totals
        self.total_quality_sum += other.total_quality_sum;
        self.total_bases += other.total_bases;
    }

    /// Create a QualityStats from raw data.
    ///
    /// This is used for converting from FastQcStats to QcStats.
    /// `position_sums` and `position_counts` are per-position quality sum and count.
    /// `histogram` is the global quality histogram (Q0-Q93).
    pub fn from_raw(
        position_sums: &[u64],
        position_counts: &[u64],
        histogram: &[u64],
        total_quality_sum: u64,
        total_bases: u64,
    ) -> Self {
        let position_stats: Vec<(u64, u64)> = position_sums
            .iter()
            .zip(position_counts.iter())
            .map(|(&s, &c)| (s, c))
            .collect();

        let mut hist = vec![0u64; 94];
        for (i, &count) in histogram.iter().enumerate().take(94) {
            hist[i] = count;
        }

        Self {
            position_stats,
            histogram: hist,
            total_quality_sum,
            total_bases,
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_quality_stats_new() {
        let qs = QualityStats::new();
        assert!(qs.is_empty());
        assert_eq!(qs.len(), 0);
        assert_eq!(qs.total_bases(), 0);
    }

    #[test]
    fn test_quality_stats_update_simple() {
        let mut qs = QualityStats::new();
        // "IIII" in Phred+33 is quality 40 (I = ASCII 73, 73-33=40)
        qs.update(b"IIII");

        assert_eq!(qs.len(), 4);
        assert_eq!(qs.total_bases(), 4);
        assert!((qs.mean_quality() - 40.0).abs() < 0.001);
    }

    #[test]
    fn test_quality_stats_update_varying_quality() {
        let mut qs = QualityStats::new();
        // '!' = Q0 (33-33), '+' = Q10 (43-33), '5' = Q20 (53-33), '?' = Q30 (63-33)
        qs.update(b"!+5?");

        assert_eq!(qs.len(), 4);
        assert!((qs.mean_quality() - 15.0).abs() < 0.001); // (0+10+20+30)/4 = 15
    }

    #[test]
    fn test_quality_stats_mean_quality_at_position() {
        let mut qs = QualityStats::new();
        // Two reads with different quality at position 0
        qs.update(b"I"); // Q40
        qs.update(b"?"); // Q30

        let mean = qs.mean_quality_at(0).unwrap();
        assert!((mean - 35.0).abs() < 0.001); // (40+30)/2 = 35
    }

    #[test]
    fn test_quality_stats_median_quality() {
        let mut qs = QualityStats::new();
        // Create an odd number of quality scores
        qs.update(b"!"); // Q0
        qs.update(b"?"); // Q30
        qs.update(b"I"); // Q40

        // Median should be Q30 (middle value)
        assert_eq!(qs.median_quality(), 30);
    }

    #[test]
    fn test_quality_stats_histogram() {
        let mut qs = QualityStats::new();
        qs.update(b"IIII"); // All Q40

        let hist = qs.histogram();
        assert_eq!(hist[40], 4);
        assert_eq!(hist[0], 0);
    }

    #[test]
    fn test_quality_stats_q20_q30() {
        let mut qs = QualityStats::new();
        // 2 bases Q20+, 1 base Q30+
        qs.update(b"!5?"); // Q0, Q20, Q30

        assert!((qs.q20_percent() - 66.666).abs() < 0.1);
        assert!((qs.q30_percent() - 33.333).abs() < 0.1);
    }

    #[test]
    fn test_quality_stats_merge() {
        let mut qs1 = QualityStats::new();
        qs1.update(b"II"); // Q40, Q40

        let mut qs2 = QualityStats::new();
        qs2.update(b"??"); // Q30, Q30

        qs1.merge(&qs2);

        assert_eq!(qs1.total_bases(), 4);
        assert!((qs1.mean_quality() - 35.0).abs() < 0.001); // (40+40+30+30)/4 = 35
    }

    #[test]
    fn test_quality_stats_merge_different_lengths() {
        let mut qs1 = QualityStats::new();
        qs1.update(b"I");

        let mut qs2 = QualityStats::new();
        qs2.update(b"IIII");

        qs1.merge(&qs2);

        assert_eq!(qs1.len(), 4);
        // Position 0 should have 2 bases
        let (sum, count) = qs1.position_stats()[0];
        assert_eq!(count, 2);
        assert_eq!(sum, 80); // 40 + 40
    }

    #[test]
    fn test_quality_stats_clamping() {
        let mut qs = QualityStats::new();
        // Very high quality score (beyond normal range)
        qs.update(&[126]); // 126-33=93, should not overflow

        assert_eq!(qs.histogram()[93], 1);
    }

    #[test]
    fn test_quality_stats_serialize() {
        let mut qs = QualityStats::new();
        qs.update(b"IIII");

        let json = serde_json::to_string(&qs).unwrap();
        let qs2: QualityStats = serde_json::from_str(&json).unwrap();

        assert_eq!(qs.total_bases(), qs2.total_bases());
        assert!((qs.mean_quality() - qs2.mean_quality()).abs() < 0.001);
    }

    #[test]
    fn test_quality_stats_empty_mean() {
        let qs = QualityStats::new();
        assert_eq!(qs.mean_quality(), 0.0);
        assert_eq!(qs.median_quality(), 0);
    }

    #[test]
    fn test_quality_stats_empty_position() {
        let qs = QualityStats::new();
        assert!(qs.mean_quality_at(0).is_none());
    }
}