fastars 0.1.0

Ultra-fast QC and trimming for short and long reads
Documentation
//! GC content analysis.
//!
//! This module provides GC content calculation and distribution analysis.

use serde::{Deserialize, Serialize};

/// GC content statistics.
///
/// Tracks per-read GC percentage distribution using a histogram with 1% bins.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct GcStats {
    /// Histogram of GC content: bins 0-100 represent 0%-100% GC
    histogram: Vec<u64>,
    /// Total GC bases across all reads
    total_gc: u64,
    /// Total bases across all reads
    total_bases: u64,
    /// Total reads processed
    total_reads: u64,
}

impl Default for GcStats {
    fn default() -> Self {
        Self::new()
    }
}

impl GcStats {
    /// Create a new GC statistics container.
    pub fn new() -> Self {
        Self {
            histogram: vec![0u64; 101],
            total_gc: 0,
            total_bases: 0,
            total_reads: 0,
        }
    }

    /// Update statistics with a sequence.
    ///
    /// Calculates the GC percentage of the sequence and updates the histogram.
    /// This is a hot path - optimized for minimal branching.
    #[inline]
    pub fn update(&mut self, seq: &[u8]) {
        if seq.is_empty() {
            return;
        }

        let mut gc_count: u64 = 0;

        for &base in seq {
            match base {
                b'G' | b'g' | b'C' | b'c' => gc_count += 1,
                _ => {}
            }
        }

        let len = seq.len() as u64;
        let gc_percent = ((gc_count * 100) / len) as usize;
        let gc_percent = gc_percent.min(100); // Clamp to valid range

        self.histogram[gc_percent] += 1;
        self.total_gc += gc_count;
        self.total_bases += len;
        self.total_reads += 1;
    }

    /// Get mean GC content as a percentage (0.0 to 100.0).
    pub fn mean_gc(&self) -> f64 {
        if self.total_bases == 0 {
            0.0
        } else {
            (self.total_gc as f64 / self.total_bases as f64) * 100.0
        }
    }

    /// Get the GC content histogram.
    ///
    /// Each bin represents reads with that integer percentage of GC content.
    pub fn histogram(&self) -> &[u64] {
        &self.histogram
    }

    /// Get total reads processed.
    pub fn total_reads(&self) -> u64 {
        self.total_reads
    }

    /// Get total bases processed.
    pub fn total_bases(&self) -> u64 {
        self.total_bases
    }

    /// Get total GC bases counted.
    pub fn total_gc(&self) -> u64 {
        self.total_gc
    }

    /// Get median GC content percentage.
    ///
    /// Uses the histogram for efficient calculation.
    pub fn median_gc(&self) -> u8 {
        if self.total_reads == 0 {
            return 0;
        }

        // For median, we need the element at position (n+1)/2 (1-indexed)
        let half = self.total_reads / 2;
        let mut cumulative = 0u64;

        for (gc_percent, &count) in self.histogram.iter().enumerate() {
            cumulative += count;
            if cumulative > half {
                return gc_percent as u8;
            }
        }

        50 // Default to 50% if something goes wrong
    }

    /// Get the most common GC percentage (mode).
    pub fn mode_gc(&self) -> u8 {
        let (mode, _) = self
            .histogram
            .iter()
            .enumerate()
            .max_by_key(|&(_, count)| count)
            .unwrap_or((50, &0));
        mode as u8
    }

    /// Get percentage of reads with GC content in the given range.
    pub fn percent_in_range(&self, min_gc: u8, max_gc: u8) -> f64 {
        if self.total_reads == 0 {
            return 0.0;
        }

        let min = min_gc as usize;
        let max = (max_gc as usize).min(100);

        let count: u64 = self.histogram[min..=max].iter().sum();
        (count as f64 / self.total_reads as f64) * 100.0
    }

    /// Merge statistics from another GcStats instance.
    ///
    /// Used for combining results from multiple workers.
    pub fn merge(&mut self, other: &GcStats) {
        for (i, &count) in other.histogram.iter().enumerate() {
            self.histogram[i] += count;
        }
        self.total_gc += other.total_gc;
        self.total_bases += other.total_bases;
        self.total_reads += other.total_reads;
    }

    /// Create a GcStats from raw data.
    ///
    /// This is used for converting from FastQcStats to QcStats.
    pub fn from_raw(histogram: &[u64; 101], total_gc: u64, total_bases: u64, total_reads: u64) -> Self {
        Self {
            histogram: histogram.to_vec(),
            total_gc,
            total_bases,
            total_reads,
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_gc_stats_new() {
        let gc = GcStats::new();
        assert_eq!(gc.total_reads(), 0);
        assert_eq!(gc.total_bases(), 0);
        assert_eq!(gc.total_gc(), 0);
    }

    #[test]
    fn test_gc_stats_update_all_gc() {
        let mut gc = GcStats::new();
        gc.update(b"GCGCGCGCGC"); // 100% GC

        assert_eq!(gc.total_reads(), 1);
        assert_eq!(gc.histogram()[100], 1);
        assert!((gc.mean_gc() - 100.0).abs() < 0.001);
    }

    #[test]
    fn test_gc_stats_update_no_gc() {
        let mut gc = GcStats::new();
        gc.update(b"ATATATATAT"); // 0% GC

        assert_eq!(gc.total_reads(), 1);
        assert_eq!(gc.histogram()[0], 1);
        assert!((gc.mean_gc() - 0.0).abs() < 0.001);
    }

    #[test]
    fn test_gc_stats_update_50_percent() {
        let mut gc = GcStats::new();
        gc.update(b"ATGC"); // 50% GC

        assert_eq!(gc.histogram()[50], 1);
        assert!((gc.mean_gc() - 50.0).abs() < 0.001);
    }

    #[test]
    fn test_gc_stats_update_lowercase() {
        let mut gc = GcStats::new();
        gc.update(b"atgc"); // 50% GC

        assert_eq!(gc.histogram()[50], 1);
        assert!((gc.mean_gc() - 50.0).abs() < 0.001);
    }

    #[test]
    fn test_gc_stats_update_with_n() {
        let mut gc = GcStats::new();
        gc.update(b"GCNNAT"); // 2 GC out of 6 = 33%

        assert_eq!(gc.histogram()[33], 1);
    }

    #[test]
    fn test_gc_stats_update_empty() {
        let mut gc = GcStats::new();
        gc.update(b"");

        assert_eq!(gc.total_reads(), 0);
        assert_eq!(gc.total_bases(), 0);
    }

    #[test]
    fn test_gc_stats_mean_gc_multiple_reads() {
        let mut gc = GcStats::new();
        gc.update(b"GGCC"); // 100% GC, 4 bases
        gc.update(b"AATT"); // 0% GC, 4 bases

        // Total: 4 GC out of 8 bases = 50%
        assert!((gc.mean_gc() - 50.0).abs() < 0.001);
    }

    #[test]
    fn test_gc_stats_median_gc() {
        let mut gc = GcStats::new();
        gc.update(b"GGGGGGGGGG"); // 100% GC
        gc.update(b"GCGCGCGCGC"); // 100% GC
        gc.update(b"AAAAAAAAAA"); // 0% GC

        // Median should be 100% (2 reads at 100%, 1 at 0%)
        assert_eq!(gc.median_gc(), 100);
    }

    #[test]
    fn test_gc_stats_mode_gc() {
        let mut gc = GcStats::new();
        gc.update(b"ATGC"); // 50%
        gc.update(b"ATGC"); // 50%
        gc.update(b"GGGG"); // 100%

        assert_eq!(gc.mode_gc(), 50);
    }

    #[test]
    fn test_gc_stats_percent_in_range() {
        let mut gc = GcStats::new();
        gc.update(b"ATGC"); // 50%
        gc.update(b"ATGC"); // 50%
        gc.update(b"GGGG"); // 100%
        gc.update(b"AAAA"); // 0%

        // 2 out of 4 reads in 40-60% range
        assert!((gc.percent_in_range(40, 60) - 50.0).abs() < 0.1);
    }

    #[test]
    fn test_gc_stats_merge() {
        let mut gc1 = GcStats::new();
        gc1.update(b"GGGG"); // 100% GC

        let mut gc2 = GcStats::new();
        gc2.update(b"AAAA"); // 0% GC

        gc1.merge(&gc2);

        assert_eq!(gc1.total_reads(), 2);
        assert_eq!(gc1.histogram()[100], 1);
        assert_eq!(gc1.histogram()[0], 1);
        assert!((gc1.mean_gc() - 50.0).abs() < 0.001);
    }

    #[test]
    fn test_gc_stats_serialize() {
        let mut gc = GcStats::new();
        gc.update(b"ATGC");

        let json = serde_json::to_string(&gc).unwrap();
        let gc2: GcStats = serde_json::from_str(&json).unwrap();

        assert_eq!(gc.total_reads(), gc2.total_reads());
        assert!((gc.mean_gc() - gc2.mean_gc()).abs() < 0.001);
    }

    #[test]
    fn test_gc_stats_empty_mean() {
        let gc = GcStats::new();
        assert_eq!(gc.mean_gc(), 0.0);
        assert_eq!(gc.median_gc(), 0);
    }
}