fastars 0.1.0

Ultra-fast QC and trimming for short and long reads
Documentation
//! Base content analysis.
//!
//! This module provides analysis of nucleotide composition
//! at each position in reads.

use serde::{Deserialize, Serialize};

/// Index constants for base counts array.
const A_IDX: usize = 0;
const T_IDX: usize = 1;
const G_IDX: usize = 2;
const C_IDX: usize = 3;
const N_IDX: usize = 4;

/// Lookup table for base to index conversion.
/// Maps ASCII byte values to base indices: A/a=0, T/t=1, G/g=2, C/c=3, others=4 (N)
const BASE_TO_IDX: [usize; 256] = {
    let mut table = [N_IDX; 256];
    table[b'A' as usize] = A_IDX;
    table[b'a' as usize] = A_IDX;
    table[b'T' as usize] = T_IDX;
    table[b't' as usize] = T_IDX;
    table[b'G' as usize] = G_IDX;
    table[b'g' as usize] = G_IDX;
    table[b'C' as usize] = C_IDX;
    table[b'c' as usize] = C_IDX;
    table
};

/// Base content statistics per position.
///
/// Tracks counts of A, T, G, C, N at each position in reads.
/// The internal vector grows dynamically as longer reads are encountered.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BaseContent {
    /// Per-position base counts: [A, T, G, C, N]
    counts: Vec<[u64; 5]>,
}

impl Default for BaseContent {
    fn default() -> Self {
        Self::new()
    }
}

impl BaseContent {
    /// Create a new base content analyzer with default capacity.
    pub fn new() -> Self {
        Self::with_capacity(300)
    }

    /// Create a new base content analyzer with specified capacity.
    pub fn with_capacity(capacity: usize) -> Self {
        Self {
            counts: Vec::with_capacity(capacity),
        }
    }

    /// Update statistics with a sequence.
    ///
    /// This is a hot path - optimized for minimal branching.
    #[inline]
    pub fn update(&mut self, seq: &[u8]) {
        // Extend counts vector if sequence is longer than current capacity
        if seq.len() > self.counts.len() {
            self.counts.resize(seq.len(), [0u64; 5]);
        }

        for (pos, &base) in seq.iter().enumerate() {
            // Map base to index using lookup table (faster than match)
            let idx = BASE_TO_IDX[base as usize];
            self.counts[pos][idx] += 1;
        }
    }

    /// Get the base count ratios at a specific position.
    ///
    /// Returns [A%, T%, G%, C%, N%] as fractions (0.0 to 1.0).
    /// Returns [0.0; 5] if position doesn't exist or has no data.
    pub fn get_ratios(&self, position: usize) -> [f64; 5] {
        if position >= self.counts.len() {
            return [0.0; 5];
        }

        let counts = &self.counts[position];
        let total: u64 = counts.iter().sum();

        if total == 0 {
            return [0.0; 5];
        }

        let total_f = total as f64;
        [
            counts[A_IDX] as f64 / total_f,
            counts[T_IDX] as f64 / total_f,
            counts[G_IDX] as f64 / total_f,
            counts[C_IDX] as f64 / total_f,
            counts[N_IDX] as f64 / total_f,
        ]
    }

    /// Get raw counts at a specific position.
    ///
    /// Returns [A, T, G, C, N] counts.
    pub fn get_counts(&self, position: usize) -> Option<[u64; 5]> {
        self.counts.get(position).copied()
    }

    /// Get the number of positions tracked.
    pub fn len(&self) -> usize {
        self.counts.len()
    }

    /// Returns true if no positions have been recorded.
    pub fn is_empty(&self) -> bool {
        self.counts.is_empty()
    }

    /// Merge statistics from another BaseContent instance.
    ///
    /// Used for combining results from multiple workers.
    pub fn merge(&mut self, other: &BaseContent) {
        // Extend to match the larger size
        if other.counts.len() > self.counts.len() {
            self.counts.resize(other.counts.len(), [0u64; 5]);
        }

        // Add counts element-wise
        for (pos, other_counts) in other.counts.iter().enumerate() {
            for (i, &count) in other_counts.iter().enumerate() {
                self.counts[pos][i] += count;
            }
        }
    }

    /// Get GC ratio at a specific position.
    pub fn gc_ratio(&self, position: usize) -> f64 {
        if position >= self.counts.len() {
            return 0.0;
        }

        let counts = &self.counts[position];
        let gc = counts[G_IDX] + counts[C_IDX];
        let total: u64 = counts.iter().sum();

        if total == 0 {
            0.0
        } else {
            gc as f64 / total as f64
        }
    }

    /// Get all position counts as a slice.
    pub fn all_counts(&self) -> &[[u64; 5]] {
        &self.counts
    }

    /// Create a BaseContent from raw position counts.
    ///
    /// This is used for converting from FastQcStats to QcStats.
    pub fn from_raw_counts(counts: Vec<[u64; 5]>) -> Self {
        Self { counts }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_base_content_new() {
        let bc = BaseContent::new();
        assert!(bc.is_empty());
        assert_eq!(bc.len(), 0);
    }

    #[test]
    fn test_base_content_with_capacity() {
        let bc = BaseContent::with_capacity(500);
        assert!(bc.is_empty());
    }

    #[test]
    fn test_base_content_update_simple() {
        let mut bc = BaseContent::new();
        bc.update(b"ATGC");

        assert_eq!(bc.len(), 4);
        assert_eq!(bc.get_counts(0), Some([1, 0, 0, 0, 0])); // A
        assert_eq!(bc.get_counts(1), Some([0, 1, 0, 0, 0])); // T
        assert_eq!(bc.get_counts(2), Some([0, 0, 1, 0, 0])); // G
        assert_eq!(bc.get_counts(3), Some([0, 0, 0, 1, 0])); // C
    }

    #[test]
    fn test_base_content_update_with_n() {
        let mut bc = BaseContent::new();
        bc.update(b"ATGCN");

        assert_eq!(bc.len(), 5);
        assert_eq!(bc.get_counts(4), Some([0, 0, 0, 0, 1])); // N
    }

    #[test]
    fn test_base_content_update_lowercase() {
        let mut bc = BaseContent::new();
        bc.update(b"atgc");

        assert_eq!(bc.get_counts(0), Some([1, 0, 0, 0, 0])); // a
        assert_eq!(bc.get_counts(1), Some([0, 1, 0, 0, 0])); // t
        assert_eq!(bc.get_counts(2), Some([0, 0, 1, 0, 0])); // g
        assert_eq!(bc.get_counts(3), Some([0, 0, 0, 1, 0])); // c
    }

    #[test]
    fn test_base_content_update_multiple_reads() {
        let mut bc = BaseContent::new();
        bc.update(b"AAAA");
        bc.update(b"TTTT");
        bc.update(b"GGGG");
        bc.update(b"CCCC");

        // Each position should have 1 of each base
        for pos in 0..4 {
            assert_eq!(bc.get_counts(pos), Some([1, 1, 1, 1, 0]));
        }
    }

    #[test]
    fn test_base_content_get_ratios() {
        let mut bc = BaseContent::new();
        bc.update(b"AAAA");
        bc.update(b"TTTT");

        let ratios = bc.get_ratios(0);
        assert!((ratios[0] - 0.5).abs() < 0.001); // 50% A
        assert!((ratios[1] - 0.5).abs() < 0.001); // 50% T
        assert!((ratios[2] - 0.0).abs() < 0.001); // 0% G
        assert!((ratios[3] - 0.0).abs() < 0.001); // 0% C
        assert!((ratios[4] - 0.0).abs() < 0.001); // 0% N
    }

    #[test]
    fn test_base_content_get_ratios_invalid_position() {
        let bc = BaseContent::new();
        let ratios = bc.get_ratios(100);
        assert_eq!(ratios, [0.0; 5]);
    }

    #[test]
    fn test_base_content_merge() {
        let mut bc1 = BaseContent::new();
        bc1.update(b"AAAA");

        let mut bc2 = BaseContent::new();
        bc2.update(b"TTTT");

        bc1.merge(&bc2);

        // Each position should now have 1 A and 1 T
        for pos in 0..4 {
            let counts = bc1.get_counts(pos).unwrap();
            assert_eq!(counts[A_IDX], 1);
            assert_eq!(counts[T_IDX], 1);
        }
    }

    #[test]
    fn test_base_content_merge_different_lengths() {
        let mut bc1 = BaseContent::new();
        bc1.update(b"AA");

        let mut bc2 = BaseContent::new();
        bc2.update(b"TTTT");

        bc1.merge(&bc2);

        assert_eq!(bc1.len(), 4);
        // First two positions have both A and T
        assert_eq!(bc1.get_counts(0).unwrap()[A_IDX], 1);
        assert_eq!(bc1.get_counts(0).unwrap()[T_IDX], 1);
        // Last two positions only have T
        assert_eq!(bc1.get_counts(2).unwrap()[A_IDX], 0);
        assert_eq!(bc1.get_counts(2).unwrap()[T_IDX], 1);
    }

    #[test]
    fn test_base_content_gc_ratio() {
        let mut bc = BaseContent::new();
        bc.update(b"GGCC");
        bc.update(b"AATT");

        // Position 0: G + A -> 50% GC
        assert!((bc.gc_ratio(0) - 0.5).abs() < 0.001);
    }

    #[test]
    fn test_base_content_dynamic_extension() {
        let mut bc = BaseContent::with_capacity(10);
        bc.update(b"A");
        assert_eq!(bc.len(), 1);

        bc.update(b"ATGCATGCATGCATGC"); // 16 bases
        assert_eq!(bc.len(), 16);
    }

    #[test]
    fn test_base_content_serialize() {
        let mut bc = BaseContent::new();
        bc.update(b"ATGC");

        let json = serde_json::to_string(&bc).unwrap();
        let bc2: BaseContent = serde_json::from_str(&json).unwrap();

        assert_eq!(bc.len(), bc2.len());
        for pos in 0..bc.len() {
            assert_eq!(bc.get_counts(pos), bc2.get_counts(pos));
        }
    }
}