use rustc_hash::FxHashMap;
use super::base_content::BaseContent;
use super::duplication::DuplicationStats;
use super::gc::GcStats;
use super::kmer::KmerStats;
use super::length::LengthStats;
use super::quality::QualityStats;
use super::stats::{FilterFailures, QcStats, QcSummary};
use super::Mode;
// Indices into each per-cycle `[u64; 5]` base-count bucket.
const A_IDX: usize = 0;
const T_IDX: usize = 1;
const G_IDX: usize = 2;
const C_IDX: usize = 3;
// Any byte that is not A/C/G/T (either case) falls into the N bucket.
const N_IDX: usize = 4;
// Byte -> base-bucket lookup, built at compile time; case-insensitive,
// everything unrecognized defaults to `N_IDX`.
const BASE_TO_IDX: [usize; 256] = {
let mut table = [N_IDX; 256];
table[b'A' as usize] = A_IDX;
table[b'a' as usize] = A_IDX;
table[b'T' as usize] = T_IDX;
table[b't' as usize] = T_IDX;
table[b'G' as usize] = G_IDX;
table[b'g' as usize] = G_IDX;
table[b'C' as usize] = C_IDX;
table[b'c' as usize] = C_IDX;
table
};
// Byte -> 2-bit base encoding (A=0, C=1, G=2, T=3) used to pack 5-mers;
// 255 marks an ambiguous base that resets the rolling k-mer window.
const BASE_TO_BITS: [u8; 256] = {
let mut table = [255u8; 256];
table[b'A' as usize] = 0;
table[b'a' as usize] = 0;
table[b'C' as usize] = 1;
table[b'c' as usize] = 1;
table[b'G' as usize] = 2;
table[b'g' as usize] = 2;
table[b'T' as usize] = 3;
table[b't' as usize] = 3;
table
};
// 4^5 = 1024 possible 5-mers over the 2-bit alphabet (10 bits).
const FIVEMER_ARRAY_SIZE: usize = 1024;
// Number of leading reads hashed for the duplication-rate estimate.
const DEFAULT_DUP_SAMPLE_SIZE: u64 = 100_000;
/// FNV-1a hash over the sequence bytes, uppercased first so that
/// `acgt` and `ACGT` hash identically.
#[inline]
fn fnv_hash(seq: &[u8]) -> u64 {
    // Standard 64-bit FNV-1a offset basis and prime.
    const FNV_OFFSET: u64 = 0xcbf29ce484222325;
    const FNV_PRIME: u64 = 0x100000001b3;
    seq.iter().fold(FNV_OFFSET, |acc, &byte| {
        (acc ^ byte.to_ascii_uppercase() as u64).wrapping_mul(FNV_PRIME)
    })
}
/// Streaming QC accumulator: every counter is updated in a single pass per
/// read by `update_fast`, and partial accumulators can be combined with
/// `merge`/`merge_ref` before conversion to a `QcStats` report.
#[derive(Debug, Clone)]
pub struct FastQcStats {
// Per-cycle A/T/G/C/N counts, indexed by the *_IDX constants.
base_counts: Vec<[u64; 5]>,
// Per-cycle sum of (clamped) Phred scores.
qual_sums: Vec<u64>,
// Per-cycle number of bases observed.
qual_counts: Vec<u64>,
// Per-cycle count of bases with quality >= 20 (includes >= 30).
q20_counts: Vec<u64>,
// Per-cycle count of bases with quality >= 30.
q30_counts: Vec<u64>,
// Global histogram of clamped Phred scores, 94 bins (0..=93).
qual_histogram: Vec<u64>,
// Per-read GC% histogram, one bin per integer percent 0..=100.
gc_histogram: [u64; 101],
// Read-length -> read-count map.
length_distribution: std::collections::HashMap<usize, u64>,
total_reads: u64,
total_bases: u64,
// Total G/C bases across all reads.
total_gc: u64,
// Sum of all clamped per-base quality scores.
total_qual_sum: u64,
// usize::MAX sentinel until the first read is seen.
min_length: usize,
max_length: usize,
// Counts of each packed 5-mer (see BASE_TO_BITS).
kmer_counts: [u64; FIVEMER_ARRAY_SIZE],
total_kmers: u64,
// FNV hash of each sampled read -> occurrence count, for duplication rate.
dup_hashes: FxHashMap<u64, u64>,
// True while still within the duplication sampling window.
dup_sampling_active: bool,
// Number of leading reads to sample for duplication (0 when disabled).
dup_sample_size: u64,
// Master switch for duplication estimation.
dup_eval_enabled: bool,
filter_failures: FilterFailures,
// Overrepresented-sequence / k-mer tracker (may be disabled).
kmer_stats: KmerStats,
mode: Mode,
}
impl Default for FastQcStats {
fn default() -> Self {
Self::new(Mode::Short)
}
}
impl FastQcStats {
/// Creates an empty collector with every feature enabled, sized for the
/// typical read length of `mode`.
pub fn new(mode: Mode) -> Self {
    let cap = mode.default_capacity();
    Self {
        // Per-cycle accumulators pre-sized to the expected read length.
        base_counts: Vec::with_capacity(cap),
        qual_sums: Vec::with_capacity(cap),
        qual_counts: Vec::with_capacity(cap),
        q20_counts: Vec::with_capacity(cap),
        q30_counts: Vec::with_capacity(cap),
        // Phred scores are clamped to 0..=93, hence 94 bins.
        qual_histogram: vec![0u64; 94],
        gc_histogram: [0u64; 101],
        length_distribution: std::collections::HashMap::new(),
        total_reads: 0,
        total_bases: 0,
        total_gc: 0,
        total_qual_sum: 0,
        // Sentinel: the first read always lowers this.
        min_length: usize::MAX,
        max_length: 0,
        kmer_counts: [0u64; FIVEMER_ARRAY_SIZE],
        total_kmers: 0,
        dup_hashes: FxHashMap::default(),
        dup_sampling_active: true,
        dup_sample_size: DEFAULT_DUP_SAMPLE_SIZE,
        dup_eval_enabled: true,
        filter_failures: FilterFailures::default(),
        kmer_stats: KmerStats::new(),
        mode,
    }
}
pub fn with_duplication_disabled(mode: Mode) -> Self {
let capacity = mode.default_capacity();
Self {
base_counts: Vec::with_capacity(capacity),
qual_sums: Vec::with_capacity(capacity),
qual_counts: Vec::with_capacity(capacity),
q20_counts: Vec::with_capacity(capacity),
q30_counts: Vec::with_capacity(capacity),
qual_histogram: vec![0u64; 94],
gc_histogram: [0u64; 101],
length_distribution: std::collections::HashMap::new(),
total_reads: 0,
total_bases: 0,
total_gc: 0,
total_qual_sum: 0,
min_length: usize::MAX,
max_length: 0,
kmer_counts: [0u64; FIVEMER_ARRAY_SIZE],
total_kmers: 0,
dup_hashes: FxHashMap::default(),
dup_sampling_active: false,
dup_sample_size: 0,
dup_eval_enabled: false,
filter_failures: FilterFailures::default(),
kmer_stats: KmerStats::new(),
mode,
}
}
pub fn with_full_config(
mode: Mode,
dup_enabled: bool,
overrep_enabled: bool,
overrep_sampling: u32,
) -> Self {
let capacity = mode.default_capacity();
let kmer_stats = if overrep_enabled {
KmerStats::with_sampling_rate(overrep_sampling)
} else {
KmerStats::disabled()
};
Self {
base_counts: Vec::with_capacity(capacity),
qual_sums: Vec::with_capacity(capacity),
qual_counts: Vec::with_capacity(capacity),
q20_counts: Vec::with_capacity(capacity),
q30_counts: Vec::with_capacity(capacity),
qual_histogram: vec![0u64; 94],
gc_histogram: [0u64; 101],
length_distribution: std::collections::HashMap::new(),
total_reads: 0,
total_bases: 0,
total_gc: 0,
total_qual_sum: 0,
min_length: usize::MAX,
max_length: 0,
kmer_counts: [0u64; FIVEMER_ARRAY_SIZE],
total_kmers: 0,
dup_hashes: FxHashMap::default(),
dup_sampling_active: dup_enabled,
dup_sample_size: if dup_enabled { DEFAULT_DUP_SAMPLE_SIZE } else { 0 },
dup_eval_enabled: dup_enabled,
filter_failures: FilterFailures::default(),
kmer_stats,
mode,
}
}
/// Convenience constructor for short-read mode.
pub fn new_short() -> Self {
Self::new(Mode::Short)
}
/// Convenience constructor for long-read mode.
pub fn new_long() -> Self {
Self::new(Mode::Long)
}
/// Grows every per-cycle vector to at least `len` entries; never shrinks.
/// All five vectors are kept at the same length.
#[inline]
fn ensure_capacity(&mut self, len: usize) {
    if self.base_counts.len() >= len {
        return;
    }
    self.base_counts.resize(len, [0u64; 5]);
    self.qual_sums.resize(len, 0);
    self.qual_counts.resize(len, 0);
    self.q20_counts.resize(len, 0);
    self.q30_counts.resize(len, 0);
}
/// Ingests one read's sequence and quality bytes, updating every
/// accumulator in a single pass over the bases.
///
/// NOTE(review): `qual[i]` is indexed in lock-step with `seq`, so this
/// assumes `qual.len() >= seq.len()` and panics otherwise — confirm the
/// upstream FASTQ parser guarantees equal lengths.
#[inline]
pub fn update_fast(&mut self, seq: &[u8], qual: &[u8]) {
    let len = seq.len();
    // Empty reads contribute nothing (and would break the GC division below).
    if len == 0 {
        return;
    }
    self.ensure_capacity(len);
    self.total_reads += 1;
    self.total_bases += len as u64;
    self.min_length = self.min_length.min(len);
    self.max_length = self.max_length.max(len);
    *self.length_distribution.entry(len).or_insert(0) += 1;
    let mut gc_count = 0u64;
    // Rolling 2-bit-per-base 5-mer; `kmer_len` counts consecutive valid
    // (A/C/G/T) bases since the last ambiguous base.
    let mut kmer: u32 = 0;
    let mut kmer_len = 0u8;
    for i in 0..len {
        let base = seq[i];
        let q = qual[i];
        let b_idx = BASE_TO_IDX[base as usize];
        self.base_counts[i][b_idx] += 1;
        // Phred+33 decode, clamped into the histogram's 0..=93 range.
        let q_val = q.saturating_sub(33) as u64;
        let clamped_q = q_val.min(93);
        self.qual_sums[i] += clamped_q;
        self.qual_counts[i] += 1;
        self.qual_histogram[clamped_q as usize] += 1;
        self.total_qual_sum += clamped_q;
        if clamped_q >= 30 {
            // Q30 bases also count toward the Q20 tally (Q20 means >= 20).
            self.q30_counts[i] += 1;
            self.q20_counts[i] += 1;
        } else if clamped_q >= 20 {
            self.q20_counts[i] += 1;
        }
        if base == b'G' || base == b'g' || base == b'C' || base == b'c' {
            gc_count += 1;
        }
        let base_bits = BASE_TO_BITS[base as usize];
        if base_bits == 255 {
            // Ambiguous base: restart the 5-mer window. Stale bits left in
            // `kmer` are fully shifted out before `kmer_len` reaches 5 again.
            kmer_len = 0;
        } else {
            // Shift in 2 bits, keep only the low 10 bits (five bases).
            kmer = ((kmer << 2) & 0x3FF) | (base_bits as u32);
            kmer_len = (kmer_len + 1).min(5);
            if kmer_len >= 5 {
                self.kmer_counts[kmer as usize] += 1;
                self.total_kmers += 1;
            }
        }
    }
    // Integer division truncates the per-read GC percentage toward zero.
    let gc_pct = ((gc_count * 100) / len as u64).min(100) as usize;
    self.gc_histogram[gc_pct] += 1;
    self.total_gc += gc_count;
    if self.dup_eval_enabled {
        // Only the first `dup_sample_size` reads are hashed, bounding memory.
        if self.dup_sampling_active && self.total_reads <= self.dup_sample_size {
            let hash = fnv_hash(seq);
            *self.dup_hashes.entry(hash).or_insert(0) += 1;
        }
        if self.total_reads >= self.dup_sample_size {
            self.dup_sampling_active = false;
        }
    }
    self.kmer_stats.update(seq, self.total_reads);
}
/// Merges `other` into `self`, consuming it.
///
/// The body previously duplicated `merge_ref` line-for-line; delegating
/// removes the duplication so the two merge paths cannot drift apart.
pub fn merge(&mut self, other: FastQcStats) {
    self.merge_ref(&other);
}
/// Adds every accumulator from `other` into `self` without consuming it.
pub fn merge_ref(&mut self, other: &FastQcStats) {
    // Grow our per-cycle vectors first so zipping covers every cycle that
    // `other` has seen (self's vectors end up at least as long as other's).
    let max_len = self.base_counts.len().max(other.base_counts.len());
    self.ensure_capacity(max_len);
    for (dst, src) in self.base_counts.iter_mut().zip(&other.base_counts) {
        for (d, s) in dst.iter_mut().zip(src.iter()) {
            *d += *s;
        }
    }
    for (dst, src) in self.qual_sums.iter_mut().zip(&other.qual_sums) {
        *dst += *src;
    }
    for (dst, src) in self.qual_counts.iter_mut().zip(&other.qual_counts) {
        *dst += *src;
    }
    for (dst, src) in self.q20_counts.iter_mut().zip(&other.q20_counts) {
        *dst += *src;
    }
    for (dst, src) in self.q30_counts.iter_mut().zip(&other.q30_counts) {
        *dst += *src;
    }
    // Histograms and the k-mer table have fixed, matching sizes.
    for (dst, src) in self.qual_histogram.iter_mut().zip(&other.qual_histogram) {
        *dst += *src;
    }
    for (dst, src) in self.gc_histogram.iter_mut().zip(&other.gc_histogram) {
        *dst += *src;
    }
    for (dst, src) in self.kmer_counts.iter_mut().zip(&other.kmer_counts) {
        *dst += *src;
    }
    for (&len, &count) in &other.length_distribution {
        *self.length_distribution.entry(len).or_default() += count;
    }
    for (&hash, &count) in &other.dup_hashes {
        *self.dup_hashes.entry(hash).or_default() += count;
    }
    self.total_reads += other.total_reads;
    self.total_bases += other.total_bases;
    self.total_gc += other.total_gc;
    self.total_qual_sum += other.total_qual_sum;
    self.total_kmers += other.total_kmers;
    // Length extremes only move if `other` actually saw reads; otherwise
    // its min is the usize::MAX sentinel and its max is 0.
    if other.total_reads > 0 {
        self.min_length = self.min_length.min(other.min_length);
        self.max_length = self.max_length.max(other.max_length);
    }
    self.filter_failures.merge(&other.filter_failures);
    self.kmer_stats.merge(&other.kmer_stats);
}
/// Number of reads ingested so far.
pub fn total_reads(&self) -> u64 {
    self.total_reads
}
/// Number of bases ingested so far.
pub fn total_bases(&self) -> u64 {
    self.total_bases
}
/// Mean read length in bases; 0.0 when no reads have been seen.
pub fn mean_length(&self) -> f64 {
    match self.total_reads {
        0 => 0.0,
        reads => self.total_bases as f64 / reads as f64,
    }
}
/// Mean per-base quality score; 0.0 when no bases have been seen.
pub fn mean_quality(&self) -> f64 {
    match self.total_bases {
        0 => 0.0,
        bases => self.total_qual_sum as f64 / bases as f64,
    }
}
/// Overall GC content as a percentage; 0.0 when no bases have been seen.
pub fn mean_gc(&self) -> f64 {
    match self.total_bases {
        0 => 0.0,
        bases => (self.total_gc as f64 / bases as f64) * 100.0,
    }
}
/// Percent of all bases whose quality is at or above `threshold`
/// (tail sum of the global quality histogram).
fn phred_tail_percent(&self, threshold: usize) -> f64 {
    if self.total_bases == 0 {
        return 0.0;
    }
    let tail: u64 = self.qual_histogram[threshold..].iter().sum();
    (tail as f64 / self.total_bases as f64) * 100.0
}
/// Percent of bases with quality >= 20.
pub fn q20_percent(&self) -> f64 {
    self.phred_tail_percent(20)
}
/// Percent of bases with quality >= 30.
pub fn q30_percent(&self) -> f64 {
    self.phred_tail_percent(30)
}
/// Read-length N50: the largest length L such that reads of length >= L
/// together contain at least half of all bases. Returns 0 when empty.
pub fn n50(&self) -> usize {
    if self.total_reads == 0 {
        return 0;
    }
    let mut by_len: Vec<(usize, u64)> = self
        .length_distribution
        .iter()
        .map(|(&len, &count)| (len, count))
        .collect();
    // Longest first; lengths are unique map keys, so unstable sort is safe.
    by_len.sort_unstable_by_key(|&(len, _)| std::cmp::Reverse(len));
    let target = self.total_bases / 2;
    let mut covered: u64 = 0;
    for (len, count) in by_len {
        covered += (len as u64) * count;
        if covered >= target {
            return len;
        }
    }
    0
}
/// Estimated duplicate-read rate, in percent, over the sampled reads:
/// (sampled - unique) / sampled * 100.
pub fn duplication_rate(&self) -> f64 {
    let sampled: u64 = self.dup_hashes.values().sum();
    if sampled == 0 {
        return 0.0;
    }
    let unique = self.dup_hashes.len() as u64;
    let dupes = sampled.saturating_sub(unique);
    (dupes as f64 / sampled as f64) * 100.0
}
/// The analysis mode this collector was created with.
pub fn mode(&self) -> Mode {
self.mode
}
/// True when no reads have been ingested yet.
pub fn is_empty(&self) -> bool {
self.total_reads == 0
}
/// Mutable access to the filter-failure counters.
pub fn filter_failures_mut(&mut self) -> &mut FilterFailures {
&mut self.filter_failures
}
/// Read-only access to the filter-failure counters.
pub fn filter_failures(&self) -> &FilterFailures {
&self.filter_failures
}
/// Builds a snapshot of the headline metrics as a `QcSummary`.
pub fn summary(&self) -> QcSummary {
    let mean_length = self.mean_length();
    let mean_quality = self.mean_quality();
    let mean_gc = self.mean_gc();
    let q20_percent = self.q20_percent();
    let q30_percent = self.q30_percent();
    let n50 = self.n50();
    let duplication_rate = self.duplication_rate();
    QcSummary {
        total_reads: self.total_reads,
        total_bases: self.total_bases,
        mean_length,
        mean_quality,
        mean_gc,
        q20_percent,
        q30_percent,
        n50,
        duplication_rate,
    }
}
/// Per-cycle A/T/G/C/N counts (indexed by the *_IDX constants).
pub fn base_counts(&self) -> &[[u64; 5]] {
&self.base_counts
}
/// Global quality-score histogram (94 bins, clamped Phred 0..=93).
pub fn qual_histogram(&self) -> &[u64] {
&self.qual_histogram
}
/// Per-read GC% histogram, one bin per integer percent.
pub fn gc_histogram(&self) -> &[u64; 101] {
&self.gc_histogram
}
/// Read-length -> read-count distribution.
pub fn length_distribution(&self) -> &std::collections::HashMap<usize, u64> {
&self.length_distribution
}
/// Counts of each packed 5-mer.
pub fn kmer_counts(&self) -> &[u64; FIVEMER_ARRAY_SIZE] {
&self.kmer_counts
}
/// Total number of 5-mers counted.
pub fn total_kmers(&self) -> u64 {
self.total_kmers
}
/// Converts the raw accumulators into the full `QcStats` report type,
/// cloning the internal state so `self` remains usable afterwards.
pub fn to_qc_stats(&self) -> QcStats {
let base_content = BaseContent::from_raw_counts(self.base_counts.clone());
let quality = QualityStats::from_raw(
&self.qual_sums,
&self.qual_counts,
&self.qual_histogram,
self.total_qual_sum,
self.total_bases,
);
let gc = GcStats::from_raw(
&self.gc_histogram,
self.total_gc,
self.total_bases,
self.total_reads,
);
let length = LengthStats::from_raw(
self.length_distribution.clone(),
// Replace the usize::MAX sentinel with 0 when no reads were seen.
if self.total_reads > 0 { self.min_length } else { 0 },
self.max_length,
self.total_bases,
self.total_reads,
);
let kmer = self.kmer_stats.clone();
let duplication = DuplicationStats::from_raw(
self.dup_hashes.clone(),
self.dup_sample_size,
);
let mut qc = QcStats::new(self.mode);
qc.total_reads = self.total_reads;
qc.total_bases = self.total_bases;
qc.base_content = base_content;
qc.quality = quality;
qc.gc = gc;
qc.length = length;
qc.kmer = kmer;
qc.duplication = duplication;
qc.filter_failures = self.filter_failures.clone();
qc
}
}
impl From<FastQcStats> for QcStats {
/// Finalizes the accumulator into a full report via `to_qc_stats`.
fn from(fast: FastQcStats) -> Self {
fast.to_qc_stats()
}
}
#[cfg(test)]
mod tests {
    use super::*;

    // Encodes raw Phred scores as Phred+33 ASCII quality bytes.
    fn make_qual(scores: &[u8]) -> Vec<u8> {
        scores.iter().map(|&s| s + 33).collect()
    }

    #[test]
    fn test_fast_qc_stats_new() {
        let stats = FastQcStats::new(Mode::Short);
        assert_eq!(stats.total_reads(), 0);
        assert_eq!(stats.total_bases(), 0);
        assert!(stats.is_empty());
    }

    #[test]
    fn test_fast_qc_stats_update_simple() {
        let mut stats = FastQcStats::new(Mode::Short);
        let seq = b"ATGC";
        let qual = make_qual(&[40, 40, 40, 40]);
        stats.update_fast(seq, &qual);
        assert_eq!(stats.total_reads(), 1);
        assert_eq!(stats.total_bases(), 4);
        assert!(!stats.is_empty());
    }

    #[test]
    fn test_fast_qc_stats_base_content() {
        let mut stats = FastQcStats::new(Mode::Short);
        stats.update_fast(b"ATGC", &make_qual(&[40; 4]));
        let counts = stats.base_counts();
        // One base observed per cycle, each in its own bucket.
        assert_eq!(counts[0][A_IDX], 1);
        assert_eq!(counts[1][T_IDX], 1);
        assert_eq!(counts[2][G_IDX], 1);
        assert_eq!(counts[3][C_IDX], 1);
    }

    #[test]
    fn test_fast_qc_stats_quality() {
        let mut stats = FastQcStats::new(Mode::Short);
        let qual = make_qual(&[40, 40, 40, 40]);
        stats.update_fast(b"ATGC", &qual);
        assert!((stats.mean_quality() - 40.0).abs() < 0.001);
    }

    #[test]
    fn test_fast_qc_stats_gc() {
        let mut stats = FastQcStats::new(Mode::Short);
        stats.update_fast(b"GGCC", &make_qual(&[40; 4]));
        assert!((stats.mean_gc() - 100.0).abs() < 0.001);
    }

    #[test]
    fn test_fast_qc_stats_gc_50_percent() {
        let mut stats = FastQcStats::new(Mode::Short);
        stats.update_fast(b"ATGC", &make_qual(&[40; 4]));
        assert!((stats.mean_gc() - 50.0).abs() < 0.001);
    }

    #[test]
    fn test_fast_qc_stats_kmer() {
        let mut stats = FastQcStats::new(Mode::Short);
        // Exactly one complete 5-mer in a 5-base read.
        stats.update_fast(b"ACGTA", &make_qual(&[40; 5]));
        assert_eq!(stats.total_kmers(), 1);
    }

    #[test]
    fn test_fast_qc_stats_kmer_sliding() {
        let mut stats = FastQcStats::new(Mode::Short);
        // Sliding window: 6 bases yield two overlapping 5-mers.
        stats.update_fast(b"ACGTAC", &make_qual(&[40; 6]));
        assert_eq!(stats.total_kmers(), 2);
    }

    #[test]
    fn test_fast_qc_stats_kmer_with_n() {
        let mut stats = FastQcStats::new(Mode::Short);
        // The N at position 2 resets the window; only the 6-base tail counts.
        stats.update_fast(b"ACNTACGTA", &make_qual(&[40; 9]));
        assert_eq!(stats.total_kmers(), 2);
    }

    #[test]
    fn test_fast_qc_stats_length() {
        let mut stats = FastQcStats::new(Mode::Short);
        stats.update_fast(b"ATGC", &make_qual(&[40; 4]));
        stats.update_fast(b"ATGCATGC", &make_qual(&[40; 8]));
        assert!((stats.mean_length() - 6.0).abs() < 0.001);
    }

    #[test]
    fn test_fast_qc_stats_q20_q30() {
        let mut stats = FastQcStats::new(Mode::Short);
        // Qualities 10, 25, 35: one base >= 30, two bases >= 20.
        let qual = vec![10 + 33, 25 + 33, 35 + 33];
        stats.update_fast(b"ATG", &qual);
        assert!((stats.q20_percent() - 66.666).abs() < 0.1);
        assert!((stats.q30_percent() - 33.333).abs() < 0.1);
    }

    #[test]
    fn test_fast_qc_stats_merge() {
        let mut stats1 = FastQcStats::new(Mode::Short);
        stats1.update_fast(b"ATGC", &make_qual(&[40; 4]));
        let mut stats2 = FastQcStats::new(Mode::Short);
        stats2.update_fast(b"GCTA", &make_qual(&[30; 4]));
        stats1.merge(stats2);
        assert_eq!(stats1.total_reads(), 2);
        assert_eq!(stats1.total_bases(), 8);
    }

    #[test]
    fn test_fast_qc_stats_merge_different_lengths() {
        let mut stats1 = FastQcStats::new(Mode::Short);
        stats1.update_fast(b"AT", &make_qual(&[40; 2]));
        let mut stats2 = FastQcStats::new(Mode::Short);
        stats2.update_fast(b"GCTA", &make_qual(&[30; 4]));
        stats1.merge(stats2);
        assert_eq!(stats1.total_reads(), 2);
        // Merge must grow the per-cycle vectors to the longer read length.
        assert_eq!(stats1.base_counts().len(), 4);
    }

    #[test]
    fn test_fast_qc_stats_duplication() {
        let mut stats = FastQcStats::new(Mode::Short);
        // Three identical reads: 1 unique of 3 sampled => 2/3 duplicates.
        stats.update_fast(b"ATGC", &make_qual(&[40; 4]));
        stats.update_fast(b"ATGC", &make_qual(&[40; 4]));
        stats.update_fast(b"ATGC", &make_qual(&[40; 4]));
        assert!((stats.duplication_rate() - 66.666).abs() < 0.1);
    }

    #[test]
    fn test_fast_qc_stats_duplication_sampling() {
        let mut stats = FastQcStats::new(Mode::Short);
        // Only the first DEFAULT_DUP_SAMPLE_SIZE reads should be hashed.
        for i in 0..150_000 {
            stats.update_fast(format!("SEQ{:06}", i).as_bytes(), &make_qual(&[40; 9]));
        }
        assert!(stats.dup_hashes.len() <= 100_000);
    }

    #[test]
    fn test_fast_qc_stats_empty_sequence() {
        let mut stats = FastQcStats::new(Mode::Short);
        // Empty reads are ignored entirely.
        stats.update_fast(b"", &[]);
        assert_eq!(stats.total_reads(), 0);
        assert!(stats.is_empty());
    }

    #[test]
    fn test_fast_qc_stats_n50() {
        let mut stats = FastQcStats::new(Mode::Long);
        // Total 1500 bases; 500 + 400 = 900 >= 750, so N50 is 400.
        stats.update_fast(&vec![b'A'; 100], &make_qual(&vec![40; 100]));
        stats.update_fast(&vec![b'A'; 200], &make_qual(&vec![40; 200]));
        stats.update_fast(&vec![b'A'; 300], &make_qual(&vec![40; 300]));
        stats.update_fast(&vec![b'A'; 400], &make_qual(&vec![40; 400]));
        stats.update_fast(&vec![b'A'; 500], &make_qual(&vec![40; 500]));
        assert_eq!(stats.n50(), 400);
    }

    #[test]
    fn test_fast_qc_stats_summary() {
        let mut stats = FastQcStats::new(Mode::Short);
        stats.update_fast(b"ATGC", &make_qual(&[40; 4]));
        let summary = stats.summary();
        assert_eq!(summary.total_reads, 1);
        assert_eq!(summary.total_bases, 4);
    }

    #[test]
    fn test_fast_qc_stats_case_insensitive() {
        let mut stats = FastQcStats::new(Mode::Short);
        // Lowercase bases map to the same buckets as uppercase.
        stats.update_fast(b"atgc", &make_qual(&[40; 4]));
        let counts = stats.base_counts();
        assert_eq!(counts[0][A_IDX], 1);
        assert_eq!(counts[1][T_IDX], 1);
        assert_eq!(counts[2][G_IDX], 1);
        assert_eq!(counts[3][C_IDX], 1);
    }
}