fastars 0.1.0

Ultra-fast QC and trimming for short and long reads
Documentation
//! Overlap-based error correction for paired-end reads.
//!
//! This module provides base correction using the overlap region between
//! read 1 (R1) and read 2 (R2) in paired-end sequencing data.
//!
//! ## Algorithm
//!
//! When paired-end reads originate from a short fragment, the 3' ends of R1 and R2
//! overlap. In the overlap region, if there's a mismatch between bases, the base
//! with the higher quality score is likely correct. This module:
//!
//! 1. Finds the overlap between R1 tail and reverse-complemented R2 head
//! 2. Compares bases in the overlap region
//! 3. When bases differ, selects the one with higher quality score
//! 4. Updates both the base and quality in the corrected reads
//!
//! ## Example
//!
//! ```no_run
//! use fastars::correction::{CorrectionConfig, OverlapCorrector, CorrectionStats};
//!
//! let config = CorrectionConfig::new()
//!     .with_min_overlap(30)
//!     .with_diff_limit(5);
//!
//! let corrector = OverlapCorrector::new(config);
//! // Use corrector.correct_pair() on paired reads
//! ```

pub mod corrector;

pub use corrector::{CorrectionBuffers, OverlapCorrector, OverlapRegion};

use serde::{Deserialize, Serialize};

// ============================================================================
// Configuration
// ============================================================================

/// Configuration for overlap-based error correction.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CorrectionConfig {
    /// Whether correction is enabled.
    pub enabled: bool,
    /// Minimum overlap length required for correction (default: 30).
    pub min_overlap: usize,
    /// Maximum number of differences allowed in the overlap region (default: 5).
    pub diff_limit: usize,
    /// Maximum percentage of differences allowed in the overlap region (default: 5.0%).
    /// Overlap is accepted only if both diff_limit (absolute) AND diff_percent_limit (%) are satisfied.
    pub diff_percent_limit: f64,
    /// Allow gaps during overlap trimming detection (default: false).
    pub allow_gap_overlap: bool,
    /// Output file for only the overlapped region of merged reads.
    pub overlapped_out: Option<std::path::PathBuf>,
}

impl Default for CorrectionConfig {
    fn default() -> Self {
        Self {
            enabled: false,
            min_overlap: 30,
            diff_limit: 5,
            diff_percent_limit: 5.0,
            allow_gap_overlap: false,
            overlapped_out: None,
        }
    }
}

impl CorrectionConfig {
    /// Create a new correction configuration with default settings.
    pub fn new() -> Self {
        Self::default()
    }

    /// Enable correction.
    pub fn enabled(mut self) -> Self {
        self.enabled = true;
        self
    }

    /// Disable correction.
    pub fn disabled(mut self) -> Self {
        self.enabled = false;
        self
    }

    /// Set minimum overlap length required for correction.
    pub fn with_min_overlap(mut self, min_overlap: usize) -> Self {
        self.min_overlap = min_overlap;
        self
    }

    /// Set maximum number of differences allowed in the overlap region.
    pub fn with_diff_limit(mut self, diff_limit: usize) -> Self {
        self.diff_limit = diff_limit;
        self
    }

    /// Set maximum percentage of differences allowed in the overlap region.
    pub fn with_diff_percent_limit(mut self, diff_percent_limit: f64) -> Self {
        self.diff_percent_limit = diff_percent_limit.clamp(0.0, 100.0);
        self
    }

    /// Set whether to allow gaps during overlap trimming detection.
    pub fn with_allow_gap_overlap(mut self, allow: bool) -> Self {
        self.allow_gap_overlap = allow;
        self
    }

    /// Set output file for overlapped region only.
    pub fn with_overlapped_out(mut self, path: Option<std::path::PathBuf>) -> Self {
        self.overlapped_out = path;
        self
    }
}

// ============================================================================
// Statistics
// ============================================================================

/// Statistics collected during overlap-based error correction.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct CorrectionStats {
    /// Total number of read pairs processed.
    pub pairs_processed: u64,
    /// Number of pairs where overlap was found.
    pub pairs_with_overlap: u64,
    /// Number of pairs that were corrected (at least one base changed).
    pub pairs_corrected: u64,
    /// Total number of bases corrected.
    pub bases_corrected: u64,
    /// Number of bases corrected in R1.
    pub bases_corrected_r1: u64,
    /// Number of bases corrected in R2.
    pub bases_corrected_r2: u64,
}

impl CorrectionStats {
    /// Create a new empty statistics container.
    pub fn new() -> Self {
        Self::default()
    }

    /// Merge statistics from another container.
    pub fn merge(&mut self, other: &CorrectionStats) {
        self.pairs_processed += other.pairs_processed;
        self.pairs_with_overlap += other.pairs_with_overlap;
        self.pairs_corrected += other.pairs_corrected;
        self.bases_corrected += other.bases_corrected;
        self.bases_corrected_r1 += other.bases_corrected_r1;
        self.bases_corrected_r2 += other.bases_corrected_r2;
    }

    /// Get the percentage of pairs where overlap was found.
    pub fn overlap_rate(&self) -> f64 {
        if self.pairs_processed == 0 {
            0.0
        } else {
            (self.pairs_with_overlap as f64 / self.pairs_processed as f64) * 100.0
        }
    }

    /// Get the percentage of pairs that were corrected.
    pub fn correction_rate(&self) -> f64 {
        if self.pairs_processed == 0 {
            0.0
        } else {
            (self.pairs_corrected as f64 / self.pairs_processed as f64) * 100.0
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_correction_config_default() {
        let config = CorrectionConfig::default();
        assert!(!config.enabled);
        assert_eq!(config.min_overlap, 30);
        assert_eq!(config.diff_limit, 5);
        assert!((config.diff_percent_limit - 5.0).abs() < f64::EPSILON);
    }

    #[test]
    fn test_correction_config_builder() {
        let config = CorrectionConfig::new()
            .enabled()
            .with_min_overlap(20)
            .with_diff_limit(3)
            .with_diff_percent_limit(10.0)
            .with_allow_gap_overlap(true);

        assert!(config.enabled);
        assert_eq!(config.min_overlap, 20);
        assert_eq!(config.diff_limit, 3);
        assert!((config.diff_percent_limit - 10.0).abs() < f64::EPSILON);
        assert!(config.allow_gap_overlap);
    }

    #[test]
    fn test_correction_config_diff_percent_clamp() {
        let config = CorrectionConfig::new().with_diff_percent_limit(150.0);
        assert!((config.diff_percent_limit - 100.0).abs() < f64::EPSILON);

        let config = CorrectionConfig::new().with_diff_percent_limit(-10.0);
        assert!((config.diff_percent_limit - 0.0).abs() < f64::EPSILON);
    }

    #[test]
    fn test_correction_stats_default() {
        let stats = CorrectionStats::default();
        assert_eq!(stats.pairs_processed, 0);
        assert_eq!(stats.pairs_corrected, 0);
        assert_eq!(stats.bases_corrected, 0);
    }

    #[test]
    fn test_correction_stats_merge() {
        let mut stats1 = CorrectionStats {
            pairs_processed: 100,
            pairs_with_overlap: 80,
            pairs_corrected: 10,
            bases_corrected: 50,
            bases_corrected_r1: 25,
            bases_corrected_r2: 25,
        };

        let stats2 = CorrectionStats {
            pairs_processed: 50,
            pairs_with_overlap: 40,
            pairs_corrected: 5,
            bases_corrected: 25,
            bases_corrected_r1: 12,
            bases_corrected_r2: 13,
        };

        stats1.merge(&stats2);

        assert_eq!(stats1.pairs_processed, 150);
        assert_eq!(stats1.pairs_with_overlap, 120);
        assert_eq!(stats1.pairs_corrected, 15);
        assert_eq!(stats1.bases_corrected, 75);
        assert_eq!(stats1.bases_corrected_r1, 37);
        assert_eq!(stats1.bases_corrected_r2, 38);
    }

    #[test]
    fn test_correction_stats_rates() {
        let stats = CorrectionStats {
            pairs_processed: 100,
            pairs_with_overlap: 80,
            pairs_corrected: 10,
            bases_corrected: 50,
            bases_corrected_r1: 25,
            bases_corrected_r2: 25,
        };

        assert!((stats.overlap_rate() - 80.0).abs() < 0.001);
        assert!((stats.correction_rate() - 10.0).abs() < 0.001);
    }

    #[test]
    fn test_correction_stats_rates_empty() {
        let stats = CorrectionStats::default();
        assert_eq!(stats.overlap_rate(), 0.0);
        assert_eq!(stats.correction_rate(), 0.0);
    }
}