fastars 0.1.0

Ultra-fast QC and trimming for short and long reads
Documentation
//! Overlap-based adapter detection for paired-end reads.
//!
//! When R1 and R2 overlap (insert size < read length), the adapter positions
//! can be inferred directly from the overlap without brute-force searching.
//! This is the primary optimization used by fastp.

use crate::trim::adapter::reverse_complement;

/// Result of overlap analysis between paired-end reads.
#[derive(Debug, Clone, Copy, Default)]
pub struct OverlapResult {
    /// Whether an overlap was found.
    pub overlapped: bool,
    /// Offset of R2 relative to R1 (negative = adapter present).
    pub offset: i32,
    /// Length of the overlapping region.
    pub overlap_len: usize,
    /// Number of mismatches in the overlap.
    pub diff: usize,
}

impl OverlapResult {
    /// Returns the trim position for R1 (where to cut).
    /// Returns None if no trimming needed.
    #[inline]
    pub fn trim_pos_r1(&self, r1_len: usize) -> Option<usize> {
        if !self.overlapped || self.offset >= 0 {
            return None;
        }
        // When offset < 0, adapter starts at: overlap_len + offset (from R1 start)
        // Actually: trim position = min(r1_len, overlap_len)
        let trim_pos = (self.overlap_len as i32 + (-self.offset).max(0)) as usize;
        if trim_pos < r1_len {
            Some(trim_pos.min(r1_len))
        } else {
            None
        }
    }

    /// Returns the trim position for R2 (where to cut).
    /// Returns None if no trimming needed.
    #[inline]
    pub fn trim_pos_r2(&self, r2_len: usize) -> Option<usize> {
        if !self.overlapped || self.offset >= 0 {
            return None;
        }
        // Symmetric to R1
        let trim_pos = self.overlap_len;
        if trim_pos < r2_len {
            Some(trim_pos.min(r2_len))
        } else {
            None
        }
    }
}

/// Configuration for overlap analysis.
#[derive(Debug, Clone)]
pub struct OverlapConfig {
    /// Maximum number of mismatches allowed.
    pub diff_limit: usize,
    /// Minimum overlap length required.
    pub overlap_require: usize,
    /// Maximum mismatch percentage (0.0 - 1.0).
    pub diff_percent_limit: f64,
}

impl Default for OverlapConfig {
    fn default() -> Self {
        Self {
            diff_limit: 5,
            overlap_require: 30,
            diff_percent_limit: 0.2,
        }
    }
}

/// Analyze overlap between paired-end reads.
///
/// This is a Rust port of fastp's OverlapAnalysis::analyze().
/// It detects if R1 and R2 overlap and returns the overlap parameters.
///
/// # Arguments
/// * `r1_seq` - Read 1 sequence
/// * `r2_seq` - Read 2 sequence (will be reverse complemented internally)
/// * `config` - Overlap analysis configuration
///
/// # Returns
/// OverlapResult indicating whether overlap was found and its parameters.
pub fn analyze_overlap(r1_seq: &[u8], r2_seq: &[u8], config: &OverlapConfig) -> OverlapResult {
    let rc_r2 = reverse_complement(r2_seq);
    let len1 = r1_seq.len();
    let len2 = rc_r2.len();

    let complete_compare_require = 50;
    let overlap_require = config.overlap_require;
    let diff_limit = config.diff_limit;
    let diff_percent_limit = config.diff_percent_limit;

    // Forward search: R1 extends beyond R2 on the left (offset >= 0)
    // This is the normal case where insert >= read length
    let mut offset: i32 = 0;
    while (offset as usize) < len1.saturating_sub(overlap_require) {
        let overlap_len = (len1 - offset as usize).min(len2);
        let overlap_diff_limit = diff_limit.min((overlap_len as f64 * diff_percent_limit) as usize);

        let mut diff = 0;
        let mut i = 0;
        while i < overlap_len {
            if r1_seq[offset as usize + i] != rc_r2[i] {
                diff += 1;
                if diff > overlap_diff_limit && i < complete_compare_require {
                    break;
                }
            }
            i += 1;
        }

        if diff <= overlap_diff_limit || (diff > overlap_diff_limit && i > complete_compare_require) {
            return OverlapResult {
                overlapped: true,
                offset,
                overlap_len,
                diff,
            };
        }

        offset += 1;
    }

    // Reverse search: R2 extends beyond R1 (offset < 0)
    // This is when insert < read length, meaning adapter is present
    offset = 0;
    while offset > -((len2.saturating_sub(overlap_require)) as i32) {
        let overlap_len = len1.min(len2 - (-offset) as usize);
        let overlap_diff_limit = diff_limit.min((overlap_len as f64 * diff_percent_limit) as usize);

        let mut diff = 0;
        let mut i = 0;
        while i < overlap_len {
            if r1_seq[i] != rc_r2[(-offset) as usize + i] {
                diff += 1;
                if diff > overlap_diff_limit && i < complete_compare_require {
                    break;
                }
            }
            i += 1;
        }

        if diff <= overlap_diff_limit || (diff > overlap_diff_limit && i > complete_compare_require) {
            return OverlapResult {
                overlapped: true,
                offset,
                overlap_len,
                diff,
            };
        }

        offset -= 1;
    }

    OverlapResult::default()
}

/// Trim paired-end reads based on overlap analysis.
///
/// Returns (r1_trim_end, r2_trim_end) - the positions where reads should be trimmed.
/// If no adapter detected, returns (r1_len, r2_len).
#[inline]
pub fn trim_by_overlap(
    r1_seq: &[u8],
    r2_seq: &[u8],
    config: &OverlapConfig,
) -> (usize, usize) {
    let ov = analyze_overlap(r1_seq, r2_seq, config);

    if ov.overlapped && ov.offset < 0 {
        // Adapter detected - calculate trim positions
        // When offset is negative, the insert is shorter than read length
        // R1 trim position: overlap_len (everything after is adapter)
        // R2 trim position: overlap_len (symmetric)
        let r1_trim = ov.overlap_len.min(r1_seq.len());
        let r2_trim = ov.overlap_len.min(r2_seq.len());
        (r1_trim, r2_trim)
    } else {
        // No adapter detected via overlap
        (r1_seq.len(), r2_seq.len())
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_overlap_no_adapter() {
        // Reads that don't overlap (insert longer than read)
        let r1 = b"ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT";
        let r2 = b"NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN";

        let config = OverlapConfig::default();
        let result = analyze_overlap(r1, r2, &config);

        // Should not find overlap (sequences don't match)
        assert!(!result.overlapped || result.offset >= 0);
    }

    #[test]
    fn test_overlap_with_adapter() {
        // Simulate paired-end reads with adapter contamination:
        // When insert size (64bp) < read length (97bp), adapters appear at 3' ends
        //
        // R1 reads forward:  5'--[INSERT 64bp][ADAPTER]--3'
        // R2 reads reverse:  3'--[ADAPTER][INSERT 64bp]--5'
        //
        // R2 as sequenced (5'->3'): RC(INSERT) followed by RC(ADAPTER)
        // When we RC(R2), we get: ADAPTER followed by INSERT
        //
        // So R1 = INSERT + ADAPTER (64 + 33 = 97bp)
        // RC(R2) = ADAPTER + INSERT (33 + 64 = 97bp)
        //
        // For these to align, we need negative offset where:
        // R1[0..64] aligns with RC(R2)[33..97]

        let insert = b"ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT"; // 64bp
        let adapter_r1 = b"AGATCGGAAGAGCACACGTCTGAACTCCAGTCA"; // 33bp TruSeq R1
        let adapter_r2 = b"AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT"; // 33bp TruSeq R2

        // R1: insert + adapter_r1
        let mut r1 = insert.to_vec();
        r1.extend_from_slice(adapter_r1);

        // R2 (as sequenced): RC(insert) + adapter_r2
        let rc_insert = reverse_complement(insert);
        let mut r2 = rc_insert;
        r2.extend_from_slice(adapter_r2);

        let config = OverlapConfig {
            diff_limit: 5,
            overlap_require: 30,
            diff_percent_limit: 0.2,
        };

        let result = analyze_overlap(&r1, &r2, &config);

        // The overlap analysis reverse-complements R2 internally
        // After RC(R2): RC(adapter_r2) + insert
        // R1[0..64] should match RC(R2)[33..97] with overlap_len = 64
        // This gives offset = -33 (negative, indicating adapter present)
        assert!(result.overlapped, "should find overlap between R1 and R2");
        assert!(result.offset < 0, "offset should be negative when adapter present, got {}", result.offset);
    }

    #[test]
    fn test_trim_by_overlap() {
        let config = OverlapConfig::default();

        // When no overlap, return full lengths
        let r1 = b"ACGTACGT";
        let r2 = b"NNNNNNNN";
        let (t1, t2) = trim_by_overlap(r1, r2, &config);
        assert_eq!(t1, r1.len());
        assert_eq!(t2, r2.len());
    }

    #[test]
    fn test_overlap_config_default() {
        let config = OverlapConfig::default();
        assert_eq!(config.diff_limit, 5);
        assert_eq!(config.overlap_require, 30);
        assert!((config.diff_percent_limit - 0.2).abs() < 0.001);
    }
}