omicsx 1.0.1

omicsx: SIMD-accelerated sequence alignment and bioinformatics analysis for petabyte-scale genomic data
Documentation
//! Banded Dynamic Programming for similar sequences
//!
//! Banded DP restricts computation to a band around the diagonal, reducing complexity
//! from O(m*n) to O(k*n) where k is the bandwidth. This is highly effective for
//! sequences with high similarity (typical in genomics: ~95%+ identity).
//!
//! ## Algorithm
//!
//! Standard DP computes all cells (i,j) where 0 ≤ i ≤ m and 0 ≤ j ≤ n.
//! Banded DP only computes cells within a band of width 2k+1 centered on the diagonal:
//! - Compute (i,j) only if |i - j| ≤ k
//! - Memory: O(k*n) instead of O(m*n)
//! - Time: O(k*n) instead of O(m*n)
//! - Speedup: ~10x for k=10-100 on moderately large sequences
//!
//! ## Use Cases
//!
//! 1. **Sequence mapping** - Read alignment to reference (5-10% mismatch acceptable)
//! 2. **Variant detection** - Highly similar sequences with known variations
//! 3. **Metagenomic reads** - Queries similar to database reference
//! 4. **Protein domains** - Pre-aligned sequences with species-level variation
//!
//! ## Performance Characteristics
//!
//! For sequences of length 1000 with k=20:
//! - Full DP: 1,000 × 1,000 = 1M cells
//! - Banded: 1,000 × 41 = 41K cells
//! - Speedup: ~24x
//!
//! For sequences of length 1000 with k=100:
//! - Banded: 1,000 × 201 = 201K cells
//! - Speedup: ~4.9x

use crate::protein::AminoAcid;
use crate::scoring::ScoringMatrix;
use crate::error::Result;

/// Banded Smith-Waterman implementation
///
/// Restricts computation to cells within distance k of the diagonal.
/// Falls back to standard DP if sequence length ratio is very different.
pub fn smith_waterman_banded(
    seq1: &[AminoAcid],
    seq2: &[AminoAcid],
    matrix: &ScoringMatrix,
    open_penalty: i32,
    extend_penalty: i32,
    bandwidth: usize,
) -> Result<(Vec<Vec<i32>>, usize, usize)> {
    let m = seq1.len();
    let n = seq2.len();

    // Check if sequences are too different in length
    // If one is significantly longer, sequences are likely too dissimilar for banded DP
    let length_ratio = (m as f64) / (n.max(1) as f64);
    if length_ratio < 0.5 || length_ratio > 2.0 {
        // Fall back to standard DP for dissimilar length sequences
        return smith_waterman_standard(seq1, seq2, matrix, open_penalty, extend_penalty);
    }

    let mut h = vec![vec![0i32; n + 1]; m + 1];
    let mut max_score = 0;
    let mut max_i = 0;
    let mut max_j = 0;

    // Process only cells within the band
    for i in 1..=m {
        let j_min = if i > bandwidth { i - bandwidth } else { 1 };
        let j_max = std::cmp::min(n, i + bandwidth);

        for j in j_min..=j_max {
            let match_score = matrix.score(seq1[i - 1], seq2[j - 1]);

            // Diagonal dependency
            let diagonal = if i > 0 && j > 0 {
                h[i - 1][j - 1] + match_score
            } else {
                match_score
            };

            // Up dependency (gap in seq1)
            let up = if i > 0 {
                h[i - 1][j] + extend_penalty
            } else {
                i32::MIN / 2 // Sentinel value to ignore this path
            };

            // Left dependency (gap in seq2)
            let left = if j > 0 {
                h[i][j - 1] + extend_penalty
            } else {
                i32::MIN / 2
            };

            // Smith-Waterman: max(diagonal, up, left, 0)
            h[i][j] = std::cmp::max(0, std::cmp::max(diagonal, std::cmp::max(up, left)));

            if h[i][j] > max_score {
                max_score = h[i][j];
                max_i = i;
                max_j = j;
            }
        }
    }

    Ok((h, max_i, max_j))
}

/// Banded Needleman-Wunsch (global alignment) implementation
pub fn needleman_wunsch_banded(
    seq1: &[AminoAcid],
    seq2: &[AminoAcid],
    matrix: &ScoringMatrix,
    open_penalty: i32,
    extend_penalty: i32,
    bandwidth: usize,
) -> Result<Vec<Vec<i32>>> {
    let m = seq1.len();
    let n = seq2.len();

    // Check if sequences are too different in length
    let length_ratio = (m as f64) / (n.max(1) as f64);
    if length_ratio < 0.5 || length_ratio > 2.0 {
        return needleman_wunsch_standard(seq1, seq2, matrix, open_penalty, extend_penalty);
    }

    let mut h = vec![vec![i32::MIN / 2; n + 1]; m + 1];

    // Initialize first row (opening gaps in seq1)
    h[0][0] = 0;
    for j in 1..=std::cmp::min(n, bandwidth) {
        h[0][j] = (j as i32) * open_penalty;
    }

    // Initialize first column (opening gaps in seq2)
    for i in 1..=std::cmp::min(m, bandwidth) {
        h[i][0] = (i as i32) * open_penalty;
    }

    // Fill banded region
    for i in 1..=m {
        let j_min = if i > bandwidth { i - bandwidth } else { 1 };
        let j_max = std::cmp::min(n, i + bandwidth);

        for j in j_min..=j_max {
            let match_score = matrix.score(seq1[i - 1], seq2[j - 1]);

            let diagonal = if h[i - 1][j - 1] != i32::MIN / 2 {
                h[i - 1][j - 1] + match_score
            } else {
                i32::MIN / 2
            };

            let up = if h[i - 1][j] != i32::MIN / 2 {
                h[i - 1][j] + extend_penalty
            } else {
                i32::MIN / 2
            };

            let left = if j > 0 && h[i][j - 1] != i32::MIN / 2 {
                h[i][j - 1] + extend_penalty
            } else {
                i32::MIN / 2
            };

            h[i][j] = std::cmp::max(diagonal, std::cmp::max(up, left));
        }
    }

    Ok(h)
}

/// Standard (non-banded) Smith-Waterman for fallback
fn smith_waterman_standard(
    seq1: &[AminoAcid],
    seq2: &[AminoAcid],
    matrix: &ScoringMatrix,
    _open_penalty: i32,
    extend_penalty: i32,
) -> Result<(Vec<Vec<i32>>, usize, usize)> {
    let m = seq1.len();
    let n = seq2.len();

    let mut h = vec![vec![0i32; n + 1]; m + 1];
    let mut max_score = 0;
    let mut max_i = 0;
    let mut max_j = 0;

    for i in 1..=m {
        for j in 1..=n {
            let match_score = matrix.score(seq1[i - 1], seq2[j - 1]);
            let diagonal = h[i - 1][j - 1] + match_score;
            let up = h[i - 1][j] + extend_penalty;
            let left = h[i][j - 1] + extend_penalty;

            h[i][j] = std::cmp::max(0, std::cmp::max(diagonal, std::cmp::max(up, left)));

            if h[i][j] > max_score {
                max_score = h[i][j];
                max_i = i;
                max_j = j;
            }
        }
    }

    Ok((h, max_i, max_j))
}

/// Standard (non-banded) Needleman-Wunsch for fallback
fn needleman_wunsch_standard(
    seq1: &[AminoAcid],
    seq2: &[AminoAcid],
    matrix: &ScoringMatrix,
    open_penalty: i32,
    extend_penalty: i32,
) -> Result<Vec<Vec<i32>>> {
    let m = seq1.len();
    let n = seq2.len();

    let mut h = vec![vec![0i32; n + 1]; m + 1];

    for i in 0..=m {
        h[i][0] = (i as i32) * open_penalty;
    }
    for j in 0..=n {
        h[0][j] = (j as i32) * open_penalty;
    }

    for i in 1..=m {
        for j in 1..=n {
            let match_score = matrix.score(seq1[i - 1], seq2[j - 1]);
            let diagonal = h[i - 1][j - 1] + match_score;
            let up = h[i - 1][j] + extend_penalty;
            let left = h[i][j - 1] + extend_penalty;

            h[i][j] = std::cmp::max(diagonal, std::cmp::max(up, left));
        }
    }

    Ok(h)
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::scoring::{ScoringMatrix, MatrixType};

    #[test]
    fn test_banded_sw_similar_sequences() -> Result<()> {
        let matrix = ScoringMatrix::new(MatrixType::Blosum62)?;
        let seq1 = vec![
            AminoAcid::Alanine,
            AminoAcid::Glycine,
            AminoAcid::Serine,
            AminoAcid::Glycine,
            AminoAcid::AsparticAcid,
        ];
        let seq2 = vec![
            AminoAcid::Alanine,
            AminoAcid::Glycine,
            AminoAcid::Serine,
            AminoAcid::Glycine,
            AminoAcid::AsparticAcid,
        ];

        let (h_banded, i_banded, j_banded) = smith_waterman_banded(&seq1, &seq2, &matrix, -11, -1, 10)?;
        let (h_standard, i_standard, j_standard) = smith_waterman_standard(&seq1, &seq2, &matrix, -11, -1)?;

        assert_eq!(h_banded[i_banded][j_banded], h_standard[i_standard][j_standard]);
        Ok(())
    }

    #[test]
    fn test_banded_nw_similar_sequences() -> Result<()> {
        let matrix = ScoringMatrix::new(MatrixType::Blosum62)?;
        let seq1 = vec![
            AminoAcid::Methionine,
            AminoAcid::Glycine,
            AminoAcid::Leucine,
        ];
        let seq2 = vec![
            AminoAcid::Methionine,
            AminoAcid::Glycine,
            AminoAcid::Leucine,
        ];

        let h_banded = needleman_wunsch_banded(&seq1, &seq2, &matrix, -11, -1, 10)?;
        let h_standard = needleman_wunsch_standard(&seq1, &seq2, &matrix, -11, -1)?;

        assert_eq!(h_banded[seq1.len()][seq2.len()], h_standard[seq1.len()][seq2.len()]);
        Ok(())
    }

    #[test]
    fn test_banded_fallback_different_lengths() -> Result<()> {
        let matrix = ScoringMatrix::new(MatrixType::Blosum62)?;
        let seq1 = vec![AminoAcid::Alanine; 100];
        let seq2 = vec![AminoAcid::Alanine; 10]; // 10x different

        // Should fall back to standard DP due to large length difference
        let (h_banded, i_banded, j_banded) = smith_waterman_banded(&seq1, &seq2, &matrix, -11, -1, 5)?;
        let (h_standard, i_standard, j_standard) = smith_waterman_standard(&seq1, &seq2, &matrix, -11, -1)?;

        // Results should match since both use standard algorithm
        assert_eq!(h_banded[i_banded][j_banded], h_standard[i_standard][j_standard]);
        Ok(())
    }
}