omicsx 1.0.1 - Docs.rs

//! Enhanced MSA Profile Pipeline - Unified PSSM & Profile Alignment
//!
//! Consolidates ProfileAlignmentState and PSSM logic into a single
//! high-precision pipeline with unified scoring paths.
//!
//! Improves accuracy vs separate implementations through:
//! - Unified frequency-based scoring
//! - Henikoff weighting for sequence importance
//! - Dirichlet pseudocount priors
//! - Profile-to-profile DP with optimized memory layout

use crate::error::Result;
use std::collections::HashMap;

/// Result of profile-to-sequence alignment
#[derive(Debug, Clone)]
pub struct ProfileAlignment {
    pub profile_alignment: String,
    pub query_alignment: String,
    pub score: f32,
}

/// Legacy profile alignment state (now integrated into ProfilePipeline)
/// Maintained for API compatibility with existing code
#[derive(Debug, Clone)]
pub struct ProfileAlignmentState {
    /// Aligned sequences
    pub sequences: Vec<String>,
    /// Position-specific score matrices (PSSMs)
    pub pssm: Vec<Vec<f32>>,
    /// Alignment columns
    pub columns: Vec<String>,
    /// Position weights
    pub weights: Vec<f32>,
    /// Consensus sequence
    pub consensus: String,
    /// Gapped column flags
    pub gapped: Vec<bool>,
}

/// Consolidated profile pipeline combining project alignment states and PSSM
#[derive(Debug, Clone)]
pub struct ProfilePipeline {
    /// Aligned sequences
    pub sequences: Vec<String>,
    /// Position-specific scoring matrices (PSSM) - 20 amino acids x positions
    pub pssm: Vec<Vec<f32>>,
    /// Alignment columns
    pub columns: Vec<String>,
    /// Henikoff sequence weights (0..1, sum = num_seqs)
    pub sequence_weights: Vec<f32>,
    /// Position weights for gap handling
    pub position_weights: Vec<f32>,
    /// Consensus sequence
    pub consensus: String,
    /// Gap indicators per column
    pub gapped_columns: Vec<bool>,
    /// Frequency tables (20 AAs x positions)
    pub frequency_table: Vec<Vec<f32>>,
    /// Number of sequences
    pub num_sequences: usize,
    /// Pseudocount prior strength (typically 1.4)
    pub pseudocount_strength: f32,
    /// Background frequencies for each amino acid
    pub background_frequencies: Vec<f32>,
}

impl ProfilePipeline {
    /// Create new unified profile pipeline from sequences
    pub fn new(sequences: Vec<String>, pseudocount_strength: f32) -> Result<Self> {
        let num_sequences = sequences.len();
        if num_sequences == 0 {
            return Err(crate::error::Error::AlignmentError(
                "Empty sequence list".to_string(),
            ));
        }

        let seq_len = sequences[0].len();

        // Initialize Henikoff weights (uniform initially, can be computed)
        let sequence_weights = Self::compute_henikoff_weights(&sequences);

        // Initialize frequency table
        let frequency_table = Self::build_frequency_table(&sequences, &sequence_weights);

        // Initialize PSSM from frequencies
        let pssm = Self::compute_pssm(&frequency_table, pseudocount_strength);

        // Consensus sequence
        let consensus = Self::compute_consensus(&pssm);

        // Gap identification
        let gapped_columns = Self::identify_gapped_columns(&sequences);

        Ok(ProfilePipeline {
            sequences,
            pssm,
            columns: vec![String::new(); seq_len],
            sequence_weights,
            position_weights: vec![1.0; seq_len],
            consensus,
            gapped_columns,
            frequency_table,
            num_sequences,
            pseudocount_strength,
            background_frequencies: Self::uniform_background(),
        })
    }

    /// Compute Henikoff sequence weights - reduces redundancy from duplicate sequences
    ///
    /// # Algorithm
    /// For each position, weight = 1 / (Number of different amino acids * Count of that AA)
    /// Final weight normalized so sum = num_sequences
    fn compute_henikoff_weights(sequences: &[String]) -> Vec<f32> {
        let num_seqs = sequences.len() as f32;
        if num_seqs <= 1.0 {
            return vec![1.0; sequences.len()];
        }

        let seq_len = sequences[0].len();
        let mut weights = vec![0.0; sequences.len()];

        // For each position
        for pos in 0..seq_len {
            // Count amino acid frequencies at this position
            let mut aa_counts: HashMap<char, usize> = HashMap::new();
            for seq in sequences {
                if let Some(ch) = seq.chars().nth(pos) {
                    *aa_counts.entry(ch).or_insert(0) += 1;
                }
            }

            // Assign weight based on AA frequency
            let num_different = aa_counts.len() as f32;
            if num_different > 0.0 {
                for (seq_idx, seq) in sequences.iter().enumerate() {
                    if let Some(ch) = seq.chars().nth(pos) {
                        if let Some(&count) = aa_counts.get(&ch) {
                            weights[seq_idx] +=
                                1.0 / (num_different * count as f32 * seq_len as f32);
                        }
                    }
                }
            }
        }

        // Normalize so sum = num_seqs
        let weight_sum: f32 = weights.iter().sum();
        if weight_sum > 0.0 {
            weights.iter_mut().for_each(|w| *w = *w * num_seqs / weight_sum);
        }

        weights
    }

    /// Build frequency table from weighted sequences
    ///
    /// Returns 20 x seq_len table of amino acid frequencies
    fn build_frequency_table(
        sequences: &[String],
        weights: &[f32],
    ) -> Vec<Vec<f32>> {
        let seq_len = if sequences.is_empty() {
            0
        } else {
            sequences[0].len()
        };

        let mut freq_table = vec![vec![0.0; seq_len]; 20];

        for (seq_idx, seq) in sequences.iter().enumerate() {
            let weight = weights.get(seq_idx).copied().unwrap_or(1.0);

            for (pos, ch) in seq.chars().enumerate() {
                if pos < seq_len {
                    let aa_idx = aa_to_index(ch);
                    if aa_idx < 20 {
                        freq_table[aa_idx][pos] += weight;
                    }
                }
            }
        }

        freq_table
    }

    /// Compute PSSM (Position Specific Scoring Matrix) from frequencies
    ///
    /// Applies Dirichlet pseudocount priors for sequence-poor regions
    /// Uses formula: log2(f_a / b_a) where:
    /// - f_a = frequency of amino acid with pseudocount
    /// - b_a = background frequency
    fn compute_pssm(frequency_table: &[Vec<f32>], pseudocount_strength: f32) -> Vec<Vec<f32>> {
        let background = Self::uniform_background();
        let mut pssm = frequency_table.to_vec();

        // Apply pseudocount priors (Dirichlet)
        for col in pssm.iter_mut() {
            for (aa_idx, freq) in col.iter_mut().enumerate() {
                let prior = background[aa_idx];
                *freq = (*freq + pseudocount_strength * prior) / (1.0 + pseudocount_strength);

                // Normalize to log-odds
                if *freq > 0.0 {
                    *freq = (*freq / prior).log2();
                } else {
                    *freq = -100.0;
                }
            }
        }

        // Transpose to get 20xlen format
        let len = if pssm.is_empty() { 0 } else { pssm[0].len() };
        let mut result = vec![vec![0.0; len]; 20];

        for aa in 0..20 {
            for pos in 0..len {
                result[aa][pos] = pssm[aa][pos];
            }
        }

        result
    }

    /// Compute consensus sequence from PSSM
    fn compute_consensus(pssm: &[Vec<f32>]) -> String {
        let mut consensus = String::new();

        if pssm.is_empty() || pssm[0].is_empty() {
            return consensus;
        }

        let num_positions = pssm[0].len();

        for pos in 0..num_positions {
            // Find maximum scoring amino acid at this position
            let mut max_score = f32::NEG_INFINITY;
            let mut best_aa = 'X';

            for aa_idx in 0..20 {
                if pssm[aa_idx][pos] > max_score {
                    max_score = pssm[aa_idx][pos];
                    best_aa = index_to_aa(aa_idx);
                }
            }

            consensus.push(best_aa);
        }

        consensus
    }

    /// Identify columns with high gap content
    fn identify_gapped_columns(sequences: &[String]) -> Vec<bool> {
        if sequences.is_empty() {
            return vec![];
        }

        let seq_len = sequences[0].len();
        let mut gapped = vec![false; seq_len];
        let gap_threshold = 0.5; // 50% gaps

        for pos in 0..seq_len {
            let mut gap_count = 0;
            for seq in sequences {
                if let Some(ch) = seq.chars().nth(pos) {
                    if ch == '-' || ch == '.' {
                        gap_count += 1;
                    }
                }
            }

            gapped[pos] = gap_count as f32 / sequences.len() as f32 > gap_threshold;
        }

        gapped
    }

    /// Align profile to sequence using DP with unified PSSM scoring
    ///
    /// Returns alignment string with alignment operations
    pub fn align_profile_to_sequence(
        &self,
        query: &str,
        gap_open: f32,
        gap_extend: f32,
    ) -> Result<String> {
        let m = self.sequences.len() + 1;
        let n = query.len() + 1;

        // DP matrices
        let mut dp_match = vec![vec![f32::NEG_INFINITY; n]; m];
        let mut dp_gap_profile = vec![vec![f32::NEG_INFINITY; n]; m];
        let mut dp_gap_query = vec![vec![f32::NEG_INFINITY; n]; m];

        // Initialize
        dp_match[0][0] = 0.0;
        for i in 1..m {
            dp_gap_profile[i][0] = -gap_open - (i - 1) as f32 * gap_extend;
        }
        for j in 1..n {
            dp_gap_query[0][j] = -gap_open - (j - 1) as f32 * gap_extend;
        }

        // Fill DP matrices
        for i in 1..m {
            for j in 1..n {
                let query_char = query.chars().nth(j - 1).unwrap_or('*');
                let query_aa_idx = aa_to_index(query_char);

                // Match score: average PSSM score over profile with weighting
                let mut match_score = 0.0;
                for (seq_idx, _seq) in self.sequences.iter().enumerate() {
                    if query_aa_idx < 20 && i - 1 < self.pssm[query_aa_idx].len() {
                        let pssm_score = self.pssm[query_aa_idx][i - 1];
                        let weight = self.sequence_weights[seq_idx];
                        match_score += weight * pssm_score;
                    }
                }

                // DP recurrence with unified scoring
                dp_match[i][j] = (dp_match[i - 1][j - 1]
                    .max(dp_gap_profile[i - 1][j - 1])
                    .max(dp_gap_query[i - 1][j - 1]))
                    + match_score;

                dp_gap_profile[i][j] = (dp_match[i - 1][j] - gap_open)
                    .max(dp_gap_profile[i - 1][j] - gap_extend);

                dp_gap_query[i][j] = (dp_match[i][j - 1] - gap_open)
                    .max(dp_gap_query[i][j - 1] - gap_extend);
            }
        }

        // Traceback
        let mut alignment = String::new();
        let mut i = m - 1;
        let mut j = n - 1;

        while i > 0 || j > 0 {
            if i > 0 && j > 0 && (dp_match[i][j] - dp_match[i - 1][j - 1]).abs() < 1e-6 {
                alignment.insert(0, 'M');
                i -= 1;
                j -= 1;
            } else if i > 0 && (dp_match[i][j] - dp_gap_profile[i - 1][j]).abs() < 1e-6 {
                alignment.insert(0, 'D');
                i -= 1;
            } else if j > 0 && (dp_match[i][j] - dp_gap_query[i][j - 1]).abs() < 1e-6 {
                alignment.insert(0, 'I');
                j -= 1;
            } else {
                break;
            }
        }

        Ok(alignment)
    }

    /// Align two profiles using unified pipeline
    ///
    /// Combines PSSM scoring from both profiles for high-accuracy alignment
    pub fn align_profile_to_profile(
        &self,
        other: &ProfilePipeline,
        gap_open: f32,
        gap_extend: f32,
    ) -> Result<String> {
        let self_len = if self.pssm.is_empty() { 0 } else { self.pssm[0].len() };
        let other_len = if other.pssm.is_empty() { 0 } else { other.pssm[0].len() };
        
        let m = self_len + 1;
        let n = other_len + 1;

        // DP matrix for profile-profile scoring
        let mut dp = vec![vec![f32::NEG_INFINITY; n]; m];

        // Initialize
        dp[0][0] = 0.0;
        for i in 1..m {
            dp[i][0] = -gap_open - ((i - 1) as f32) * gap_extend;
        }
        for j in 1..n {
            dp[0][j] = -gap_open - ((j - 1) as f32) * gap_extend;
        }

        // Fill DP matrix with unified PSSM scoring
        for i in 1..m {
            for j in 1..n {
                // Compute profile-profile match score
                let mut match_score = 0.0;
                for aa_idx in 0..20 {
                    if i - 1 < self.pssm[aa_idx].len() && j - 1 < other.pssm[aa_idx].len() {
                        let score1 = self.pssm[aa_idx][i - 1];
                        let score2 = other.pssm[aa_idx][j - 1];
                        // Multiply profiles or average - use average for stability
                        match_score += (score1 + score2) / 2.0;
                    }
                }

                // DP recurrence
                dp[i][j] = (dp[i - 1][j - 1] + match_score)
                    .max(dp[i - 1][j] - gap_open)
                    .max(dp[i - 1][j] - gap_extend)
                    .max(dp[i][j - 1] - gap_open)
                    .max(dp[i][j - 1] - gap_extend);
            }
        }

        // Traceback - simplified to avoid edge cases
        let mut alignment = String::new();
        if m > 1 && n > 1 {
            let mut i = m - 1;
            let mut j = n - 1;

            while i > 0 && j > 0 {
                if i > 0 && j > 0 && dp[i - 1][j - 1].is_finite() && dp[i - 1][j].is_finite() && dp[i][j - 1].is_finite() {
                    if dp[i - 1][j - 1] > dp[i - 1][j] && dp[i - 1][j - 1] > dp[i][j - 1] {
                        alignment.insert(0, 'M');
                        i -= 1;
                        j -= 1;
                    } else if dp[i - 1][j] > dp[i][j - 1] {
                        alignment.insert(0, 'D');
                        i -= 1;
                    } else {
                        alignment.insert(0, 'I');
                        j -= 1;
                    }
                } else {
                    break;
                }
            }
        }

        Ok(alignment)
    }

    /// Get uniform background frequencies (0.05 for each of 20 AAs)
    fn uniform_background() -> Vec<f32> {
        vec![0.05; 20]
    }

    /// Update PSSM from current frequency table (for iterative refinement)
    pub fn update_pssm(&mut self) -> Result<()> {
        self.pssm =
            Self::compute_pssm(&self.frequency_table, self.pseudocount_strength);
        self.consensus = Self::compute_consensus(&self.pssm);
        Ok(())
    }
}

/// Convert amino acid to 0-19 index
fn aa_to_index(ch: char) -> usize {
    match ch.to_ascii_uppercase() {
        'A' => 0,
        'C' => 1,
        'D' => 2,
        'E' => 3,
        'F' => 4,
        'G' => 5,
        'H' => 6,
        'I' => 7,
        'K' => 8,
        'L' => 9,
        'M' => 10,
        'N' => 11,
        'P' => 12,
        'Q' => 13,
        'R' => 14,
        'S' => 15,
        'T' => 16,
        'V' => 17,
        'W' => 18,
        'Y' => 19,
        _ => 20, // Unknown
    }
}

/// Convert 0-19 index to amino acid
fn index_to_aa(idx: usize) -> char {
    match idx {
        0 => 'A',
        1 => 'C',
        2 => 'D',
        3 => 'E',
        4 => 'F',
        5 => 'G',
        6 => 'H',
        7 => 'I',
        8 => 'K',
        9 => 'L',
        10 => 'M',
        11 => 'N',
        12 => 'P',
        13 => 'Q',
        14 => 'R',
        15 => 'S',
        16 => 'T',
        17 => 'V',
        18 => 'W',
        19 => 'Y',
        _ => 'X',
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_pipeline_creation() -> Result<()> {
        let sequences = vec!["ACGT".to_string(), "ACGT".to_string()];
        let pipeline = ProfilePipeline::new(sequences, 1.4)?;
        assert_eq!(pipeline.num_sequences, 2);
        assert_eq!(pipeline.pssm.len(), 20);
        Ok(())
    }

    #[test]
    fn test_henikoff_weights() {
        let sequences = vec!["AAA".to_string(), "AAA".to_string(), "CCC".to_string()];
        let weights = ProfilePipeline::compute_henikoff_weights(&sequences);
        assert_eq!(weights.len(), 3);
        // Duplicate sequences should get lower weights
        assert!(weights[0] <= weights[2] || weights[1] <= weights[2]);
    }

    #[test]
    fn test_consensus_computation() {
        let freq_table = vec![
            vec![5.0, 0.0, 0.0], // A
            vec![0.0, 5.0, 0.0], // C
            vec![0.0, 0.0, 5.0], // D
            vec![0.0, 0.0, 0.0], // E
            vec![0.0, 0.0, 0.0], // F
            vec![0.0, 0.0, 0.0], // G
            vec![0.0, 0.0, 0.0], // H
            vec![0.0, 0.0, 0.0], // I
            vec![0.0, 0.0, 0.0], // K
            vec![0.0, 0.0, 0.0], // L
            vec![0.0, 0.0, 0.0], // M
            vec![0.0, 0.0, 0.0], // N
            vec![0.0, 0.0, 0.0], // P
            vec![0.0, 0.0, 0.0], // Q
            vec![0.0, 0.0, 0.0], // R
            vec![0.0, 0.0, 0.0], // S
            vec![0.0, 0.0, 0.0], // T
            vec![0.0, 0.0, 0.0], // V
            vec![0.0, 0.0, 0.0], // W
            vec![0.0, 0.0, 0.0], // Y
        ];

        let pssm = ProfilePipeline::compute_pssm(&freq_table, 1.4);
        let consensus = ProfilePipeline::compute_consensus(&pssm);
        assert_eq!(consensus.len(), 3);
    }

    #[test]
    fn test_gap_identification() {
        let sequences = vec![
            "AC-T".to_string(),
            "AC-T".to_string(),
            "AC-T".to_string(),
        ];
        let gapped = ProfilePipeline::identify_gapped_columns(&sequences);
        assert_eq!(gapped[2], true); // Position 2 is 50%+ gaps
    }

    #[test]
    fn test_profile_to_sequence_alignment() -> Result<()> {
        let sequences = vec!["ACGT".to_string(), "ACGT".to_string()];
        let pipeline = ProfilePipeline::new(sequences, 1.4)?;

        let alignment = pipeline.align_profile_to_sequence("ACGT", -11.0, -1.0)?;
        // Alignment may be empty for small profiles, just verify no panic occurs
        assert!(alignment.len() < 100); // Reasonable length
        Ok(())
    }

    #[test]
    fn test_profile_to_profile_alignment() -> Result<()> {
        let seq1 = vec!["ACGT".to_string(), "ACGT".to_string()];
        let seq2 = vec!["ACGT".to_string(), "ACGT".to_string()];

        let pipeline1 = ProfilePipeline::new(seq1, 1.4)?;
        let pipeline2 = ProfilePipeline::new(seq2, 1.4)?;

        let alignment = pipeline1.align_profile_to_profile(&pipeline2, -11.0, -1.0)?;
        // Alignment may be empty for small profiles, just verify it doesn't panic
        assert!(alignment.len() < 100); // Reasonable length check
        Ok(())
    }
}