bio-seq-algos 0.1.13

"bio-seq-algos", Library of Algorithms about Biological Sequences
Documentation
use utils::*;

pub struct LogSaPpfMats {
  pub log_sa_forward_ppf_mat_4_char_align: LogPpfMat,
  pub log_sa_forward_ppf_mat_4_gap_1: LogPpfMat,
  pub log_sa_forward_ppf_mat_4_gap_2: LogPpfMat,
  pub log_sa_backward_ppf_mat_4_char_align: LogPpfMat,
  pub log_sa_backward_ppf_mat_4_gap_1: LogPpfMat,
  pub log_sa_backward_ppf_mat_4_gap_2: LogPpfMat,
}

impl LogSaPpfMats {
  fn new(slp: &(usize, usize)) -> LogSaPpfMats {
    let ni_mat = vec![vec![NEG_INFINITY; slp.1]; slp.0];
    LogSaPpfMats {
      log_sa_forward_ppf_mat_4_char_align: ni_mat.clone(),
      log_sa_forward_ppf_mat_4_gap_1: ni_mat.clone(),
      log_sa_forward_ppf_mat_4_gap_2: ni_mat.clone(),
      log_sa_backward_ppf_mat_4_char_align: ni_mat.clone(),
      log_sa_backward_ppf_mat_4_gap_1: ni_mat.clone(),
      log_sa_backward_ppf_mat_4_gap_2: ni_mat,
    }
  }
}

impl SaScoringParams {
  pub fn new(ca_sm: &CaScoreMat, base_opening_gap_penalty: SaScore, base_extending_gap_penalty: SaScore) -> SaScoringParams {
    SaScoringParams {
      ca_sm: ca_sm.clone(),
      base_opening_gap_penalty: base_opening_gap_penalty,
      base_extending_gap_penalty: base_extending_gap_penalty,
    }
  }
}

#[inline]
pub fn durbin_algo(seq_pair: &SsPair, sa_sps: &SaScoringParams) -> ProbMat {
  let seq_len_pair = (seq_pair.0.len(), seq_pair.1.len());
  let log_sa_ppf_mats = get_log_sa_ppf_mats(seq_pair, &seq_len_pair, sa_sps);
  let log_cap_mat = get_log_char_align_prob_mat(&log_sa_ppf_mats, &seq_len_pair);
  get_cap_mat(&log_cap_mat)
}

#[inline]
pub fn get_cap_mat(log_cap_mat: &LogProbMat) -> ProbMat {
  log_cap_mat.iter().map(|xs| xs.iter().map(|&x| x.exp()).collect()).collect()
}

#[inline]
pub fn get_log_cap_mat(seq_pair: &SsPair, sa_sps: &SaScoringParams) -> LogProbMat {
  let seq_len_pair = (seq_pair.0.len(), seq_pair.1.len());
  let log_sa_ppf_mats = get_log_sa_ppf_mats(seq_pair, &seq_len_pair, sa_sps);
  get_log_char_align_prob_mat(&log_sa_ppf_mats, &seq_len_pair)
}

#[inline]
pub fn get_cap_mat_and_unaligned_char_psp(sp: &SsPair, sa_sps: &SaScoringParams) -> (ProbMat, ProbSeqPair) {
  let slp = (sp.0.len(), sp.1.len());
  let log_sa_ppf_mats = get_log_sa_ppf_mats(sp, &slp, sa_sps);
  let log_cap_mat = get_log_char_align_prob_mat(&log_sa_ppf_mats, &slp);
  let mut ucp_seq_pair = (vec![NEG_INFINITY; slp.0], vec![NEG_INFINITY; slp.1]);
  for i in 0 .. slp.0 {
    let mut eps_of_terms_4_log_prob = EpsOfTerms4LogProb::new();
    let mut min_ep_of_term_4_log_prob = INFINITY;
    for j in 0 .. slp.1 {
      let ep_of_term_4_log_prob = log_cap_mat[i][j];
      if min_ep_of_term_4_log_prob > ep_of_term_4_log_prob {min_ep_of_term_4_log_prob = ep_of_term_4_log_prob;}
      eps_of_terms_4_log_prob.push(ep_of_term_4_log_prob);
    }
    ucp_seq_pair.0[i] = 1. - logsumexp(&eps_of_terms_4_log_prob[..], min_ep_of_term_4_log_prob).exp();
  }
  for i in 0 .. slp.1 {
    let mut eps_of_terms_4_log_prob = EpsOfTerms4LogProb::new();
    let mut min_ep_of_term_4_log_prob = INFINITY;
    for j in 0 .. slp.0 {
      let ep_of_term_4_log_prob = log_cap_mat[j][i];
      if min_ep_of_term_4_log_prob > ep_of_term_4_log_prob {min_ep_of_term_4_log_prob = ep_of_term_4_log_prob;}
      eps_of_terms_4_log_prob.push(ep_of_term_4_log_prob);
    }
    ucp_seq_pair.1[i] = 1. - logsumexp(&eps_of_terms_4_log_prob[..], min_ep_of_term_4_log_prob).exp();
  }
  (get_cap_mat(&log_cap_mat), ucp_seq_pair)
}

#[inline]
pub fn get_log_sa_ppf_mats(sp: &SsPair, slp: &(usize, usize), sa_sps: &SaScoringParams) -> LogSaPpfMats {
  let mut log_sa_ppf_mats = LogSaPpfMats::new(slp);
  log_sa_ppf_mats.log_sa_forward_ppf_mat_4_char_align[0][0] = sa_sps.ca_sm[&(sp.0[0], sp.1[0])];
  for i in 1 .. slp.0 {
    log_sa_ppf_mats.log_sa_forward_ppf_mat_4_char_align[i][0] = sa_sps.base_opening_gap_penalty + get_begp(&(1, i - 1), sa_sps) + sa_sps.ca_sm[&(sp.0[i], sp.1[0])];
  }
  for i in 1 .. slp.1 {
    log_sa_ppf_mats.log_sa_forward_ppf_mat_4_char_align[0][i] = sa_sps.base_opening_gap_penalty + get_begp(&(1, i - 1), sa_sps) + sa_sps.ca_sm[&(sp.0[0], sp.1[i])];
  }
  for i in 1 .. slp.0 {
    for j in 1 .. slp.1 {
      let mut eps_of_terms_4_log_pf = EpsOfTerms4LogPf::new();
      let mut max_ep_of_term_4_log_pf = log_sa_ppf_mats.log_sa_forward_ppf_mat_4_char_align[i - 1][j - 1];
      eps_of_terms_4_log_pf.push(max_ep_of_term_4_log_pf);
      let ep_of_term_4_log_pf = log_sa_ppf_mats.log_sa_forward_ppf_mat_4_gap_1[i - 1][j - 1];
      if max_ep_of_term_4_log_pf < ep_of_term_4_log_pf {max_ep_of_term_4_log_pf = ep_of_term_4_log_pf;}
      eps_of_terms_4_log_pf.push(ep_of_term_4_log_pf);
      let ep_of_term_4_log_pf = log_sa_ppf_mats.log_sa_forward_ppf_mat_4_gap_2[i - 1][j - 1];
      if max_ep_of_term_4_log_pf < ep_of_term_4_log_pf {max_ep_of_term_4_log_pf = ep_of_term_4_log_pf;}
      eps_of_terms_4_log_pf.push(ep_of_term_4_log_pf);
      log_sa_ppf_mats.log_sa_forward_ppf_mat_4_char_align[i][j] = logsumexp(&eps_of_terms_4_log_pf[..], max_ep_of_term_4_log_pf) + sa_sps.ca_sm[&(sp.0[i], sp.1[j])];
      let mut eps_of_terms_4_log_pf = EpsOfTerms4LogPf::new();
      let mut max_ep_of_term_4_log_pf = log_sa_ppf_mats.log_sa_forward_ppf_mat_4_char_align[i - 1][j] + sa_sps.base_opening_gap_penalty;
      eps_of_terms_4_log_pf.push(max_ep_of_term_4_log_pf);
      let ep_of_term_4_log_pf = log_sa_ppf_mats.log_sa_forward_ppf_mat_4_gap_2[i - 1][j] + sa_sps.base_opening_gap_penalty;
      if max_ep_of_term_4_log_pf < ep_of_term_4_log_pf {max_ep_of_term_4_log_pf = ep_of_term_4_log_pf;}
      eps_of_terms_4_log_pf.push(ep_of_term_4_log_pf);
      let ep_of_term_4_log_pf = log_sa_ppf_mats.log_sa_forward_ppf_mat_4_gap_1[i - 1][j] + sa_sps.base_extending_gap_penalty;
      if max_ep_of_term_4_log_pf < ep_of_term_4_log_pf {max_ep_of_term_4_log_pf = ep_of_term_4_log_pf;}
      eps_of_terms_4_log_pf.push(ep_of_term_4_log_pf);
      log_sa_ppf_mats.log_sa_forward_ppf_mat_4_gap_1[i][j] = logsumexp(&eps_of_terms_4_log_pf[..], max_ep_of_term_4_log_pf);
      let mut eps_of_terms_4_log_pf = EpsOfTerms4LogPf::new();
      let mut max_ep_of_term_4_log_pf = log_sa_ppf_mats.log_sa_forward_ppf_mat_4_char_align[i][j - 1] + sa_sps.base_opening_gap_penalty;
      eps_of_terms_4_log_pf.push(max_ep_of_term_4_log_pf);
      let ep_of_term_4_log_pf = log_sa_ppf_mats.log_sa_forward_ppf_mat_4_gap_1[i][j - 1] + sa_sps.base_opening_gap_penalty;
      if max_ep_of_term_4_log_pf < ep_of_term_4_log_pf {max_ep_of_term_4_log_pf = ep_of_term_4_log_pf;}
      eps_of_terms_4_log_pf.push(ep_of_term_4_log_pf);
      let ep_of_term_4_log_pf = log_sa_ppf_mats.log_sa_forward_ppf_mat_4_gap_2[i][j - 1] + sa_sps.base_extending_gap_penalty;
      if max_ep_of_term_4_log_pf < ep_of_term_4_log_pf {max_ep_of_term_4_log_pf = ep_of_term_4_log_pf;}
      eps_of_terms_4_log_pf.push(ep_of_term_4_log_pf);
      log_sa_ppf_mats.log_sa_forward_ppf_mat_4_gap_2[i][j] = logsumexp(&eps_of_terms_4_log_pf[..], max_ep_of_term_4_log_pf);
    }
  }
  log_sa_ppf_mats.log_sa_backward_ppf_mat_4_char_align[slp.0 - 1][slp.1 - 1] = 0.;
  log_sa_ppf_mats.log_sa_backward_ppf_mat_4_gap_1[slp.0 - 1][slp.1 - 1] = log_sa_ppf_mats.log_sa_backward_ppf_mat_4_char_align[slp.0 - 1][slp.1 - 1];
  log_sa_ppf_mats.log_sa_backward_ppf_mat_4_gap_2[slp.0 - 1][slp.1 - 1] = log_sa_ppf_mats.log_sa_backward_ppf_mat_4_char_align[slp.0 - 1][slp.1 - 1];
  for i in 0 .. slp.0 - 1 {
    log_sa_ppf_mats.log_sa_backward_ppf_mat_4_char_align[i][slp.1 - 1] = sa_sps.base_opening_gap_penalty + get_begp(&(i + 2, slp.0 - 1), sa_sps);
  }
  for i in 0 .. slp.1 - 1 {
    log_sa_ppf_mats.log_sa_backward_ppf_mat_4_char_align[slp.0 - 1][i] = sa_sps.base_opening_gap_penalty + get_begp(&(i + 2, slp.1 - 1), sa_sps);
  }
  for i in (0 .. slp.0 - 1).rev() {
    for j in (0 .. slp.1 - 1).rev() {
      let mut eps_of_terms_4_log_pf = EpsOfTerms4LogPf::new();
      let ca_score = sa_sps.ca_sm[&(sp.0[i + 1], sp.1[j + 1])];
      let mut max_ep_of_term_4_log_pf = log_sa_ppf_mats.log_sa_backward_ppf_mat_4_char_align[i + 1][j + 1] + ca_score;
      eps_of_terms_4_log_pf.push(max_ep_of_term_4_log_pf);
      let ep_of_term_4_log_pf = log_sa_ppf_mats.log_sa_backward_ppf_mat_4_gap_1[i + 1][j] + sa_sps.base_opening_gap_penalty;
      if max_ep_of_term_4_log_pf < ep_of_term_4_log_pf {max_ep_of_term_4_log_pf = ep_of_term_4_log_pf;}
      eps_of_terms_4_log_pf.push(ep_of_term_4_log_pf);
      let ep_of_term_4_log_pf = log_sa_ppf_mats.log_sa_backward_ppf_mat_4_gap_2[i][j + 1] + sa_sps.base_opening_gap_penalty;
      if max_ep_of_term_4_log_pf < ep_of_term_4_log_pf {max_ep_of_term_4_log_pf = ep_of_term_4_log_pf;}
      eps_of_terms_4_log_pf.push(ep_of_term_4_log_pf);
      log_sa_ppf_mats.log_sa_backward_ppf_mat_4_char_align[i][j] = logsumexp(&eps_of_terms_4_log_pf[..], max_ep_of_term_4_log_pf);
      let mut eps_of_terms_4_log_pf = EpsOfTerms4LogPf::new();
      let mut max_ep_of_term_4_log_pf = log_sa_ppf_mats.log_sa_backward_ppf_mat_4_char_align[i + 1][j + 1] + ca_score;
      eps_of_terms_4_log_pf.push(max_ep_of_term_4_log_pf);
      let ep_of_term_4_log_pf = log_sa_ppf_mats.log_sa_backward_ppf_mat_4_gap_2[i + 1][j] + sa_sps.base_opening_gap_penalty;
      if max_ep_of_term_4_log_pf < ep_of_term_4_log_pf {max_ep_of_term_4_log_pf = ep_of_term_4_log_pf;}
      eps_of_terms_4_log_pf.push(ep_of_term_4_log_pf);
      let ep_of_term_4_log_pf = log_sa_ppf_mats.log_sa_backward_ppf_mat_4_gap_1[i + 1][j] + sa_sps.base_extending_gap_penalty;
      if max_ep_of_term_4_log_pf < ep_of_term_4_log_pf {max_ep_of_term_4_log_pf = ep_of_term_4_log_pf;}
      eps_of_terms_4_log_pf.push(ep_of_term_4_log_pf);
      log_sa_ppf_mats.log_sa_backward_ppf_mat_4_gap_1[i][j] = logsumexp(&eps_of_terms_4_log_pf[..], max_ep_of_term_4_log_pf);
      let mut eps_of_terms_4_log_pf = EpsOfTerms4LogPf::new();
      let mut max_ep_of_term_4_log_pf = log_sa_ppf_mats.log_sa_backward_ppf_mat_4_char_align[i + 1][j + 1] + ca_score;
      eps_of_terms_4_log_pf.push(max_ep_of_term_4_log_pf);
      let ep_of_term_4_log_pf = log_sa_ppf_mats.log_sa_backward_ppf_mat_4_gap_1[i][j + 1] + sa_sps.base_opening_gap_penalty;
      if max_ep_of_term_4_log_pf < ep_of_term_4_log_pf {max_ep_of_term_4_log_pf = ep_of_term_4_log_pf;}
      eps_of_terms_4_log_pf.push(ep_of_term_4_log_pf);
      let ep_of_term_4_log_pf = log_sa_ppf_mats.log_sa_backward_ppf_mat_4_gap_2[i][j + 1] + sa_sps.base_extending_gap_penalty;
      if max_ep_of_term_4_log_pf < ep_of_term_4_log_pf {max_ep_of_term_4_log_pf = ep_of_term_4_log_pf;}
      eps_of_terms_4_log_pf.push(ep_of_term_4_log_pf);
      log_sa_ppf_mats.log_sa_backward_ppf_mat_4_gap_2[i][j] = logsumexp(&eps_of_terms_4_log_pf[..], max_ep_of_term_4_log_pf);
    }
  }
  log_sa_ppf_mats
}

#[inline]
fn get_log_char_align_prob_mat(log_sa_ppf_mats: &LogSaPpfMats, slp: &(usize, usize)) -> LogProbMat {
  let mut eps_of_terms_4_log_pf = EpsOfTerms4LogPf::new();
  let mut max_ep_of_term_4_log_pf = log_sa_ppf_mats.log_sa_forward_ppf_mat_4_char_align[slp.0 - 1][slp.1 - 1];
  eps_of_terms_4_log_pf.push(max_ep_of_term_4_log_pf);
  let ep_of_term_4_log_pf = log_sa_ppf_mats.log_sa_forward_ppf_mat_4_gap_1[slp.0 - 1][slp.1 - 1];
  if max_ep_of_term_4_log_pf < ep_of_term_4_log_pf {max_ep_of_term_4_log_pf = ep_of_term_4_log_pf;}
  eps_of_terms_4_log_pf.push(ep_of_term_4_log_pf);
  let ep_of_term_4_log_pf = log_sa_ppf_mats.log_sa_forward_ppf_mat_4_gap_2[slp.0 - 1][slp.1 - 1];
  if max_ep_of_term_4_log_pf < ep_of_term_4_log_pf {max_ep_of_term_4_log_pf = ep_of_term_4_log_pf;}
  eps_of_terms_4_log_pf.push(ep_of_term_4_log_pf);
  let log_sa_ppf = logsumexp(&eps_of_terms_4_log_pf, max_ep_of_term_4_log_pf);
  let mut log_cap_mat = vec![vec![NEG_INFINITY; slp.1]; slp.0];
  for i in 0 .. slp.0 {
    for j in 0 .. slp.1 {
      log_cap_mat[i][j] = log_sa_ppf_mats.log_sa_forward_ppf_mat_4_char_align[i][j] + log_sa_ppf_mats.log_sa_backward_ppf_mat_4_char_align[i][j] - log_sa_ppf;
    }
  }
  log_cap_mat
}

#[inline]
fn get_begp(pp: &PosPair, sa_sps: &SaScoringParams) -> SaScore {
  (pp.1 + 1 - pp.0) as SaScore * sa_sps.base_extending_gap_penalty
}