prll-ri 1.0.6

Parallel-RI, parallel executable binary to wrap RIblast.
Documentation
/*
 * seed_search.cpp
 *
 *  Created on: 2016/8/31
 *      Author: Tsukasa Fukunaga
 */

#include "seed_search.h"
#include <math.h>
#include <iostream>
          
void SeedSearch::Run(vector<unsigned char> &query_seq, vector<int> &query_suffix_array, vector<unsigned char> &db_seq, vector<int> &db_suffix_array,  vector<vector<int> > &start_hash,  vector<vector<int> > &end_hash){
  vector<int> db_seed; db_seed.reserve(_max_seed_length);
  vector<int> q_seed; q_seed.reserve(_max_seed_length);
  
  SeedSearchCore(query_seq, query_suffix_array, db_seq, db_suffix_array, start_hash, end_hash, db_seed, q_seed, 0, query_suffix_array.size()-1, 0, db_suffix_array.size()-1, 0.0, 0);
  return;
}

void SeedSearch::CalcInteractionEnergy(vector<Hit> &hit_result, vector<int> &query_suffix_array, vector<int> &db_suffix_array, vector<float> &query_accessibility, vector<float> &query_conditional_accessibility, vector<vector<float> > &db_accessibility,vector<vector<float> > &db_conditional_accessibility, vector<int> &db_seq_length, vector<int> &db_seq_start_position){
  
  int count = 0;
  for(int i = 0;i < _hit_candidate_result.size(); i++){
    int sp_q_sa = _hit_candidate_result[i].GetSpQSa();
    int ep_q_sa = _hit_candidate_result[i].GetEpQSa();
    int sp_db_sa = _hit_candidate_result[i].GetSpDbSa();
    int ep_db_sa = _hit_candidate_result[i].GetEpDbSa();
    int length = _hit_candidate_result[i].GetLength();
    double temp_score = _hit_candidate_result[i].GetEnergy();

    vector<int> q_sp_vector; q_sp_vector.reserve(10000);
    vector<double> qa_vector; qa_vector.reserve(10000);
    for(int j = sp_q_sa; j <=ep_q_sa ; j++){
      q_sp_vector.push_back(query_suffix_array[j]);
      qa_vector.push_back(CalcAccessibility(query_accessibility, query_conditional_accessibility, query_suffix_array[j], length));
    }

    for(int k = sp_db_sa; k <=ep_db_sa ; k++){
      int db_sp = db_suffix_array[k];
      int dbseq_start = 0; int dbseq_id = 0;
      GetSeqIdAndStart(db_seq_length, db_seq_start_position, &dbseq_id, &dbseq_start, db_sp, length);
      double dba = CalcAccessibility(db_accessibility[dbseq_id], db_conditional_accessibility[dbseq_id], dbseq_start, length);
      
      for(int j = sp_q_sa; j <= ep_q_sa; j++){
	count += 1;
	double interaction_energy = qa_vector[j-sp_q_sa] + dba + temp_score;
	
	if(interaction_energy < 0){
	  Hit temp_hit( q_sp_vector[j-sp_q_sa], db_sp, length, interaction_energy);
	  temp_hit.SetDbSeqId(dbseq_id);
	  temp_hit.SetDbSeqIdStart(dbseq_start);
	  hit_result.push_back(temp_hit);
	}
      }
    } 
  }
  return;
}

void SeedSearch::GetSeqIdAndStart(vector<int> &db_seq_length, vector<int> &db_seq_start_position, int* seq_id, int* start, int sp, int length){
  int s = 0;
  int e = db_seq_start_position.size()-1;
  int m = (s+e)/2;

  while(true){
    if(e-s == 0){
      *seq_id  = s; *start = db_seq_length[s] - (sp - db_seq_start_position[s]) - length;
      break;
    }else if(e-s==1){
      if(sp >= db_seq_start_position[s] && sp < db_seq_start_position[s+1]){
	*seq_id  = s; *start = db_seq_length[s] - (sp - db_seq_start_position[s]) - length;
	break;
      }else{
	*seq_id  = e; *start = db_seq_length[e] - (sp - db_seq_start_position[e]) - length;
	break;
      }
    }else{    
      if(sp >= db_seq_start_position[m] && sp < db_seq_start_position[m+1]){
	*seq_id  = m;
	*start = db_seq_length[m] - (sp - db_seq_start_position[m]) - length;
	break;
      }else{
	if(sp >= db_seq_start_position[m]){
	  s = m;
	  m = (s+e)/2;
	}else{
	  e = m;
	  m = (s+e)/2;
	}
      }
    }
  }
  
  return;
}

double SeedSearch::CalcAccessibility(vector<float> accessibility, vector<float> conditional_accessibility, int sp, int length){
  double temp = accessibility[sp];
  for(int i =_min_accessible_length; i < length; i++){
    temp += conditional_accessibility[sp+i];
  }
  return(temp);
}

void SeedSearch::SeedSearchCore(vector<unsigned char> &query_seq, vector<int> &query_suffix_array, vector<unsigned char> &db_seq, vector<int> &db_suffix_array,  vector<vector<int> > &start_hash,  vector<vector<int> > &end_hash, vector<int> &db_seed, vector<int> &q_seed, int sp_q, int ep_q, int sp_db, int ep_db, double score, int length){

  if(length < _max_seed_length){
    vector<int> sp_q_result; sp_q_result.reserve(_pair_size);
    vector<int> ep_q_result; ep_q_result.reserve(_pair_size);
    vector<int> sp_db_result; sp_db_result.reserve(_pair_size);
    vector<int> ep_db_result; ep_db_result.reserve(_pair_size);

    for(int i = 0; i< _pair_size; i++){
      int start = sp_q;
      int end = ep_q;
      int nucleotide_size = 4;
      SeedSearchNextCharacter(query_seq, query_suffix_array, &start, &end, _stem_pair[i][0], length);
      sp_q_result.push_back(start);
      ep_q_result.push_back(end);
      start = sp_db;
      end = ep_db;
      if(length+1 > _hash_size){
	SeedSearchNextCharacter(db_seq, db_suffix_array, &start, &end, _stem_pair[i][1], length);
      }else{
	int temp = _stem_pair[i][1]-2;
	for(int j = 0; j <length;j++){
	  temp += (int)pow(nucleotide_size,length-j) * (db_seed[j] - 2);
	}
	start = start_hash[length][temp];
	end = end_hash[length][temp];
      }
      sp_db_result.push_back(start);
      ep_db_result.push_back(end);
    }
    
    for(int i = 0; i< _pair_size; i++){
      if(sp_q_result[i] <= ep_q_result[i] && sp_db_result[i] <= ep_db_result[i]){
	double temp_score = 0.0;
	
	if(length > 0){
	  int type = BP_pair[q_seed[length-1]-1][db_seed[length-1]-1];
	  int type2 = BP_pair[_stem_pair[i][0]-1][_stem_pair[i][1]-1];
	  type2 = rtype[type2];
	  temp_score = score + ((double)stack37[type][type2])/100;
	}
	
	if(temp_score < _hybrid_energy_threshold && length+1 >= _min_accessible_length){
	  Hit_candidate temp_hit_candidate(sp_q_result[i], ep_q_result[i], sp_db_result[i], ep_db_result[i], length+1, temp_score);
	  _hit_candidate_result.push_back(temp_hit_candidate);
	}else{
	  q_seed.push_back(_stem_pair[i][0]);
	  db_seed.push_back(_stem_pair[i][1]);
	  SeedSearchCore(query_seq, query_suffix_array, db_seq, db_suffix_array, start_hash, end_hash, db_seed, q_seed, sp_q_result[i], ep_q_result[i], sp_db_result[i], ep_db_result[i], temp_score, length+1);
	}
      }
    }
  }
  q_seed.pop_back();
  db_seed.pop_back();
  return;
}

void SeedSearch::SeedSearchNextCharacter(vector<unsigned char> &encoded_sequences, vector<int> &suffix_array, int* start, int* end, unsigned char c, int offset){
  int s = *start;
  int e = *end;
  int m;

  if (suffix_array[s] + offset >= encoded_sequences.size()) {
    ++(*start);
  }
  
  if(s>e){
    *start = 1;
    *end = 0;
    return;
  }else if (s == e) {
    if (encoded_sequences[suffix_array[s]+offset] == c) {
      return;
    } else {
      *start = 1;
      *end = 0;
      return;
    }
  }
  
  if (encoded_sequences[suffix_array[s]+offset] != c) {
    while (s < e - 1) {
      m = (s + e) / 2;
      if (encoded_sequences[suffix_array[m]+offset] < c) {
        s = m;
      } else {
        e = m;
      }
    }
    if (encoded_sequences[suffix_array[e]+offset] != c) {
      *start = 1;
      *end = 0;
      return;
    }
    *start = e;
    s = e;
    e = *end;
  }

  if (encoded_sequences[suffix_array[e]+offset] != c) {
    while (s < e - 1) {
      m = (s + e) / 2;
      if (encoded_sequences[suffix_array[m]+offset] > c) {
        e = m;
      } else {
        s = m;
      }
    }
    if (encoded_sequences[suffix_array[s]+offset] != c) {
      *start = 1;
      *end = 0;
      return;
    }
    *end = s;
  }
  return;
}