neofold 0.1.0

NeoFold Program, Program for Maximum-expected-accuracy Estimations of RNA Secondary Structures with Incorporating Homologous-RNA sequences
Documentation
#! /usr/bin/env python

import utils
from Bio import SeqIO
import numpy
import seaborn
from matplotlib import pyplot
import os
import multiprocessing
import time
import datetime
import shutil

def main():
  (current_work_dir_path, asset_dir_path, program_dir_path, conda_program_dir_path) = utils.get_dir_paths()
  num_of_threads = multiprocessing.cpu_count()
  rnafamprob_dir_path = asset_dir_path + "/rnafamprob"
  neofold_dir_path = asset_dir_path + "/neofold"
  centroidfold_dir_path = asset_dir_path + "/centroidfold"
  contrafold_dir_path = asset_dir_path + "/contrafold"
  centroidhomfold_dir_path = asset_dir_path + "/centroidhomfold"
  turbofold_dir_path = asset_dir_path + "/turbofold"
  temp_dir_path = "/tmp/run_ss_estimation_programs_%s" % datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d_%H:%M:%S')
  if not os.path.isdir(rnafamprob_dir_path):
    os.mkdir(rnafamprob_dir_path)
  if not os.path.isdir(neofold_dir_path):
    os.mkdir(neofold_dir_path)
  if not os.path.isdir(centroidfold_dir_path):
    os.mkdir(centroidfold_dir_path)
  if not os.path.isdir(contrafold_dir_path):
    os.mkdir(contrafold_dir_path)
  if not os.path.isdir(centroidhomfold_dir_path):
    os.mkdir(centroidhomfold_dir_path)
  if not os.path.isdir(turbofold_dir_path):
    os.mkdir(turbofold_dir_path)
  if not os.path.isdir(temp_dir_path):
    os.mkdir(temp_dir_path)
  rna_dir_path = asset_dir_path + "/sampled_rna_families"
  # rna_dir_path = asset_dir_path + "/rna_families"
  bpp_mat_file = "bpp_mats_on_sta.dat"
  upp_mat_file = "upp_mats.dat"
  gammas = [2. ** i for i in range(-7, 11)]
  centroidfold_params = []
  contrafold_params = []
  centroidhomfold_params = []
  centroidfold_params_4_elapsed_time = []
  contrafold_params_4_elapsed_time = []
  centroidhomfold_params_4_elapsed_time = []
  rnafamprob_and_neofold_elapsed_time = 0.
  turbofold_elapsed_time = 0.
  for rna_file in os.listdir(rna_dir_path):
    if not rna_file.endswith(".fa"):
      continue
    rna_file_path = os.path.join(rna_dir_path, rna_file)
    (rna_familiy_name, extension) = os.path.splitext(rna_file)
    rnafamprob_output_dir_path = os.path.join(rnafamprob_dir_path, rna_familiy_name)
    rnafamprob_command = "rnafamprob -s -i " + rna_file_path + " -o " + rnafamprob_output_dir_path
    begin = time.time()
    utils.run_command(rnafamprob_command)
    elapsed_time = time.time() - begin
    rnafamprob_and_neofold_elapsed_time += elapsed_time
    bpp_mat_file_path = os.path.join(rnafamprob_output_dir_path, bpp_mat_file)
    upp_mat_file_path = os.path.join(rnafamprob_output_dir_path, upp_mat_file)
    neofold_output_dir_path = os.path.join(neofold_dir_path, "sss_of_" + rna_familiy_name)
    centroidfold_output_dir_path = os.path.join(centroidfold_dir_path, "sss_of_" + rna_familiy_name)
    contrafold_output_dir_path = os.path.join(contrafold_dir_path, "sss_of_" + rna_familiy_name)
    centroidhomfold_output_dir_path = os.path.join(centroidhomfold_dir_path, "sss_of_" + rna_familiy_name)
    turbofold_output_dir_path = os.path.join(turbofold_dir_path, "sss_of_" + rna_familiy_name)
    if not os.path.isdir(neofold_output_dir_path):
      os.mkdir(neofold_output_dir_path)
    if not os.path.isdir(centroidfold_output_dir_path):
      os.mkdir(centroidfold_output_dir_path)
    if not os.path.isdir(contrafold_output_dir_path):
      os.mkdir(contrafold_output_dir_path)
    if not os.path.isdir(centroidhomfold_output_dir_path):
      os.mkdir(centroidhomfold_output_dir_path)
    if not os.path.isdir(turbofold_output_dir_path):
      os.mkdir(turbofold_output_dir_path)
    for gamma in gammas:
      gamma_str = str(gamma)
      output_file = "gamma=" + gamma_str + ".dat"
      neofold_output_file_path = os.path.join(neofold_output_dir_path, output_file)
      neofold_command = "neofold -f " + rna_file_path + " -p " + bpp_mat_file_path + " -q " + upp_mat_file_path + " -o " + neofold_output_file_path + " --gamma " + gamma_str
      begin = time.time()
      utils.run_command(neofold_command)
      elapsed_time = time.time() - begin
      if gamma == 1:
        rnafamprob_and_neofold_elapsed_time += elapsed_time
      centroidfold_output_file_path = os.path.join(centroidfold_output_dir_path, output_file)
      centroidfold_params.insert(0, (rna_file_path, centroidfold_output_file_path, gamma_str))
      if gamma == 1:
        centroidfold_params_4_elapsed_time.insert(0, (rna_file_path, centroidfold_output_file_path, gamma_str))
      contrafold_output_file_path = os.path.join(contrafold_output_dir_path, output_file)
      contrafold_params.insert(0, (rna_file_path, contrafold_output_file_path, gamma_str))
      if gamma == 1:
        contrafold_params_4_elapsed_time.insert(0, (rna_file_path, contrafold_output_file_path, gamma_str))
      centroidhomfold_output_file_path = os.path.join(centroidhomfold_output_dir_path, output_file)
      centroidhomfold_params.insert(0, (rna_file_path, centroidhomfold_output_file_path, gamma_str, temp_dir_path))
      if gamma == 1:
        centroidhomfold_params_4_elapsed_time.insert(0, (rna_file_path, centroidhomfold_output_file_path, gamma_str, temp_dir_path))
      recs = [rec for rec in SeqIO.parse(rna_file_path, "fasta")]
      rec_seq_len = len(recs)
      turbofold_config_file_contents = "InSeq = {"
      for i in range(rec_seq_len):
        turbofold_config_file_contents += "%s/%d.fasta;" % (temp_dir_path, i)
      turbofold_config_file_contents += "}\nOutCT = {"
      for i in range(rec_seq_len):
        turbofold_config_file_contents += "%s/%d.ct;" % (temp_dir_path, i)
      turbofold_config_file_contents += "}\nIterations = 1\nMode = MEA\nMeaGamma = %f\nProcessors = %d" % (gamma, num_of_threads)
      turbofold_config_file_path = os.path.join(temp_dir_path, "turbofold_config.dat")
      turbofold_config_file = open(turbofold_config_file_path, "w")
      turbofold_config_file.write(turbofold_config_file_contents)
      turbofold_config_file.close()
      for (i, rec) in enumerate(recs):
        SeqIO.write([rec], open(os.path.join(temp_dir_path, "%d.fasta" % i), "w"), "fasta")
      begin = time.time()
      run_turbofold(turbofold_config_file_path)
      elapsed_time = time.time() - begin
      if gamma == 1:
        turbofold_elapsed_time += elapsed_time
      turbofold_output_file_contents = ""
      for i in range(rec_seq_len):
        ct_file_path = os.path.join(temp_dir_path, "%d.ct" % i)
        ss_string = read_ct_file(ct_file_path)
        turbofold_output_file_contents += ">%d\n%s\n\n" % (i, ss_string)
      turbofold_output_file_path = os.path.join(turbofold_output_dir_path, output_file)
      turbofold_output_file = open(turbofold_output_file_path, "w")
      turbofold_output_file.write(turbofold_output_file_contents)
      turbofold_output_file.close()
  pool = multiprocessing.Pool(num_of_threads)
  begin = time.time()
  pool.map(run_centroidfold, centroidfold_params_4_elapsed_time)
  centroidfold_elapsed_time = time.time() - begin
  begin = time.time()
  pool.map(run_contrafold, contrafold_params_4_elapsed_time)
  contrafold_elapsed_time = time.time() - begin
  begin = time.time()
  pool.map(run_centroidhomfold, centroidhomfold_params_4_elapsed_time)
  centroidhomfold_elapsed_time = time.time() - begin
  pool.map(run_centroidfold, centroidfold_params)
  pool.map(run_contrafold, contrafold_params)
  pool.map(run_centroidhomfold, centroidhomfold_params)
  print("The elapsed time of the 3 RNAfamProb, McCaskill, and NeoFold programs for a test set = %f [s]." % rnafamprob_and_neofold_elapsed_time)
  print("The elapsed time of the CentroidFold program with the CentroidFold algorithm for a test set = %f [s]." % centroidfold_elapsed_time)
  print("The elapsed time of the CentroidFold program with the CONTRAfold algorithm for a test set = %f [s]." % contrafold_elapsed_time)
  print("The elapsed time of the CentroidHomFold program for a test set = %f [s]." % centroidhomfold_elapsed_time)
  print("The elapsed time of the TurboFold-smp program for a test set = %f [s]." % turbofold_elapsed_time)
  shutil.rmtree(temp_dir_path)

def run_centroidfold(centroidfold_params):
  (rna_file_path, centroidfold_output_file_path, gamma_str) = centroidfold_params
  centroidfold_command = "centroid_fold " + rna_file_path + " -g " + gamma_str
  (output, _, _) = utils.run_command(centroidfold_command)
  lines = [line.split()[0] for (i, line) in enumerate(str(output).split("\\n")) if i % 3 == 2]
  centroidfold_output_file = open(centroidfold_output_file_path, "w+")
  centroidfold_output_buf = ""
  for (i, line) in enumerate(lines):
    centroidfold_output_buf += ">%d\n%s\n\n" % (i, line)
  centroidfold_output_file.write(centroidfold_output_buf)
  centroidfold_output_file.close()

def run_contrafold(contrafold_params):
  (rna_file_path, contrafold_output_file_path, gamma_str) = contrafold_params
  contrafold_command = "centroid_fold " + rna_file_path + " --mea -g " + gamma_str
  (output, _, _) = utils.run_command(contrafold_command)
  lines = [line.split()[0] for (i, line) in enumerate(str(output).split("\\n")) if i % 3 == 2]
  contrafold_output_file = open(contrafold_output_file_path, "w+")
  contrafold_output_buf = ""
  for (i, line) in enumerate(lines):
    contrafold_output_buf += ">%d\n%s\n\n" % (i, line)
  contrafold_output_file.write(contrafold_output_buf)
  contrafold_output_file.close()

def run_centroidhomfold(centroidhomfold_params):
  (rna_file_path, centroidhomfold_output_file_path, gamma_str, temp_dir_path) = centroidhomfold_params
  recs = [rec for rec in SeqIO.parse(rna_file_path, "fasta")]
  rec_seq_len = len(recs)
  centroidhomfold_output_file = open(centroidhomfold_output_file_path, "w+")
  centroidhomfold_output_buf = ""
  basename = os.path.basename(rna_file_path)
  seq_file_path = os.path.join(temp_dir_path, "seqs_4_%s_and_gamma=%s.fa" % (basename, gamma_str))
  hom_seq_file_path = os.path.join(temp_dir_path, "hom_seqs_4_%s_and_gamma=%s.fa" % (basename, gamma_str))
  for (i, rec) in enumerate(recs):
    SeqIO.write([rec], open(seq_file_path, "w"), "fasta")
    hom_recs = [rec for (j, rec) in enumerate(recs) if j != i]
    SeqIO.write(recs, open(hom_seq_file_path, "w"), "fasta")
    centroidhomfold_command = "centroid_homfold " + seq_file_path + " -H " + hom_seq_file_path + " -g " + gamma_str
    (output, _, _) = utils.run_command(centroidhomfold_command)
    centroidhomfold_output_buf += ">%d\n%s\n\n" % (i, str(output).split("\\n")[2].split()[0])
  centroidhomfold_output_file.write(centroidhomfold_output_buf)
  centroidhomfold_output_file.close()

def run_turbofold(turbofold_config_file_path):
  turbofold_command = "TurboFold-smp " + turbofold_config_file_path
  utils.run_command(turbofold_command)

def read_ct_file(ct_file_path):
  ct_file = open(ct_file_path, "r")
  lines = ct_file.readlines()
  seq_len = int(lines[0].split()[0])
  ss_string = ["." for i in range(seq_len)]
  num_of_lines = len(lines)
  for line in lines[1 : num_of_lines]:
    if "ENERGY" in line:
      break
    substrings = line.split()
    index_1 = int(substrings[0])
    index_2 = int(substrings[4])
    if index_2 == 0 or index_1 >= index_2:
      continue
    ss_string[index_1 - 1] = "("
    ss_string[index_2 - 1] = ")"
  return "".join(ss_string)

if __name__ == "__main__":
  main()