crast 1.0.4

CRAST, Context RNA Alignment Search Tool
Documentation
#! /usr/bin/env python

import os
import utls
import matplotlib.pyplot as pyplt
import seaborn as sbrn
from decimal import Decimal as Dcml
from collections import OrderedDict as OrdrDct
from pandas import DataFrame as DtFrm
from pandas import pivot_table as pvt_tbl

def mn():
  (cr_wrk_dr, ast_dr, prgrm_dr, cnd_prgrm_dr) = utls.gt_drs()
  clr_plt = sbrn.color_palette()
  utls.int_mtpltlb()
  otpt_dr = ast_dr + "/crast_otpts"
  utls.st_dr_cln(otpt_dr)
  (fg, ax) = pyplt.subplots()
  rf_sq_fl = ast_dr + "/m_mscls_ncrnas.fa"
  hmlg_rlts = ast_dr + "/h_spns_m_mscls_hmlg_lncrnas.dat"
  hmlg_rlts = utls.gt_prs_hmlg_rlts(hmlg_rlts)
  img_dr = ast_dr + "/imgs"
  if not os.path.isdir(img_dr):
    os.mkdir(img_dr)
  cntxt_mtch_prbs = [str(cntxt_mtch_prb / 100.) for cntxt_mtch_prb in range(0, 55, 5)]
  pstvs = []
  tps = []
  for cntxt_mtch_prb in cntxt_mtch_prbs:
    otpt_fl = otpt_dr + "/h_spns_m_mscls_hmlg_lncrnas_2_m_mscls_ncrnas_wth_cntxt_mtch_prb_" + cntxt_mtch_prb + ".maf"
    if not os.path.isfile(otpt_fl):
      cmnd = prgrm_dr + "/bin/crast --db " + ast_dr + "/m_mscls_ncrna_db " + " -i " + ast_dr + "/h_spns_m_mscls_hmlg_lncrnas.fa -o " + otpt_fl + " --cp " + cntxt_mtch_prb + " -u " + ast_dr + "/h_spns_m_mscls_hmlg_lncrna_cntxt_dst_sqs.dat"
      utls.rn_cmnd(cmnd)
    pstv_algns = utls.gt_prs_algns(otpt_fl)
    tp = fp = 0.
    fnd_algns = []
    for pstv_algn in pstv_algns:
      algn_pr = (pstv_algn[1], pstv_algn[2])
      if algn_pr in fnd_algns:
        continue
      if utls.is_gn_prdctd(hmlg_rlts[pstv_algn[2]], pstv_algn[1], rf_sq_fl):
        tp += 1
      else:
        fp += 1
      fnd_algns.append(algn_pr)
    pstvs.append(tp + fp)
    tps.append(tp)
  sbrn.barplot(cntxt_mtch_prbs, pstvs, ax = ax, fc = clr_plt[1])
  sbrn.barplot(cntxt_mtch_prbs, tps, ax = ax, fc = clr_plt[0])
  pstv_brs = pyplt.Rectangle((0, 0), 1, 1, fc = clr_plt[1])
  tp_brs = pyplt.Rectangle((0, 0), 1, 1, fc = clr_plt[0])
  pyplt.legend([tp_brs, pstv_brs], ["TPs", "FPs"], loc = 0)
  ax.set_xlabel("Match probability")
  ax.set_ylabel("# of positives")
  fg.savefig(img_dr + "/crast_cntxt_mtch_prb_pstv_nm_rlt.eps", bbox_inches = "tight")
  (fg, ax) = pyplt.subplots()
  e_vls = ["%.1e" % Dcml(str(10. ** e_vl)) for e_vl in range(-1, 7, 1)]
  pstvs = []
  tps = []
  for e_vl in e_vls:
    otpt_fl = otpt_dr + "/h_spns_m_mscls_hmlg_lncrnas_2_m_mscls_ncrnas_wth_sd_e_vl_" + e_vl + ".maf"
    if not os.path.isfile(otpt_fl):
      cmnd = prgrm_dr + "/bin/crast --db " + ast_dr + "/m_mscls_ncrna_db " + " -i " + ast_dr + "/h_spns_m_mscls_hmlg_lncrnas.fa -o " + otpt_fl + " --e1 " + e_vl + " -u " + ast_dr + "/h_spns_m_mscls_hmlg_lncrna_cntxt_dst_sqs.dat"
      utls.rn_cmnd(cmnd)
    pstv_algns = utls.gt_prs_algns(otpt_fl)
    tp = fp = 0.
    fnd_algns = []
    for pstv_algn in pstv_algns:
      algn_pr = (pstv_algn[1], pstv_algn[2])
      if algn_pr in fnd_algns:
        continue
      if utls.is_gn_prdctd(hmlg_rlts[pstv_algn[2]], pstv_algn[1], rf_sq_fl):
        tp += 1
      else:
        fp += 1
      fnd_algns.append(algn_pr)
    pstvs.append(tp + fp)
    tps.append(tp)
  sbrn.barplot(e_vls, pstvs, ax = ax, fc = clr_plt[1], order = e_vls)
  sbrn.barplot(e_vls, tps, ax = ax, fc = clr_plt[0], order = e_vls)
  pstv_brs = pyplt.Rectangle((0, 0), 1, 1, fc = clr_plt[1])
  tp_brs = pyplt.Rectangle((0, 0), 1, 1, fc = clr_plt[0])
  pyplt.legend([tp_brs, pstv_brs], ["TPs", "FPs"], loc = 0)
  ax.set_xticklabels(ax.xaxis.get_majorticklabels(), rotation = 45)
  ax.set_xlabel("Thres. of # E-value of seeds")
  ax.set_ylabel("# of positives")
  fg.savefig(img_dr + "/crast_sd_e_vl_pstv_nm_rlt.eps", bbox_inches = "tight")
  (fg, ax) = pyplt.subplots()
  bs_rts = [str(bs_rt / 10.) for bs_rt in range(0, 11, 1)]
  pstvs = []
  tps = []
  for bs_rt in bs_rts:
    otpt_fl = otpt_dr + "/h_spns_m_mscls_hmlg_lncrnas_2_m_mscls_ncrnas_wth_bs_rt_" + bs_rt + ".maf"
    if not os.path.isfile(otpt_fl):
      cmnd = prgrm_dr + "/bin/crast --db " + ast_dr + "/m_mscls_ncrna_db " + " -i " + ast_dr + "/h_spns_m_mscls_hmlg_lncrnas.fa -o " + otpt_fl + " -b " + bs_rt + " -u " + ast_dr + "/h_spns_m_mscls_hmlg_lncrna_cntxt_dst_sqs.dat"
      utls.rn_cmnd(cmnd)
    pstv_algns = utls.gt_prs_algns(otpt_fl)
    tp = fp = 0.
    fnd_algns = []
    for pstv_algn in pstv_algns:
      algn_pr = (pstv_algn[1], pstv_algn[2])
      if algn_pr in fnd_algns:
        continue
      if utls.is_gn_prdctd(hmlg_rlts[pstv_algn[2]], pstv_algn[1], rf_sq_fl):
        tp += 1
      else:
        fp += 1
      fnd_algns.append(algn_pr)
    pstvs.append(tp + fp)
    tps.append(tp)
  sbrn.barplot(bs_rts, pstvs, ax = ax, fc = clr_plt[1])
  sbrn.barplot(bs_rts, tps, ax = ax, fc = clr_plt[0])
  pstv_brs = pyplt.Rectangle((0, 0), 1, 1, fc = clr_plt[1])
  tp_brs = pyplt.Rectangle((0, 0), 1, 1, fc = clr_plt[0])
  pyplt.legend([tp_brs, pstv_brs], ["TPs", "FPs"], loc = 0)
  ax.set_xlabel("Contribution ratio of base to score")
  ax.set_ylabel("# of positives")
  fg.savefig(img_dr + "/crast_bs_rt_pstv_nm_rlt.eps", bbox_inches = "tight")
  (fg, ax) = pyplt.subplots()
  sq_e_vls = ["%.1e" % Dcml(str(10. ** e_vl)) for e_vl in range(-6, 7, 1)]
  cntxt_sq_e_vls = ["%.1e" % Dcml(str(10. ** e_vl)) for e_vl in range(-1, 7, 1)]
  dt = OrdrDct([("sq_e_vl", []), ("cntxt_sq_e_vl", []), ("pstvs", []), ("prcsn", [])])
  for sq_e_vl in sq_e_vls:
    for cntxt_sq_e_vl in cntxt_sq_e_vls:
      otpt_fl = otpt_dr + "/h_spns_m_mscls_hmlg_lncrnas_2_m_mscls_ncrnas_wth_ungp_algn_sq_e_vl_" + sq_e_vl + "_cntxt_sq_e_vl_" + cntxt_sq_e_vl + ".maf"
      if not os.path.isfile(otpt_fl): 
        cmnd = prgrm_dr + "/bin/crast --db " + ast_dr + "/m_mscls_ncrna_db " + " -i " + ast_dr + "/h_spns_m_mscls_hmlg_lncrnas.fa -o " + otpt_fl + " --e2 " + sq_e_vl + " --e3 " + cntxt_sq_e_vl + " -u " + ast_dr + "/h_spns_m_mscls_hmlg_lncrna_cntxt_dst_sqs.dat"
        utls.rn_cmnd(cmnd)
      pstv_algns = utls.gt_prs_algns(otpt_fl)
      tp = fp = 0
      fnd_algns = []
      for pstv_algn in pstv_algns:
        algn_pr = (pstv_algn[1], pstv_algn[2])
        if algn_pr in fnd_algns:
          continue
        if utls.is_gn_prdctd(hmlg_rlts[pstv_algn[2]], pstv_algn[1], rf_sq_fl):
          tp += 1
        else:
          fp += 1
        fnd_algns.append(algn_pr)
      dt["sq_e_vl"].append(float(sq_e_vl))
      dt["cntxt_sq_e_vl"].append(float(cntxt_sq_e_vl))
      dt["pstvs"].append(str(tp) + "/" + str(fp))
      dt["prcsn"].append(tp / float(tp + fp))
  dt = DtFrm(dt)
  ants = pvt_tbl(data = dt, values = "pstvs", index = "sq_e_vl", columns = "cntxt_sq_e_vl", aggfunc = lambda x: " ".join(x))
  dt = pvt_tbl(data = dt, values = "prcsn", index = "sq_e_vl", columns = "cntxt_sq_e_vl")
  sbrn.heatmap(dt, annot = ants, fmt = "s", xticklabels = cntxt_sq_e_vls, yticklabels = sq_e_vls)
  pyplt.xticks(rotation = 45)
  pyplt.yticks(rotation = 0)
  ax.set_xlabel("# E-value of ungapped aligns on context seq.")
  ax.set_ylabel("# E-value of ungapped aligns on seq.")
  fg.savefig(img_dr + "/crast_ungp_algn_e_vl_pstv_nm_rlt.eps", bbox_inches = "tight")
  (fg, ax) = pyplt.subplots()
  dt = OrdrDct([("sq_e_vl", []), ("cntxt_sq_e_vl", []), ("pstvs", []), ("prcsn", [])])
  for sq_e_vl in sq_e_vls:
    for cntxt_sq_e_vl in cntxt_sq_e_vls:
      otpt_fl = otpt_dr + "/h_spns_m_mscls_hmlg_lncrnas_2_m_mscls_ncrnas_wth_gp_algn_sq_e_vl_" + sq_e_vl + "_cntxt_sq_e_vl_" + cntxt_sq_e_vl + ".maf"
      if not os.path.isfile(otpt_fl): 
        cmnd = prgrm_dr + "/bin/crast --db " + ast_dr + "/m_mscls_ncrna_db " + " -i " + ast_dr + "/h_spns_m_mscls_hmlg_lncrnas.fa -o " + otpt_fl + " --e4 " + sq_e_vl + " --e5 " + cntxt_sq_e_vl + " -u " + ast_dr + "/h_spns_m_mscls_hmlg_lncrna_cntxt_dst_sqs.dat"
        utls.rn_cmnd(cmnd)
      pstv_algns = utls.gt_prs_algns(otpt_fl)
      tp = fp = 0
      fnd_algns = []
      for pstv_algn in pstv_algns:
        algn_pr = (pstv_algn[1], pstv_algn[2])
        if algn_pr in fnd_algns:
          continue
        if utls.is_gn_prdctd(hmlg_rlts[pstv_algn[2]], pstv_algn[1], rf_sq_fl):
          tp += 1
        else:
          fp += 1
        fnd_algns.append(algn_pr)
      dt["sq_e_vl"].append(float(sq_e_vl))
      dt["cntxt_sq_e_vl"].append(float(cntxt_sq_e_vl))
      dt["pstvs"].append(str(tp) + "/" + str(fp))
      dt["prcsn"].append(tp / float(tp + fp))
  dt = DtFrm(dt)
  ants = pvt_tbl(data = dt, values = "pstvs", index = "sq_e_vl", columns = "cntxt_sq_e_vl", aggfunc = lambda x: " ".join(x))
  dt = pvt_tbl(data = dt, values = "prcsn", index = "sq_e_vl", columns = "cntxt_sq_e_vl")
  sbrn.heatmap(dt, annot = ants, fmt = "s", xticklabels = cntxt_sq_e_vls, yticklabels = sq_e_vls)
  pyplt.xticks(rotation = 45)
  pyplt.yticks(rotation = 0)
  ax.set_xlabel("# E-value of gapped aligns on context seq.")
  ax.set_ylabel("# E-value of gapped aligns on seq.")
  fg.savefig(img_dr + "/crast_gp_algn_e_vl_pstv_nm_rlt.eps", bbox_inches = "tight")
  (fg, ax) = pyplt.subplots()
  dt = OrdrDct([("sq_e_vl", []), ("cntxt_sq_e_vl", []), ("pstvs", []), ("prcsn", [])])
  for sq_e_vl in sq_e_vls:
    for cntxt_sq_e_vl in cntxt_sq_e_vls:
      otpt_fl = otpt_dr + "/h_spns_m_mscls_hmlg_lncrnas_2_m_mscls_ncrnas_wth_fnl_gp_algn_sq_e_vl_" + sq_e_vl + "_cntxt_sq_e_vl_" + cntxt_sq_e_vl + ".maf"
      if not os.path.isfile(otpt_fl): 
        cmnd = prgrm_dr + "/bin/crast --db " + ast_dr + "/m_mscls_ncrna_db " + " -i " + ast_dr + "/h_spns_m_mscls_hmlg_lncrnas.fa -o " + otpt_fl + " --e6 " + sq_e_vl + " --e7 " + cntxt_sq_e_vl + " -u " + ast_dr + "/h_spns_m_mscls_hmlg_lncrna_cntxt_dst_sqs.dat"
        utls.rn_cmnd(cmnd)
      pstv_algns = utls.gt_prs_algns(otpt_fl)
      tp = fp = 0
      fnd_algns = []
      for pstv_algn in pstv_algns:
        algn_pr = (pstv_algn[1], pstv_algn[2])
        if algn_pr in fnd_algns:
          continue
        if utls.is_gn_prdctd(hmlg_rlts[pstv_algn[2]], pstv_algn[1], rf_sq_fl):
          tp += 1
        else:
          fp += 1
        fnd_algns.append(algn_pr)
      dt["sq_e_vl"].append(float(sq_e_vl))
      dt["cntxt_sq_e_vl"].append(float(cntxt_sq_e_vl))
      dt["pstvs"].append(str(tp) + "/" + str(fp))
      dt["prcsn"].append(tp / float(tp + fp))
  dt = DtFrm(dt)
  ants = pvt_tbl(data = dt, values = "pstvs", index = "sq_e_vl", columns = "cntxt_sq_e_vl", aggfunc = lambda x: " ".join(x))
  dt = pvt_tbl(data = dt, values = "prcsn", index = "sq_e_vl", columns = "cntxt_sq_e_vl")
  sbrn.heatmap(dt, annot = ants, fmt = "s", xticklabels = cntxt_sq_e_vls, yticklabels = sq_e_vls)
  pyplt.xticks(rotation = 45)
  pyplt.yticks(rotation = 0)
  ax.set_xlabel("# E-value of final gapped aligns on context seq.")
  ax.set_ylabel("# E-value of final gapped aligns on seq.")
  fg.savefig(img_dr + "/crast_fnl_gp_algn_e_vl_pstv_nm_rlt.eps", bbox_inches = "tight")

if __name__ == "__main__":
  mn()