crast 1.0.4

CRAST, Context RNA Alignment Search Tool
Documentation
#! /usr/bin/env python

import os
import utls
from Bio import SeqIO as SqIO
from bx.align import maf
import numpy as nmpy
from math import sqrt

def mn():
  (cr_wrk_dr, ast_dr, prgrm_dr, cnd_prgrm_dr) = utls.gt_drs()
  pstv_algn_fls = [
    ast_dr + "/h_spns_m_mscls_hmlg_lncrnas_2_hmlg_m_mscls_ncrnas.maf",
    ast_dr + "/fldlgn_h_spns_m_mscls_hmlg_lncrnas_2_hmlg_m_mscls_ncrnas.dat",
  ]
  ngtv_algn_fls = [
    ast_dr + "/shfl_h_spns_m_mscls_hmlg_lncrnas_2_hmlg_m_mscls_ncrnas.maf",
    ast_dr + "/fldlgn_shfl_h_spns_m_mscls_hmlg_lncrnas_2_hmlg_m_mscls_ncrnas.dat",
  ]
  rf_sq_fl = ast_dr + "/hmlg_m_mscls_ncrnas.fa"
  hmlg_rlts = ast_dr + "/h_spns_m_mscls_hmlg_lncrnas.dat"
  hmlg_rlts = utls.gt_prs_hmlg_rlts(hmlg_rlts)
  rf_sq_nm = len([rc for rc in SqIO.parse(rf_sq_fl, "fasta")])
  for pstv_algn_fl, ngtv_algn_fl in zip(pstv_algn_fls, ngtv_algn_fls):
    pstv_algns = gt_prs_fldlgn_algns(pstv_algn_fl) if pstv_algn_fl.find("fldlgn") != -1 else utls.gt_prs_algns(pstv_algn_fl)
    ngtv_algns = gt_prs_fldlgn_algns(ngtv_algn_fl) if ngtv_algn_fl.find("fldlgn") != -1 else utls.gt_prs_algns(ngtv_algn_fl)
    tp = fp = tn = fn = 0.
    fnd_algns = []
    for pstv_algn in pstv_algns:
      algn_pr = (pstv_algn[1], pstv_algn[2])
      if algn_pr in fnd_algns:
        continue
      if utls.is_gn_prdctd(hmlg_rlts[pstv_algn[2]], pstv_algn[1], rf_sq_fl):
        tp += 1
      else:
        fp += 1
      fnd_algns.append(algn_pr)
    fp += rf_sq_nm - tp
    fnd_algns = []
    for ngtv_algn in ngtv_algns:
      algn_pr = (ngtv_algn[1], ngtv_algn[2])
      if algn_pr in fnd_algns:
        continue
      if not utls.is_gn_prdctd(hmlg_rlts[ngtv_algn[2]], ngtv_algn[1], rf_sq_fl):
        tn += 1
      else:
        fn += 1
      fnd_algns.append(algn_pr)
    tn += rf_sq_nm - fn
    print tp, fp, tn, fn
    prcsn = tp / (tp + fp)
    rcl = tp / (tp + fn)
    f_ms = 2 * rcl * prcsn / (rcl + prcsn)
    print f_ms

def gt_prs_fldlgn_algns(algn_fl):
  prs_algns = []
  inpt_hndl = open(algn_fl, "rU")
  lns = inpt_hndl.readlines()
  for ln in lns:
    if ln.startswith("; ALIGNMENT_LIST"):
      id_pr = ln.split()
      id_pr = (id_pr[2], id_pr[3])
      prs_algns.append((0, id_pr[1], id_pr[0]))
    elif ln.startswith("; FOLDALIGN_SCORE"):
      prs_algns[-1] = (int(ln.split()[2]), prs_algns[-1][1], prs_algns[-1][2])
  return prs_algns

if __name__ == "__main__":
  mn()