crast 1.0.4

CRAST, Context RNA Alignment Search Tool
Documentation
extern crate num_cpus;
extern crate cpr;
extern crate bzip2;
extern crate bio;
extern crate crast;

#[macro_use]
pub mod utls;

use bzip2::Compression;
use bzip2::write::BzEncoder;
use utls::*;
use bio::data_structures::suffix_array::{suffix_array, RawSuffixArray};

const SNTNL: BElm = '$' as BElm;

fn main() {
  let args = env::args().collect::<Vec<Arg>>();
  let prgrm = args[0].clone();
  let mut opts = Options::new();
  opts.reqopt("i", "", "Ref. seqs in single file of FASTA format", "STR");
  opts.reqopt("o", "", "Dir. to locate output database files", "STR");
  opts.optopt("", "ms", &(String::from("Max. span between paired bases (Default: ") + &DFLT_MX_SPN.to_string() + ")"), "UINT");
  opts.optopt("t", "", "# of threads in multithreading (Default: system val.)", "UINT");
  opts.optflag("", "nr", "search only forward strands of ref. seqs (Default: also search reverse strands)");
  opts.optflag("a", "", "Approximate estimation of ref. RNA context seqs");
  opts.optflag("h", "help", "Print help menu");
  let mtchs = match opts.parse(&args[1 ..]) {
    Ok(mtch) => {mtch}
    Err(fl) => {prnt_usg(&prgrm, &opts); panic!(fl.to_string())}
  };
  if mtchs.opt_present("h") {
    prnt_usg(&prgrm, &opts);
    return;
  }
  let t = if mtchs.opt_present("t") {
    mtchs.opt_str("t").expect("Failed to retrieve # of threads from command args").parse().expect("Failed to parse # of threads")
  } else {
    num_cpus::get() as ThrdNm
  };
  let ms = if mtchs.opt_present("ms") {
    mtchs.opt_str("ms").expect("Failed to retrieve max. span between paired bases from command args").parse().expect("Failed to parse max. span between paired bases")
  } else {
    DFLT_MX_SPN
  };
  let i = mtchs.opt_str("i").expect("Failed to retrieve input file from command args");
  let o = mtchs.opt_str("o").expect("Failed to retrieve output file from command args");
  let a = mtchs.opt_present("a");
  let otpt_dr = Path::new(&o);
  if !otpt_dr.exists() {
    let _ = create_dir(otpt_dr);
  }
  let nr = mtchs.opt_present("nr");
  let mut otpt_fl = BufWriter::new(File::create(otpt_dr.join(CNFG_FL)).expect("Failed to create output file"));
  let _ = otpt_fl.write_all(if nr {b"true"} else {b"false"});
  let tmp_dr = String::from("/tmp/crast_db_") + &gt_tm_stmp();
  let tmp_dr = Path::new(&tmp_dr);
  if !tmp_dr.exists() {
    let _ = create_dir(tmp_dr);
  }
  let tmp_cntxt_dst_sq_dr = tmp_dr.join("cntxt_dst_sqs");
  let tmp_cntxt_dst_sq_dr = Path::new(&tmp_cntxt_dst_sq_dr);
  if !tmp_cntxt_dst_sq_dr.exists() {
    let _ = create_dir(tmp_cntxt_dst_sq_dr);
  }
  let tmp_rvrs_cntxt_dst_sq_dr = tmp_dr.join("rvrs_cntxt_dst_sqs");
  let tmp_rvrs_cntxt_dst_sq_dr = Path::new(&tmp_rvrs_cntxt_dst_sq_dr);
  if !tmp_rvrs_cntxt_dst_sq_dr.exists() {
    let _ = create_dir(tmp_rvrs_cntxt_dst_sq_dr);
  }
  let tmp_sfx_ar_dr = tmp_dr.join("sfx_ars");
  let tmp_sfx_ar_dr = Path::new(&tmp_sfx_ar_dr);
  if !tmp_sfx_ar_dr.exists() {
    let _ = create_dir(tmp_sfx_ar_dr);
  }
  let tmp_rvrs_sfx_ar_dr = tmp_dr.join("rvrs_sfx_ars");
  let tmp_rvrs_sfx_ar_dr = Path::new(&tmp_rvrs_sfx_ar_dr);
  if !tmp_rvrs_sfx_ar_dr.exists() {
    let _ = create_dir(tmp_rvrs_sfx_ar_dr);
  }
  let tmp_ptrn_hsh_mp_dr = tmp_dr.join("ptrn_hsh_mps");
  let tmp_ptrn_hsh_mp_dr = Path::new(&tmp_ptrn_hsh_mp_dr);
  if !tmp_ptrn_hsh_mp_dr.exists() {
    let _ = create_dir(tmp_ptrn_hsh_mp_dr);
  }
  let tmp_rvrs_ptrn_hsh_mp_dr = tmp_dr.join("rvrs_ptrn_hsh_mps");
  let tmp_rvrs_ptrn_hsh_mp_dr = Path::new(&tmp_rvrs_ptrn_hsh_mp_dr);
  if !tmp_rvrs_ptrn_hsh_mp_dr.exists() {
    let _ = create_dir(tmp_rvrs_ptrn_hsh_mp_dr);
  }
  let tmp_sq_dr = tmp_dr.join("sqs");
  let tmp_sq_dr = Path::new(&tmp_sq_dr);
  if !tmp_sq_dr.exists() {
    let _ = create_dir(tmp_sq_dr);
  }
  let fasta_rdr = Reader::from_file(Path::new(&i)).expect("Failed to read FASTA file");
  let mut db_sz = 0;
  let mut thrd_pl = Pool::new(t);
  thrd_pl.scoped(|scp| {
    for rc in fasta_rdr.records() {
      let dt = rc.expect("Failed to read FASTA record");
      let mut sq = dt.seq().to_vec();
      db_sz += sq.len();
      scp.execute(move || {
        let id = dt.id().expect("Failed to get FASTA record ID");
        let cntxt_dst_sq = cpr(&sq, ms, a);
        wrt_cntxt_dst_sq(id, &cntxt_dst_sq, tmp_cntxt_dst_sq_dr);
        if !nr {
          let mut rvrs_sq = sq.iter().rev().map(|chr| {gt_cmplmnt_bs(*chr)}).collect::<BSq>();
          let rvrs_cntxt_dst_sq = cpr(&rvrs_sq, ms, a);
          wrt_cntxt_dst_sq(id, &rvrs_cntxt_dst_sq, tmp_rvrs_cntxt_dst_sq_dr);
          rvrs_sq.push(SNTNL);
          let rvrs_sfx_ar = suffix_array(&rvrs_sq);
          wrt_sfx_ar(id, &rvrs_sfx_ar, tmp_rvrs_sfx_ar_dr);
          let rvrs_ptrn_hsh_mp = gt_k_mr_hsh_mp(&rvrs_sq, &rvrs_sfx_ar, rvrs_sq.len());
          wrt_ptrn_hsh_mp(id, &rvrs_ptrn_hsh_mp, tmp_rvrs_ptrn_hsh_mp_dr);
        }
        sq.push(SNTNL);
        wrt_sq(id, &sq, tmp_sq_dr);
        let sfx_ar = suffix_array(&sq);
        wrt_sfx_ar(id, &sfx_ar, tmp_sfx_ar_dr);
        let ptrn_hsh_mp = gt_k_mr_hsh_mp(&sq, &sfx_ar, sq.len());
        wrt_ptrn_hsh_mp(id, &ptrn_hsh_mp, tmp_ptrn_hsh_mp_dr);
      });
    }
  });
  let mut otpt_fl = BufWriter::new(File::create(otpt_dr.join(DB_SZ_FL)).expect("Failed to create output file"));
  let _ = otpt_fl.write_all(db_sz.to_string().as_bytes());
  let mut otpt_fl_4_ids = BzEncoder::new(File::create(otpt_dr.join(ID_FL)).expect("Failed to create output file"), Compression::Best);
  let mut otpt_fl_4_prb_dst_sqs = BzEncoder::new(File::create(otpt_dr.join(PRB_DST_SQ_FL)).expect("Failed to create output file"), Compression::Best);
  let mut otpt_fl_4_rvrs_prb_dst_sqs = BzEncoder::new(File::create(otpt_dr.join(RVRS_PRB_DST_SQ_FL)).expect("Failed to create output file"), Compression::Best);
  let mut otpt_fl_4_sfx_ars = BzEncoder::new(File::create(otpt_dr.join(SFX_AR_FL)).expect("Failed to create output file"), Compression::Best);
  let mut otpt_fl_4_rvrs_sfx_ars = BzEncoder::new(File::create(otpt_dr.join(RVRS_SFX_AR_FL)).expect("Failed to create output file"), Compression::Best);
  let mut otpt_fl_4_ptrn_hsh_mps = BzEncoder::new(File::create(otpt_dr.join(PTRN_HSH_MP_FL)).expect("Failed to create output file"), Compression::Best);
  let mut otpt_fl_4_rvrs_ptrn_hsh_mps = BzEncoder::new(File::create(otpt_dr.join(RVRS_PTRN_HSH_MP_FL)).expect("Failed to create output file"), Compression::Best);
  let mut otpt_fl_4_sqs = BzEncoder::new(File::create(otpt_dr.join(SQ_FL)).expect("Failed to create output file"), Compression::Best);
  for prb_dst_sq in tmp_cntxt_dst_sq_dr.read_dir().expect("Failed to read entries within temp. dir.") {
    let prb_dst_sq = prb_dst_sq.expect("Failed to read entry within temp. dir.");
    let prb_dst_sq_pth = prb_dst_sq.path();
    let fl_nm = prb_dst_sq_pth.file_name().expect("Failed to get file name of entry within temp. dir.").to_os_string();
    let mut prb_dst_sq = BufReader::new(File::open(prb_dst_sq_pth).expect("Failed to open entry within temp. dir."));
    let mut bfr = Vec::new();
    let _ = prb_dst_sq.read_to_end(&mut bfr);
    let frst_ln_brk_ps = bfr.iter().position(|chr| {*chr == '\n' as u8}).expect("Failed to find line break");
    let rst = bfr.split_off(frst_ln_brk_ps + 1);
    let _ = otpt_fl_4_prb_dst_sqs.write_all(&rst);
    let _ = otpt_fl_4_ids.write_all(&bfr[1 ..]);
    let mut sfx_ar = BufReader::new(File::open(tmp_sfx_ar_dr.join(&fl_nm)).expect("Failed to open entry within temp. dir."));
    let mut bfr = Vec::new();
    let _ = sfx_ar.read_to_end(&mut bfr);
    let _ = otpt_fl_4_sfx_ars.write_all(&bfr);
    let mut ptrn_hsh_mp = BufReader::new(File::open(tmp_ptrn_hsh_mp_dr.join(&fl_nm)).expect("Failed to open entry within temp. dir."));
    let mut bfr = Vec::new();
    let _ = ptrn_hsh_mp.read_to_end(&mut bfr);
    let _ = otpt_fl_4_ptrn_hsh_mps.write_all(&bfr);
    if !nr {
      let mut rvrs_prb_dst_sq = BufReader::new(File::open(tmp_rvrs_cntxt_dst_sq_dr.join(&fl_nm)).expect("Failed to open entry within temp. dir."));
      let mut bfr = Vec::new();
      let _ = rvrs_prb_dst_sq.read_to_end(&mut bfr);
      let frst_ln_brk_ps = bfr.iter().position(|chr| {*chr == '\n' as u8}).expect("Failed to find line break");
      let rst = bfr.split_off(frst_ln_brk_ps + 1);
      let _ = otpt_fl_4_rvrs_prb_dst_sqs.write_all(&rst);
      let mut rvrs_sfx_ar = BufReader::new(File::open(tmp_rvrs_sfx_ar_dr.join(&fl_nm)).expect("Failed to open entry within temp. dir."));
      let mut bfr = Vec::new();
      let _ = rvrs_sfx_ar.read_to_end(&mut bfr);
      let _ = otpt_fl_4_rvrs_sfx_ars.write_all(&bfr);
      let mut rvrs_ptrn_hsh_mp = BufReader::new(File::open(tmp_rvrs_ptrn_hsh_mp_dr.join(&fl_nm)).expect("Failed to open entry within temp. dir."));
      let mut bfr = Vec::new();
      let _ = rvrs_ptrn_hsh_mp.read_to_end(&mut bfr);
      let _ = otpt_fl_4_rvrs_ptrn_hsh_mps.write_all(&bfr);
    }
    let mut sq = BufReader::new(File::open(tmp_sq_dr.join(&fl_nm)).expect("Failed to open entry within temp. dir."));
    let mut bfr = Vec::new();
    let _ = sq.read_to_end(&mut bfr);
    let _ = otpt_fl_4_sqs.write_all(&bfr);
  }
  let _ = remove_dir_all(tmp_dr);
}

#[inline]
fn wrt_sq(id: IdStr, sq: &BSq, tmp_dr: &Path) {
  let mut tmp_otpt_fl = BufWriter::new(File::create(tmp_dr.join(&(String::from(id) + ".dat"))).expect("Failed to create temp. output file"));
  let _ = tmp_otpt_fl.write_all(sq);
  let _ = tmp_otpt_fl.write_all(b"\n");
}

#[inline]
fn wrt_sfx_ar(id: IdStr, sfx_ar: &RawSuffixArray, tmp_dr: &Path) {
  let mut bfr = sfx_ar.iter().fold(String::new(), |acmltr, sfx_indx| acmltr + &format!("{} ", sfx_indx));
  bfr.pop();
  let mut tmp_otpt_fl = BufWriter::new(File::create(tmp_dr.join(&(String::from(id) + ".dat"))).expect("Failed to create temp. output file"));
  let _ = tmp_otpt_fl.write_all(bfr.as_bytes());
  let _ = tmp_otpt_fl.write_all(b"\n");
}

#[inline]
fn wrt_ptrn_hsh_mp(id: IdStr, ptrn_hsh_mp: &PtrnHshMp, tmp_dr: &Path) {
  let mut bfr = ptrn_hsh_mp.iter().fold(String::new(), |acmltr_1, ky_vl_pr| {
    let mut vc_str = ky_vl_pr.0.iter().fold(String::new(), |acmltr_2, &elm| acmltr_2 + &format!("{} ", elm));
    vc_str.pop();
    acmltr_1 + &format!("{},{},{}\t", &vc_str, (ky_vl_pr.1).0, (ky_vl_pr.1).1)
  });
  bfr.pop();
  let mut tmp_otpt_fl = BufWriter::new(File::create(tmp_dr.join(&(String::from(id) + ".dat"))).expect("Failed to create temp. output file"));
  let _ = tmp_otpt_fl.write_all(bfr.as_bytes());
  let _ = tmp_otpt_fl.write_all(b"\n");
}

#[inline]
fn gt_k_mr_hsh_mp(sq: &BSq, sfx_ar: &RawSuffixArray, sq_ln: usize) -> PtrnHshMp {
  let mut k_mr_hsh_mp = HashMap::default();
  let alphbt = vec![A, T, U, G, C];
  let alphbt_ln = alphbt.len();
  let mut k_mrs = vec![Vec::new()];
  let fl_srch_rng = (0, sq_ln);
  for _ in 0 .. MX_K_MR_SZ {
    let mut nw_k_mrs = Vec::new();
    let ofst = k_mrs[0].len();
    for chr in &alphbt {
      for k_mr in &k_mrs {
        let mut nw_k_mr = k_mr.clone();
        nw_k_mr.push(*chr);
        let sfx_ar_indx_pr = match k_mr_hsh_mp.get(k_mr) {
          Some(&srch_rng) => {
            gt_sfx_ar_indx_pr(&nw_k_mr[..], sq, sfx_ar, &srch_rng, ofst, sq_ln)
          },
          None => {
            if k_mr_hsh_mp.len() < alphbt_ln {
             gt_sfx_ar_indx_pr(&nw_k_mr[..], sq, sfx_ar, &fl_srch_rng, ofst, sq_ln)
            } else {
              (0, 0)
            }
          },
        };
        if sfx_ar_indx_pr.1 - sfx_ar_indx_pr.0 > 0 {
          nw_k_mrs.push(nw_k_mr.clone());
          k_mr_hsh_mp.insert(nw_k_mr, sfx_ar_indx_pr);
        }
      }
    }
    if nw_k_mrs.len() == 0 {
      break;
    }
    k_mrs = nw_k_mrs;
  }
  k_mr_hsh_mp
}