extern crate num_cpus;
extern crate cpr;
extern crate bzip2;
extern crate bio;
extern crate crast;
#[macro_use]
pub mod utls;
use bzip2::Compression;
use bzip2::write::BzEncoder;
use utls::*;
use bio::data_structures::suffix_array::{suffix_array, RawSuffixArray};
const SNTNL: BElm = '$' as BElm;
fn main() {
let args = env::args().collect::<Vec<Arg>>();
let prgrm = args[0].clone();
let mut opts = Options::new();
opts.reqopt("i", "", "Ref. seqs in single file of FASTA format", "STR");
opts.reqopt("o", "", "Dir. to locate output database files", "STR");
opts.optopt("", "ms", &(String::from("Max. span between paired bases (Default: ") + &DFLT_MX_SPN.to_string() + ")"), "UINT");
opts.optopt("t", "", "# of threads in multithreading (Default: system val.)", "UINT");
opts.optflag("", "nr", "search only forward strands of ref. seqs (Default: also search reverse strands)");
opts.optflag("a", "", "Approximate estimation of ref. RNA context seqs");
opts.optflag("h", "help", "Print help menu");
let mtchs = match opts.parse(&args[1 ..]) {
Ok(mtch) => {mtch}
Err(fl) => {prnt_usg(&prgrm, &opts); panic!(fl.to_string())}
};
if mtchs.opt_present("h") {
prnt_usg(&prgrm, &opts);
return;
}
let t = if mtchs.opt_present("t") {
mtchs.opt_str("t").expect("Failed to retrieve # of threads from command args").parse().expect("Failed to parse # of threads")
} else {
num_cpus::get() as ThrdNm
};
let ms = if mtchs.opt_present("ms") {
mtchs.opt_str("ms").expect("Failed to retrieve max. span between paired bases from command args").parse().expect("Failed to parse max. span between paired bases")
} else {
DFLT_MX_SPN
};
let i = mtchs.opt_str("i").expect("Failed to retrieve input file from command args");
let o = mtchs.opt_str("o").expect("Failed to retrieve output file from command args");
let a = mtchs.opt_present("a");
let otpt_dr = Path::new(&o);
if !otpt_dr.exists() {
let _ = create_dir(otpt_dr);
}
let nr = mtchs.opt_present("nr");
let mut otpt_fl = BufWriter::new(File::create(otpt_dr.join(CNFG_FL)).expect("Failed to create output file"));
let _ = otpt_fl.write_all(if nr {b"true"} else {b"false"});
let tmp_dr = String::from("/tmp/crast_db_") + >_tm_stmp();
let tmp_dr = Path::new(&tmp_dr);
if !tmp_dr.exists() {
let _ = create_dir(tmp_dr);
}
let tmp_cntxt_dst_sq_dr = tmp_dr.join("cntxt_dst_sqs");
let tmp_cntxt_dst_sq_dr = Path::new(&tmp_cntxt_dst_sq_dr);
if !tmp_cntxt_dst_sq_dr.exists() {
let _ = create_dir(tmp_cntxt_dst_sq_dr);
}
let tmp_rvrs_cntxt_dst_sq_dr = tmp_dr.join("rvrs_cntxt_dst_sqs");
let tmp_rvrs_cntxt_dst_sq_dr = Path::new(&tmp_rvrs_cntxt_dst_sq_dr);
if !tmp_rvrs_cntxt_dst_sq_dr.exists() {
let _ = create_dir(tmp_rvrs_cntxt_dst_sq_dr);
}
let tmp_sfx_ar_dr = tmp_dr.join("sfx_ars");
let tmp_sfx_ar_dr = Path::new(&tmp_sfx_ar_dr);
if !tmp_sfx_ar_dr.exists() {
let _ = create_dir(tmp_sfx_ar_dr);
}
let tmp_rvrs_sfx_ar_dr = tmp_dr.join("rvrs_sfx_ars");
let tmp_rvrs_sfx_ar_dr = Path::new(&tmp_rvrs_sfx_ar_dr);
if !tmp_rvrs_sfx_ar_dr.exists() {
let _ = create_dir(tmp_rvrs_sfx_ar_dr);
}
let tmp_ptrn_hsh_mp_dr = tmp_dr.join("ptrn_hsh_mps");
let tmp_ptrn_hsh_mp_dr = Path::new(&tmp_ptrn_hsh_mp_dr);
if !tmp_ptrn_hsh_mp_dr.exists() {
let _ = create_dir(tmp_ptrn_hsh_mp_dr);
}
let tmp_rvrs_ptrn_hsh_mp_dr = tmp_dr.join("rvrs_ptrn_hsh_mps");
let tmp_rvrs_ptrn_hsh_mp_dr = Path::new(&tmp_rvrs_ptrn_hsh_mp_dr);
if !tmp_rvrs_ptrn_hsh_mp_dr.exists() {
let _ = create_dir(tmp_rvrs_ptrn_hsh_mp_dr);
}
let tmp_sq_dr = tmp_dr.join("sqs");
let tmp_sq_dr = Path::new(&tmp_sq_dr);
if !tmp_sq_dr.exists() {
let _ = create_dir(tmp_sq_dr);
}
let fasta_rdr = Reader::from_file(Path::new(&i)).expect("Failed to read FASTA file");
let mut db_sz = 0;
let mut thrd_pl = Pool::new(t);
thrd_pl.scoped(|scp| {
for rc in fasta_rdr.records() {
let dt = rc.expect("Failed to read FASTA record");
let mut sq = dt.seq().to_vec();
db_sz += sq.len();
scp.execute(move || {
let id = dt.id().expect("Failed to get FASTA record ID");
let cntxt_dst_sq = cpr(&sq, ms, a);
wrt_cntxt_dst_sq(id, &cntxt_dst_sq, tmp_cntxt_dst_sq_dr);
if !nr {
let mut rvrs_sq = sq.iter().rev().map(|chr| {gt_cmplmnt_bs(*chr)}).collect::<BSq>();
let rvrs_cntxt_dst_sq = cpr(&rvrs_sq, ms, a);
wrt_cntxt_dst_sq(id, &rvrs_cntxt_dst_sq, tmp_rvrs_cntxt_dst_sq_dr);
rvrs_sq.push(SNTNL);
let rvrs_sfx_ar = suffix_array(&rvrs_sq);
wrt_sfx_ar(id, &rvrs_sfx_ar, tmp_rvrs_sfx_ar_dr);
let rvrs_ptrn_hsh_mp = gt_k_mr_hsh_mp(&rvrs_sq, &rvrs_sfx_ar, rvrs_sq.len());
wrt_ptrn_hsh_mp(id, &rvrs_ptrn_hsh_mp, tmp_rvrs_ptrn_hsh_mp_dr);
}
sq.push(SNTNL);
wrt_sq(id, &sq, tmp_sq_dr);
let sfx_ar = suffix_array(&sq);
wrt_sfx_ar(id, &sfx_ar, tmp_sfx_ar_dr);
let ptrn_hsh_mp = gt_k_mr_hsh_mp(&sq, &sfx_ar, sq.len());
wrt_ptrn_hsh_mp(id, &ptrn_hsh_mp, tmp_ptrn_hsh_mp_dr);
});
}
});
let mut otpt_fl = BufWriter::new(File::create(otpt_dr.join(DB_SZ_FL)).expect("Failed to create output file"));
let _ = otpt_fl.write_all(db_sz.to_string().as_bytes());
let mut otpt_fl_4_ids = BzEncoder::new(File::create(otpt_dr.join(ID_FL)).expect("Failed to create output file"), Compression::Best);
let mut otpt_fl_4_prb_dst_sqs = BzEncoder::new(File::create(otpt_dr.join(PRB_DST_SQ_FL)).expect("Failed to create output file"), Compression::Best);
let mut otpt_fl_4_rvrs_prb_dst_sqs = BzEncoder::new(File::create(otpt_dr.join(RVRS_PRB_DST_SQ_FL)).expect("Failed to create output file"), Compression::Best);
let mut otpt_fl_4_sfx_ars = BzEncoder::new(File::create(otpt_dr.join(SFX_AR_FL)).expect("Failed to create output file"), Compression::Best);
let mut otpt_fl_4_rvrs_sfx_ars = BzEncoder::new(File::create(otpt_dr.join(RVRS_SFX_AR_FL)).expect("Failed to create output file"), Compression::Best);
let mut otpt_fl_4_ptrn_hsh_mps = BzEncoder::new(File::create(otpt_dr.join(PTRN_HSH_MP_FL)).expect("Failed to create output file"), Compression::Best);
let mut otpt_fl_4_rvrs_ptrn_hsh_mps = BzEncoder::new(File::create(otpt_dr.join(RVRS_PTRN_HSH_MP_FL)).expect("Failed to create output file"), Compression::Best);
let mut otpt_fl_4_sqs = BzEncoder::new(File::create(otpt_dr.join(SQ_FL)).expect("Failed to create output file"), Compression::Best);
for prb_dst_sq in tmp_cntxt_dst_sq_dr.read_dir().expect("Failed to read entries within temp. dir.") {
let prb_dst_sq = prb_dst_sq.expect("Failed to read entry within temp. dir.");
let prb_dst_sq_pth = prb_dst_sq.path();
let fl_nm = prb_dst_sq_pth.file_name().expect("Failed to get file name of entry within temp. dir.").to_os_string();
let mut prb_dst_sq = BufReader::new(File::open(prb_dst_sq_pth).expect("Failed to open entry within temp. dir."));
let mut bfr = Vec::new();
let _ = prb_dst_sq.read_to_end(&mut bfr);
let frst_ln_brk_ps = bfr.iter().position(|chr| {*chr == '\n' as u8}).expect("Failed to find line break");
let rst = bfr.split_off(frst_ln_brk_ps + 1);
let _ = otpt_fl_4_prb_dst_sqs.write_all(&rst);
let _ = otpt_fl_4_ids.write_all(&bfr[1 ..]);
let mut sfx_ar = BufReader::new(File::open(tmp_sfx_ar_dr.join(&fl_nm)).expect("Failed to open entry within temp. dir."));
let mut bfr = Vec::new();
let _ = sfx_ar.read_to_end(&mut bfr);
let _ = otpt_fl_4_sfx_ars.write_all(&bfr);
let mut ptrn_hsh_mp = BufReader::new(File::open(tmp_ptrn_hsh_mp_dr.join(&fl_nm)).expect("Failed to open entry within temp. dir."));
let mut bfr = Vec::new();
let _ = ptrn_hsh_mp.read_to_end(&mut bfr);
let _ = otpt_fl_4_ptrn_hsh_mps.write_all(&bfr);
if !nr {
let mut rvrs_prb_dst_sq = BufReader::new(File::open(tmp_rvrs_cntxt_dst_sq_dr.join(&fl_nm)).expect("Failed to open entry within temp. dir."));
let mut bfr = Vec::new();
let _ = rvrs_prb_dst_sq.read_to_end(&mut bfr);
let frst_ln_brk_ps = bfr.iter().position(|chr| {*chr == '\n' as u8}).expect("Failed to find line break");
let rst = bfr.split_off(frst_ln_brk_ps + 1);
let _ = otpt_fl_4_rvrs_prb_dst_sqs.write_all(&rst);
let mut rvrs_sfx_ar = BufReader::new(File::open(tmp_rvrs_sfx_ar_dr.join(&fl_nm)).expect("Failed to open entry within temp. dir."));
let mut bfr = Vec::new();
let _ = rvrs_sfx_ar.read_to_end(&mut bfr);
let _ = otpt_fl_4_rvrs_sfx_ars.write_all(&bfr);
let mut rvrs_ptrn_hsh_mp = BufReader::new(File::open(tmp_rvrs_ptrn_hsh_mp_dr.join(&fl_nm)).expect("Failed to open entry within temp. dir."));
let mut bfr = Vec::new();
let _ = rvrs_ptrn_hsh_mp.read_to_end(&mut bfr);
let _ = otpt_fl_4_rvrs_ptrn_hsh_mps.write_all(&bfr);
}
let mut sq = BufReader::new(File::open(tmp_sq_dr.join(&fl_nm)).expect("Failed to open entry within temp. dir."));
let mut bfr = Vec::new();
let _ = sq.read_to_end(&mut bfr);
let _ = otpt_fl_4_sqs.write_all(&bfr);
}
let _ = remove_dir_all(tmp_dr);
}
#[inline]
fn wrt_sq(id: IdStr, sq: &BSq, tmp_dr: &Path) {
let mut tmp_otpt_fl = BufWriter::new(File::create(tmp_dr.join(&(String::from(id) + ".dat"))).expect("Failed to create temp. output file"));
let _ = tmp_otpt_fl.write_all(sq);
let _ = tmp_otpt_fl.write_all(b"\n");
}
#[inline]
fn wrt_sfx_ar(id: IdStr, sfx_ar: &RawSuffixArray, tmp_dr: &Path) {
let mut bfr = sfx_ar.iter().fold(String::new(), |acmltr, sfx_indx| acmltr + &format!("{} ", sfx_indx));
bfr.pop();
let mut tmp_otpt_fl = BufWriter::new(File::create(tmp_dr.join(&(String::from(id) + ".dat"))).expect("Failed to create temp. output file"));
let _ = tmp_otpt_fl.write_all(bfr.as_bytes());
let _ = tmp_otpt_fl.write_all(b"\n");
}
#[inline]
fn wrt_ptrn_hsh_mp(id: IdStr, ptrn_hsh_mp: &PtrnHshMp, tmp_dr: &Path) {
let mut bfr = ptrn_hsh_mp.iter().fold(String::new(), |acmltr_1, ky_vl_pr| {
let mut vc_str = ky_vl_pr.0.iter().fold(String::new(), |acmltr_2, &elm| acmltr_2 + &format!("{} ", elm));
vc_str.pop();
acmltr_1 + &format!("{},{},{}\t", &vc_str, (ky_vl_pr.1).0, (ky_vl_pr.1).1)
});
bfr.pop();
let mut tmp_otpt_fl = BufWriter::new(File::create(tmp_dr.join(&(String::from(id) + ".dat"))).expect("Failed to create temp. output file"));
let _ = tmp_otpt_fl.write_all(bfr.as_bytes());
let _ = tmp_otpt_fl.write_all(b"\n");
}
#[inline]
fn gt_k_mr_hsh_mp(sq: &BSq, sfx_ar: &RawSuffixArray, sq_ln: usize) -> PtrnHshMp {
let mut k_mr_hsh_mp = HashMap::default();
let alphbt = vec![A, T, U, G, C];
let alphbt_ln = alphbt.len();
let mut k_mrs = vec![Vec::new()];
let fl_srch_rng = (0, sq_ln);
for _ in 0 .. MX_K_MR_SZ {
let mut nw_k_mrs = Vec::new();
let ofst = k_mrs[0].len();
for chr in &alphbt {
for k_mr in &k_mrs {
let mut nw_k_mr = k_mr.clone();
nw_k_mr.push(*chr);
let sfx_ar_indx_pr = match k_mr_hsh_mp.get(k_mr) {
Some(&srch_rng) => {
gt_sfx_ar_indx_pr(&nw_k_mr[..], sq, sfx_ar, &srch_rng, ofst, sq_ln)
},
None => {
if k_mr_hsh_mp.len() < alphbt_ln {
gt_sfx_ar_indx_pr(&nw_k_mr[..], sq, sfx_ar, &fl_srch_rng, ofst, sq_ln)
} else {
(0, 0)
}
},
};
if sfx_ar_indx_pr.1 - sfx_ar_indx_pr.0 > 0 {
nw_k_mrs.push(nw_k_mr.clone());
k_mr_hsh_mp.insert(nw_k_mr, sfx_ar_indx_pr);
}
}
}
if nw_k_mrs.len() == 0 {
break;
}
k_mrs = nw_k_mrs;
}
k_mr_hsh_mp
}