extern crate neofold;
extern crate getopts;
extern crate scoped_threadpool;
extern crate bio;
extern crate itertools;
extern crate num_cpus;
use neofold::*;
use getopts::Options;
use self::scoped_threadpool::Pool;
use std::env;
use std::path::Path;
use bio::io::fasta::Reader;
use std::io::prelude::*;
use std::io::{BufReader, BufWriter};
use std::fs::File;
use itertools::multizip;
type Arg = String;
type NumOfThreads = u32;
type FastaId = String;
type FastaRecord = (FastaId, Seq, usize);
type FastaRecords = Vec<FastaRecord>;
type Strings = Vec<String>;
const DEFAULT_GAMMA: Prob = 1.;
const VERSION: &'static str = "0.1.0";
fn main() {
let args = env::args().collect::<Vec<Arg>>();
let program_name = args[0].clone();
let mut opts = Options::new();
opts.reqopt("f", "input_fasta_file_path", "The path to an input FASTA file containing RNA sequences", "STR");
opts.reqopt("p", "input_base_pair_prob_matrix_file_path", "The path to an input file containing base-pairing probability matrices", "STR");
opts.reqopt("q", "input_unpair_prob_matrix_file_path", "The path to an input file containing unpairing probability matrices", "STR");
opts.reqopt("o", "output_file_path", "The path to an output file which will contain estimated secondary structures", "STR");
opts.optopt("", "gamma", &format!("An MEA gamma (Uses {} by default)", DEFAULT_GAMMA), "FLOAT");
opts.optopt("t", "num_of_threads", "The number of threads in multithreading (Uses the number of all the threads of this computer by default)", "UINT");
opts.optflag("h", "help", "Print a help menu");
let matches = match opts.parse(&args[1 ..]) {
Ok(opt) => {opt}
Err(failure) => {print_program_usage(&program_name, &opts); panic!(failure.to_string())}
};
if matches.opt_present("h") {
print_program_usage(&program_name, &opts);
return;
}
let input_fasta_file_path = matches.opt_str("f").expect("Failed to get the path to an input FASTA file containing RNA sequences from command arguments.");
let input_fasta_file_path = Path::new(&input_fasta_file_path);
let input_bpp_mat_file_path = matches.opt_str("p").expect("Failed to get the path to an input file containing base-pairing probability matrices from command arguments.");
let input_bpp_mat_file_path = Path::new(&input_bpp_mat_file_path);
let input_upp_mat_file_path = matches.opt_str("q").expect("Failed to get the path to an input file containing unpairing probability matrices from command arguments.");
let input_upp_mat_file_path = Path::new(&input_upp_mat_file_path);
let output_file_path = matches.opt_str("o").expect("Failed to get the path to an output file which will contain estimated secondary structures from command arguments.");
let output_file_path = Path::new(&output_file_path);
let gamma = if matches.opt_present("gamma") {
matches.opt_str("gamma").expect("Failed to get an MEA gamma from command arguments.").parse().expect("Failed to parse an MEA gamma.")
} else {
DEFAULT_GAMMA
};
let num_of_threads = if matches.opt_present("t") {
matches.opt_str("t").expect("Failed to get the number of threads in multithreading from command arguments.").parse().expect("Failed to parse the number of threads in multithreading.")
} else {
num_cpus::get() as NumOfThreads
};
let fasta_file_reader = Reader::from_file(Path::new(&input_fasta_file_path)).expect("Failed to set a FASTA file reader.");
let mut fasta_records = FastaRecords::new();
for fasta_record in fasta_file_reader.records() {
let fasta_record = fasta_record.expect("Failed to read a FASTA record.");
let seq = unsafe {from_utf8_unchecked(fasta_record.seq()).to_uppercase().as_bytes().iter().filter(|&&base| {is_rna_base(base)}).map(|&base| {base}).collect::<Seq>()};
let seq_len = seq.len();
fasta_records.push((String::from(fasta_record.id()), seq, seq_len));
}
let num_of_fasta_records = fasta_records.len();
let mut bpp_mats = vec![SparseProbMat::default(); num_of_fasta_records];
let mut reader_2_input_bpp_mat_file = BufReader::new(File::open(input_bpp_mat_file_path).expect("Failed to read an input file."));
let mut buf_4_reader_2_input_bpp_mat_file = Vec::new();
for _ in 0 .. 2 {
let _ = reader_2_input_bpp_mat_file.read_until(b'>', &mut buf_4_reader_2_input_bpp_mat_file);
}
for (i, vec) in reader_2_input_bpp_mat_file.split(b'>').enumerate() {
if i == num_of_fasta_records {continue;}
let vec = vec.expect("Failed to read an input file.");
let substrings = unsafe {String::from_utf8_unchecked(vec).split_whitespace().map(|string| {String::from(string)}).collect::<Strings>()};
let rna_id = substrings[0].parse::<RnaId>().expect("Failed to parse an RNA ID.");
let seq_len = fasta_records[rna_id].2;
bpp_mats[rna_id].insert((0, seq_len + 1), 1.);
for subsubstring in &substrings[1 ..] {
let subsubsubstrings = subsubstring.split(",").collect::<Vec<&str>>();
bpp_mats[rna_id].insert((
subsubsubstrings[0].parse::<Pos>().expect("Failed to parse an index.") + 1,
subsubsubstrings[1].parse::<Pos>().expect("Failed to parse an index.") + 1),
subsubsubstrings[2].parse().expect("Failed to parse a base-pairing probability."),
);
}
}
let mut upp_mats = vec![Probs::new(); num_of_fasta_records];
let mut reader_2_input_upp_mat_file = BufReader::new(File::open(input_upp_mat_file_path).expect("Failed to read an input file."));
let mut buf_4_reader_2_input_upp_mat_file = Vec::new();
for _ in 0 .. 2 {
let _ = reader_2_input_upp_mat_file.read_until(b'>', &mut buf_4_reader_2_input_upp_mat_file);
}
for (i, vec) in reader_2_input_upp_mat_file.split(b'>').enumerate() {
if i == num_of_fasta_records {continue;}
let vec = vec.expect("Failed to read an input file.");
let substrings = unsafe {String::from_utf8_unchecked(vec).split_whitespace().map(|string| {String::from(string)}).collect::<Strings>()};
let rna_id = substrings[0].parse::<RnaId>().expect("Failed to parse an RNA ID.");
for subsubstring in &substrings[1 ..] {
let subsubsubstrings = subsubstring.split(",").collect::<Vec<&str>>();
upp_mats[rna_id].push(subsubsubstrings[1].parse().expect("Failed to parse a base-pairing probability."));
}
upp_mats[rna_id].insert(0, 0.);
upp_mats[rna_id].push(0.);
}
let mut mea_sss = vec![MeaSs::new(); num_of_fasta_records];
let mut thread_pool = Pool::new(num_of_threads);
thread_pool.scoped(|scope| {
for (mea_ss, bpp_mat, upp_mat, fasta_record) in multizip((mea_sss.iter_mut(), bpp_mats.iter(), upp_mats.iter(), fasta_records.iter())) {
scope.execute(move || {
*mea_ss = neofold(bpp_mat, upp_mat, fasta_record.2, gamma);
});
}
});
let mut writer_2_output_file = BufWriter::new(File::create(output_file_path).expect("Failed to create an output file."));
let mut buf_4_writer_2_output_file = format!("; The version {} of the NeoFold program.\n; The path to the input FASTA file for computing the secondary structures (= SSs) in this file = \"{}\".\n; The path to the input base-pairing probability matrix file for computing these structures = \"{}\".\n; The path to the input unpairing probability matrix file for computing these structures = \"{}\".\n; The values of the parameters used for computing these structures are as follows.\n; \"gamma\" = {}, \"num_of_threads\" = {}.\n; Each row beginning with \">\" is with a pair of the ID of an RNA sequence and expected accuracy of the maximum-expected-accuracy SS computed from this sequence. The row next to this row is with this SS.", VERSION, input_fasta_file_path.display(), input_bpp_mat_file_path.display(), input_upp_mat_file_path.display(), gamma, num_of_threads);
for (rna_id, mea_ss) in mea_sss.iter().enumerate() {
let buf_4_rna_id = format!("\n\n>{},{}\n", rna_id, mea_ss.ea) + &unsafe {String::from_utf8_unchecked(get_mea_ss_str(mea_ss, fasta_records[rna_id].2))};
buf_4_writer_2_output_file.push_str(&buf_4_rna_id);
}
let _ = writer_2_output_file.write_all(buf_4_writer_2_output_file.as_bytes());
}
#[inline]
fn print_program_usage(program_name: &str, opts: &Options) {
let program_usage = format!("The usage of this program: {} [options]", program_name);
print!("{}", opts.usage(&program_usage));
}
#[inline]
fn get_mea_ss_str(mea_ss: &MeaSs, seq_len: usize) -> MeaSsStr {
let mut mea_ss_str = vec![UNPAIRING_BASE; seq_len];
let pseudo_pos_pair = (0, seq_len + 1);
let mut pos_pair_stack = vec![pseudo_pos_pair];
while pos_pair_stack.len() > 0 {
let pos_pair = pos_pair_stack.pop().expect("Failed to pop an element of a vector.");
let (i, j) = pos_pair;
if pos_pair != pseudo_pos_pair {
mea_ss_str[i - 1] = BASE_PAIRING_LEFT_BASE;
mea_ss_str[j - 1] = BASE_PAIRING_RIGHT_BASE;
}
match mea_ss.bp_pos_pair_seqs_inside_pos_pairs.get(&pos_pair) {
Some(pos_pairs) => {
for pos_pair in pos_pairs {
pos_pair_stack.push(*pos_pair);
}
}, None => {},
}
}
mea_ss_str
}