extern crate fasten;
extern crate fastq;
extern crate getopts;
extern crate rand;
use std::io::stdin;
use std::collections::HashMap;
use std::f32;
use fasten::fasten_base_options;
use fasten::fasten_base_options_matches;
use fastq::{Parser, Record};
use fasten::logmsg;
const TEN: f32 = 10.0;
const READ_SEPARATOR :char = '~';
fn main(){
let mut opts = fasten_base_options();
let default_phred_min_char:char = '!';
let default_phred_max_char:char = 'I';
let default_phred_min:u8 = default_phred_min_char as u8 - 33;
let default_phred_max:u8 = default_phred_max_char as u8 - 33;
let mut qual_range_string = String::with_capacity((default_phred_max - default_phred_min + 1) as usize);
for phred in default_phred_min..default_phred_max+1 {
qual_range_string.push((phred+33) as char);
}
opts.optopt("","max-qual-char",
format!("Maximum quality character (default: {})", default_phred_max_char).as_str(),
"CHAR");
opts.optopt("","min-qual-char",
format!("Minimum quality character (default: {})", default_phred_min_char).as_str(),
"CHAR");
let description:String = format!("Collapse identical reads into single reads, recalculating quality values. If paired end, then each set of reads must be identical to be collapsed. Warning: due to multiple reads collapsing into one, read identifiers will be reconstituted. NOTE: range of quality scores is {}", qual_range_string);
let matches = fasten_base_options_matches(&description, opts);
let max_qual_char:char = matches.opt_default("max-qual-char", &default_phred_max_char.to_string())
.unwrap_or(String::from(default_phred_max_char))
.parse()
.expect("ERROR converting --max-qual-int value to integer");
let mut min_qual_char:char =
matches.opt_default("min-qual-char", &default_phred_min_char.to_string())
.unwrap_or(String::from(default_phred_min_char))
.parse()
.expect("ERROR converting --min-qual-int value to integer");
if min_qual_char < default_phred_min_char {
logmsg("--min-qual-char was less than the default minimum and so it will be set to the default");
min_qual_char = default_phred_min_char;
}
let min_qual:u8 = min_qual_char as u8;
let max_qual:u8 = max_qual_char as u8;
let paired_end = matches.opt_present("paired-end");
let _num_cpus:usize = {
if matches.opt_present("numcpus") {
logmsg("Warning: This script does not make use of --numcpus");
1 as usize
} else {
1 as usize
}
};
let mut seq_count :HashMap<String, u32> =HashMap::new();
let mut seq_error_rate :HashMap<String, Vec<f32>> = HashMap::new();
let parser = Parser::new(stdin());
let mut parser_getter = parser.ref_iter();
parser_getter.advance().expect("Could not read the first fastq entry");
while let Some(record1) = parser_getter.get() {
let mut id:Vec<u8> = record1.head().to_vec();
let mut seq:Vec<u8> = record1.seq().to_vec();
let mut qual:Vec<u8> = record1.qual().to_vec();
if paired_end {
match parser_getter.advance() {
Ok(_) => {},
Err(err) => {
panic!("ERROR: could not read the second entry in a paired end read: {}", err);
}
};
let record2 = &parser_getter.get().expect("ERROR parsing second pair in a paired end read");
let id2:&[u8] = record2.head();
let seq2:&[u8] = record2.seq();
let qual2:&[u8]= record2.qual();
id.push(READ_SEPARATOR as u8);
seq.push(READ_SEPARATOR as u8);
qual.push(READ_SEPARATOR as u8);
id.extend_from_slice(id2);
seq.extend_from_slice(seq2);
qual.extend_from_slice(qual2);
}
let seq_string:String = String::from(
std::str::from_utf8(&seq[..])
.expect("ERROR converting slice to str")
);
let count = seq_count.entry(seq_string.clone()).or_insert(0);
*count += 1 as u32;
if !seq_error_rate.contains_key(&seq_string) {
let mut qual_vec:Vec<f32> = vec![];
for q in qual {
if q == READ_SEPARATOR as u8 {
qual_vec.push(q as u8 as f32);
continue;
}
let qual_int = q as u8 as f32 - 33.0;
let p :f32 = TEN.powf((-1.0 * qual_int)/TEN);
qual_vec.push(p);
}
seq_error_rate.insert(seq_string.clone(), qual_vec);
}
else {
let qual_vec = seq_error_rate.entry(seq_string.clone()).or_insert(Vec::new());
let these_errors = qual.into_iter().map(|qual_char|{
if qual_char == READ_SEPARATOR as u8 {
return qual_char as u8 as f32;
}
let qual_int = qual_char as u8 as f32 - 33.0;
let p :f32 = TEN.powf((-1.0 * qual_int)/TEN);
return p;
}).collect();
let new_qual = combine_error_vectors(&qual_vec,&these_errors);
*qual_vec = new_qual;
}
match &parser_getter.advance() {
Ok(_) => {},
Err(_) => {break;}
};
}
let mut seq_counter=0;
for (seq,combined_qual) in seq_error_rate {
seq_counter += 1;
let mut qual_cigar = String::new();
for p in combined_qual {
let mut qual_recalc :f32 = -TEN * (p).log(TEN)+33.0;
if qual_recalc.is_infinite() || qual_recalc > max_qual as f32 {
qual_recalc = max_qual as f32;
}
let mut qual_recalc_char = qual_recalc.round() as u8 as char;
if (qual_recalc_char as u8) > max_qual {
qual_recalc_char = max_qual_char;
}
if (qual_recalc_char as u8) < min_qual {
qual_recalc_char = min_qual_char;
}
qual_cigar.push(qual_recalc_char);
}
if paired_end {
let separator_pos = seq.find(READ_SEPARATOR).expect("ERROR finding read separator");
let r1_seq = seq[0..separator_pos].to_string();
let r2_seq = seq[separator_pos+1..].to_string();
let r1_qual= qual_cigar[0..separator_pos].to_string();
let r2_qual= qual_cigar[separator_pos+1..].to_string();
println!("@{}/1\n{}\n+\n{}",seq_counter,r1_seq,r1_qual);
println!("@{}/2\n{}\n+\n{}",seq_counter,r2_seq,r2_qual);
} else {
println!("@{}\n{}\n+\n{}",seq_counter,seq,qual_cigar);
}
}
}
fn combine_error_vectors(errors1 :&Vec<f32>, errors2: &Vec<f32>) -> Vec<f32> {
if errors1.len() != errors2.len() {
panic!("Lengths of error vectors do not match: {} and {}", errors1.len(), errors2.len());
}
let mut errors_iter2=errors2.iter();
let mut new_errors :Vec<f32> = Vec::new(); for p1 in errors1 {
let p2 = errors_iter2.next().expect("ERROR: could not get the error probability from the second read");
new_errors.push(p1 * p2);
}
return new_errors;
}