extern crate log;
use std::sync::Arc;
use triple_accel::{levenshtein_exp,levenshtein_search,Match};
use crate::bv_bwt::BitVectorBWT;
use crate::stats_util;
use crate::string_util;
const VALID_CHARS: [u8; 4] = [1, 2, 3, 5];
const VALID_CHARS_LEN: usize = VALID_CHARS.len();
pub struct CorrectionParameters {
pub kmer_sizes: Vec<usize>,
pub min_count: u64,
pub max_branch_attempt_length: usize,
pub branch_limit_factor: f64,
pub branch_buffer_factor: f64,
pub midpoint_ed_factor: f64,
pub tail_buffer_factor: f64,
pub frac: f64,
pub verbose: bool
}
pub struct Correction {
start_pos: usize,
end_pos: usize,
seq: Vec<u8>
}
#[derive(Clone,Debug)]
pub struct LongReadFA {
pub read_index: u64,
pub label: String,
pub seq: String
}
#[derive(Clone,Debug)]
pub struct CorrectionResults {
pub read_index: u64,
pub label: String,
pub original_seq: String,
pub corrected_seq: String,
pub avg_before: f64,
pub avg_after: f64
}
pub fn correction_job(arc_bwt: Arc<BitVectorBWT>, long_read: LongReadFA, arc_params: Arc<CorrectionParameters>) -> CorrectionResults {
let bwt: &BitVectorBWT = &*arc_bwt;
let params: &CorrectionParameters = &*arc_params;
let mut seq_i: Vec<u8> = string_util::convert_stoi(&long_read.seq);
for k in params.kmer_sizes.iter() {
seq_i = correction_pass(bwt, &seq_i, params, *k);
}
let corrected_seq: String = string_util::convert_itos(&seq_i);
let avg_before: f64 = if params.verbose {
let orig_seq: Vec<u8> = string_util::convert_stoi(&long_read.seq);
let counts_before: Vec<u64> = bwt.count_pileup(&orig_seq, params.kmer_sizes[0]);
let sum_before: u64 = counts_before.iter().sum();
sum_before as f64 / counts_before.len() as f64
} else {
0.0
};
let avg_after: f64 = if params.verbose {
let counts_after: Vec<u64> = bwt.count_pileup(&seq_i, params.kmer_sizes[0]);
let sum_after: u64 = counts_after.iter().sum();
sum_after as f64 / counts_after.len() as f64
} else {
0.0
};
CorrectionResults {
read_index: long_read.read_index,
label: long_read.label,
original_seq: long_read.seq,
corrected_seq,
avg_before,
avg_after
}
}
pub fn correction_pass(bwt: &BitVectorBWT, seq_i: &[u8], params: &CorrectionParameters, kmer_size: usize) -> Vec<u8> {
let pileup: Vec<u64> = bwt.count_pileup(seq_i, kmer_size);
let nz_med = stats_util::calculate_bounded_median(&pileup, params.min_count);
if nz_med < params.min_count {
return seq_i.to_owned();
}
let pileup_size: usize = pileup.len();
let branch_limit = params.branch_limit_factor as usize*kmer_size;
let threshold: u64 = std::cmp::max((params.frac * nz_med as f64) as u64, params.min_count);
let mut prev_found: isize = -1;
let mut seed_kmer: Vec<u8> = vec![0; kmer_size];
let mut target_kmer: Vec<u8> = vec![0; kmer_size];
let mut max_branch_length: usize;
let mut bridge_points: Vec<Vec<u8>>;
let mut corrections_list: Vec<Correction> = Vec::<Correction>::new();
let mut new_corr: Correction;
let mut x: usize = 0;
while x < pileup_size {
if pileup[x] < threshold {
prev_found = x as isize-1;
while x < pileup_size && pileup[x] < threshold {
x += 1;
}
if prev_found == -1 && x < pileup_size {
max_branch_length = (params.tail_buffer_factor*(x+kmer_size) as f64) as usize;
if max_branch_length < params.max_branch_attempt_length {
seed_kmer[..].clone_from_slice(&seq_i[x..x+kmer_size]);
seed_kmer = string_util::reverse_complement_i(&seed_kmer);
bridge_points = assemble_from_kmer(bwt, &seed_kmer, threshold, branch_limit, max_branch_length);
let orig: Vec<u8> = string_util::reverse_complement_i(&seq_i[0..x+kmer_size]);
let best_match: Option<Vec<u8>> = pick_best_levenshtein_search(&orig, bridge_points, bwt, kmer_size, 0xFFFFFFFF);
if let Some(bm) = best_match {
let rev_comp_seq: Vec<u8> = string_util::reverse_complement_i(&bm);
new_corr = Correction {
start_pos: 0,
end_pos: x+kmer_size,
seq: rev_comp_seq
};
corrections_list.push(new_corr);
}
}
}
else if prev_found >= 0 && x < pileup_size {
seed_kmer[..].clone_from_slice(&seq_i[prev_found as usize..prev_found as usize+kmer_size]);
target_kmer[..].clone_from_slice(&seq_i[x..x+kmer_size]);
max_branch_length = (params.branch_buffer_factor*(x-prev_found as usize+kmer_size) as f64) as usize;
if max_branch_length < params.max_branch_attempt_length {
bridge_points = bridge_kmers(bwt, &seed_kmer, &target_kmer, threshold, branch_limit, max_branch_length);
if bridge_points.is_empty() {
bridge_points = bridge_kmers(
bwt,
&string_util::reverse_complement_i(&target_kmer),
&string_util::reverse_complement_i(&seed_kmer),
threshold,
branch_limit,
max_branch_length
);
for bp in &mut bridge_points {
*bp = string_util::reverse_complement_i(&bp);
}
}
let best_match: Option<Vec<u8>> = pick_best_levenshtein(&seq_i[prev_found as usize..x+kmer_size], bridge_points, bwt, kmer_size);
if let Some(bm) = best_match {
new_corr = Correction {
start_pos: prev_found as usize,
end_pos: x+kmer_size,
seq: bm
};
corrections_list.push(new_corr);
} else if x - prev_found as usize > kmer_size {
let mid_point: usize = (prev_found as usize+x+kmer_size) / 2;
max_branch_length = (params.tail_buffer_factor*(mid_point - prev_found as usize) as f64) as usize;
let max_edit_distance: u32 = ((mid_point - prev_found as usize) as f64 * params.midpoint_ed_factor) as u32;
bridge_points = assemble_from_kmer(bwt, &seed_kmer, threshold, branch_limit, max_branch_length);
let best_match: Option<Vec<u8>> = pick_best_levenshtein_search(&seq_i[prev_found as usize..mid_point], bridge_points, bwt, kmer_size, max_edit_distance);
if let Some(bm) = best_match {
new_corr = Correction {
start_pos: prev_found as usize,
end_pos: mid_point,
seq: bm
};
corrections_list.push(new_corr);
}
let rev_target = string_util::reverse_complement_i(&target_kmer);
bridge_points = assemble_from_kmer(bwt, &rev_target, threshold, branch_limit, max_branch_length);
let orig: Vec<u8> = string_util::reverse_complement_i(&seq_i[mid_point..x+kmer_size]);
let best_match: Option<Vec<u8>> = pick_best_levenshtein_search(&orig, bridge_points, bwt, kmer_size, max_edit_distance);
if let Some(bm) = best_match {
let rev_comp_seq: Vec<u8> = string_util::reverse_complement_i(&bm);
new_corr = Correction {
start_pos: mid_point,
end_pos: x+kmer_size,
seq: rev_comp_seq
};
corrections_list.push(new_corr);
}
}
}
}
}
else {
x += 1;
}
}
if prev_found >= 0 && pileup[pileup_size-1] < threshold {
max_branch_length = (params.tail_buffer_factor*(pileup_size-1-prev_found as usize+kmer_size) as f64) as usize;
if max_branch_length <= params.max_branch_attempt_length {
seed_kmer[..].clone_from_slice(&seq_i[prev_found as usize..prev_found as usize+kmer_size]);
bridge_points = assemble_from_kmer(bwt, &seed_kmer, threshold, branch_limit, max_branch_length);
let best_match: Option<Vec<u8>> = pick_best_levenshtein_search(&seq_i[prev_found as usize..], bridge_points, bwt, kmer_size, 0xFFFFFFFF);
if let Some(bm) = best_match {
new_corr = Correction {
start_pos: prev_found as usize,
end_pos: seq_i.len(),
seq: bm
};
corrections_list.push(new_corr);
}
}
}
let mut current_position: usize = 0;
let mut corrected_seq: Vec<u8> = Vec::<u8>::new();
for correction in corrections_list {
if current_position > correction.start_pos {
corrected_seq.truncate(corrected_seq.len() - (current_position - correction.start_pos));
}
else {
corrected_seq.extend_from_slice(&seq_i[current_position..correction.start_pos]);
}
corrected_seq.extend_from_slice(&correction.seq);
current_position = correction.end_pos as usize;
}
corrected_seq.extend_from_slice(&seq_i[current_position..]);
corrected_seq
}
#[inline]
fn pick_best_levenshtein_search(original: &[u8], candidates: Vec<Vec<u8>>, bwt: &BitVectorBWT, kmer_size: usize, max_ed: u32) -> Option<Vec<u8>> {
let mut ed_scores: Vec<Option<Match>> = Vec::<Option<Match>>::with_capacity(candidates.len());
let mut min_score: u32 = max_ed;
for candidate in candidates.iter() {
let matches: Vec<Match> = levenshtein_search(&original, &candidate).collect();
let mut best_match: Option<Match> = None;
for m in matches {
if m.start == 0 && m.end >= kmer_size {
match &best_match {
Some(bm) => {
if m.k < bm.k || (m.k == bm.k && m.end < bm.end) {
best_match = Some(m);
}
},
None => {
best_match = Some(m);
}
}
}
}
match &best_match {
Some(bm) => {
if bm.k < min_score {
min_score = bm.k;
}
},
None => {}
}
ed_scores.push(best_match);
}
let mut candidates_ed: Vec<&[u8]> = Vec::<&[u8]>::with_capacity(candidates.len());
for y in 0..candidates.len() {
match &ed_scores[y] {
Some(eds) => {
if eds.k == min_score {
candidates_ed.push(&candidates[y][0..eds.end]);
}
},
None => {}
}
}
if candidates_ed.is_empty() {
None
}
else if candidates_ed.len() == 1 {
Some(candidates_ed[0].to_vec())
}
else {
let mut max_counts: u64 = 0;
let mut max_id: usize = 0;
let mut ed_pu: Vec<u64>;
let mut summation: u64;
for (y, candidate) in candidates_ed.iter().enumerate() {
ed_pu = bwt.count_pileup(candidate, kmer_size);
summation = ed_pu.iter().sum();
if summation > max_counts {
max_id = y;
max_counts = summation;
}
}
Some(candidates_ed[max_id].to_vec())
}
}
#[inline]
fn pick_best_levenshtein(original: &[u8], candidates: Vec<Vec<u8>>, bwt: &BitVectorBWT, kmer_size: usize) -> Option<Vec<u8>> {
if candidates.is_empty() {
None
}
else if candidates.len() == 1 {
Some(candidates[0].clone())
}
else {
let ed_scores: Vec<u32> = candidates.iter()
.map(|candidate| levenshtein_exp(&original, &candidate))
.collect::<Vec<u32>>();
let min_score: u32 = *ed_scores.iter().min().unwrap();
let mut candidates_ed: Vec<&Vec<u8>> = Vec::<&Vec<u8>>::with_capacity(candidates.len());
for y in 0..candidates.len() {
if ed_scores[y] == min_score {
candidates_ed.push(&candidates[y]);
}
}
if candidates_ed.len() == 1 {
Some(candidates_ed[0].clone())
}
else {
let mut max_counts: u64 = 0;
let mut max_id: usize = 0;
let mut ed_pu: Vec<u64>;
let mut summation: u64;
for (y, candidate) in candidates_ed.iter().enumerate() {
ed_pu = bwt.count_pileup(candidate, kmer_size);
summation = ed_pu.iter().sum();
if summation > max_counts {
max_id = y;
max_counts = summation;
}
}
Some(candidates_ed[max_id].clone())
}
}
}
pub fn bridge_kmers(
bwt: &BitVectorBWT, seed_kmer: &[u8], target_kmer: &[u8], min_count: u64, branch_limit: usize,
max_branch_len: usize
) -> Vec<Vec<u8>> {
let mut ret = Vec::<Vec<u8>>::new();
let kmer_len = seed_kmer.len();
assert_eq!(kmer_len, target_kmer.len());
let mut counts: [u64; VALID_CHARS_LEN] = [0; VALID_CHARS_LEN];
let mut fw_counts: [u64; VALID_CHARS_LEN] = [0; VALID_CHARS_LEN];
let mut rev_counts: [u64; VALID_CHARS_LEN] = [0; VALID_CHARS_LEN];
let mut num_branched: usize = 0;
let mut max_pos: usize;
let mut curr_buffer: Vec<u8> = vec![4; max_branch_len];
let mut rev_buffer: Vec<u8> = vec![4; max_branch_len];
let mut possible_bridges: Vec<Vec<u8>> = Vec::<Vec<u8>>::new();
let mut seed_vec: Vec<u8> = vec![0; kmer_len];
seed_vec.clone_from_slice(seed_kmer);
possible_bridges.push(seed_vec);
while !possible_bridges.is_empty() && num_branched < branch_limit {
let mut curr_bridge: Vec<u8> = possible_bridges.pop().unwrap();
let mut curr_bridge_len = curr_bridge.len();
let mut curr_offset: usize = 0;
num_branched += 1;
for x in 0..kmer_len {
curr_buffer[x] = curr_bridge[curr_bridge_len-kmer_len+x];
rev_buffer[x] = string_util::COMPLEMENT_INT[curr_buffer[x] as usize];
}
while curr_bridge_len < max_branch_len {
curr_offset += 1;
bwt.prefix_revkmer_noalloc_fixed(&rev_buffer[curr_offset..curr_offset+kmer_len-1], &mut rev_counts);
bwt.postfix_kmer_noalloc_fixed(&curr_buffer[curr_offset..curr_offset+kmer_len-1], &mut fw_counts);
max_pos=0;
for x in 0..VALID_CHARS_LEN {
counts[x] = fw_counts[x] + rev_counts[x];
if counts[x] > counts[max_pos] {
max_pos = x;
}
}
if counts[max_pos] < min_count {
break;
}
curr_bridge.push(4);
for x in 0..VALID_CHARS_LEN {
if x != max_pos && counts[x] >= min_count {
curr_bridge[curr_bridge_len] = VALID_CHARS[x];
possible_bridges.push(curr_bridge.clone());
}
}
if possible_bridges.len() >= branch_limit {
return Vec::<Vec<u8>>::new();
}
curr_bridge[curr_bridge_len] = VALID_CHARS[max_pos];
curr_bridge_len += 1;
curr_buffer[curr_offset+kmer_len-1] = VALID_CHARS[max_pos];
rev_buffer[curr_offset+kmer_len-1] = string_util::COMPLEMENT_INT[VALID_CHARS[max_pos] as usize];
if &curr_buffer[curr_offset..curr_offset+kmer_len] == target_kmer {
ret.push(curr_bridge.clone());
if ret.len() >= branch_limit {
return Vec::<Vec<u8>>::new();
}
}
}
}
if num_branched < branch_limit {
ret
}
else {
Vec::<Vec<u8>>::new()
}
}
pub fn assemble_from_kmer(
bwt: &BitVectorBWT, seed_kmer: &[u8], min_count: u64, branch_limit: usize, max_branch_len: usize
) -> Vec<Vec<u8>> {
let mut ret = Vec::<Vec<u8>>::new();
let kmer_len = seed_kmer.len();
let mut counts: [u64; VALID_CHARS_LEN] = [0; VALID_CHARS_LEN];
let mut fw_counts: [u64; VALID_CHARS_LEN] = [0; VALID_CHARS_LEN];
let mut rev_counts: [u64; VALID_CHARS_LEN] = [0; VALID_CHARS_LEN];
let mut num_branched: usize = 0;
let mut max_pos: usize;
let mut curr_buffer: Vec<u8> = vec![4; max_branch_len];
let mut rev_buffer: Vec<u8> = vec![4; max_branch_len];
let mut possible_bridges: Vec<Vec<u8>> = Vec::<Vec<u8>>::new();
let mut seed_vec: Vec<u8> = vec![0; kmer_len];
seed_vec.clone_from_slice(seed_kmer);
possible_bridges.push(seed_vec);
while !possible_bridges.is_empty() && num_branched < branch_limit {
let mut curr_bridge: Vec<u8> = possible_bridges.pop().unwrap();
let mut curr_bridge_len = curr_bridge.len();
let mut curr_offset: usize = 0;
num_branched += 1;
for x in 0..kmer_len {
curr_buffer[x] = curr_bridge[curr_bridge_len-kmer_len+x];
rev_buffer[x] = string_util::COMPLEMENT_INT[curr_buffer[x] as usize];
}
while curr_bridge_len < max_branch_len {
curr_offset += 1;
bwt.prefix_revkmer_noalloc_fixed(&rev_buffer[curr_offset..curr_offset+kmer_len-1], &mut rev_counts);
bwt.postfix_kmer_noalloc_fixed(&curr_buffer[curr_offset..curr_offset+kmer_len-1], &mut fw_counts);
max_pos=0;
for x in 0..VALID_CHARS_LEN {
counts[x] = fw_counts[x] + rev_counts[x];
if counts[x] > counts[max_pos] {
max_pos = x;
}
}
if counts[max_pos] < min_count {
break;
}
curr_bridge.push(4);
for x in 0..VALID_CHARS_LEN {
if x != max_pos && counts[x] >= min_count {
curr_bridge[curr_bridge_len] = VALID_CHARS[x];
possible_bridges.push(curr_bridge.clone());
}
}
if possible_bridges.len() >= branch_limit {
return Vec::<Vec<u8>>::new();
}
curr_bridge[curr_bridge_len] = VALID_CHARS[max_pos];
curr_bridge_len += 1;
curr_buffer[curr_offset+kmer_len-1] = VALID_CHARS[max_pos];
rev_buffer[curr_offset+kmer_len-1] = string_util::COMPLEMENT_INT[VALID_CHARS[max_pos] as usize];
}
if curr_bridge_len == max_branch_len {
ret.push(curr_bridge);
if ret.len() >= branch_limit {
return Vec::<Vec<u8>>::new();
}
}
}
if num_branched < branch_limit {
ret
}
else {
Vec::<Vec<u8>>::new()
}
}
#[cfg(test)]
mod tests {
use super::*;
use std::io::Cursor;
use crate::bwt_converter::convert_to_vec;
use crate::ropebwt2_util::create_bwt_from_strings;
use crate::string_util::{convert_stoi, reverse_complement_i};
fn get_constant_bwt() -> BitVectorBWT {
let const_string = "AACGGATCAAGCTTACCAGTATTTACGT";
let rep_count = 30;
let mut data: Vec<&str> = vec![];
for _i in 0..rep_count {
data.push(&const_string);
}
let bwt_string = create_bwt_from_strings(&data).unwrap();
let bwt_cursor = Cursor::new(bwt_string);
let vec = convert_to_vec(bwt_cursor);
let mut bv_bwt: BitVectorBWT = BitVectorBWT::new();
bv_bwt.load_vector(vec);
bv_bwt
}
fn get_diploid_bwt() -> BitVectorBWT {
let const_string = "AACGGATCAAGCTTACCAGTATTTACGT";
let rep_count = 30;
let mut data: Vec<&str> = vec![];
for _i in 0..rep_count {
data.push(&const_string);
}
let lesser_string = "AACTGATCAAGCTGACCAGTATTTACTT";
let lesser_string = string_util::convert_itos(
&string_util::reverse_complement_i(
&string_util::convert_stoi(&lesser_string)
)
);
let lesser_count = 20;
for _i in 0..lesser_count {
data.push(&lesser_string);
}
let bwt_string = create_bwt_from_strings(&data).unwrap();
let bwt_cursor = Cursor::new(bwt_string);
let vec = convert_to_vec(bwt_cursor);
let mut bv_bwt: BitVectorBWT = BitVectorBWT::new();
bv_bwt.load_vector(vec);
bv_bwt
}
fn get_expansion_bwt() -> BitVectorBWT {
let const_string = "AACGGATACACACACACACACTTTACGT";
let rep_count = 30;
let mut data: Vec<&str> = vec![];
for _i in 0..rep_count {
data.push(&const_string);
}
let bwt_string = create_bwt_from_strings(&data).unwrap();
let bwt_cursor = Cursor::new(bwt_string);
let vec = convert_to_vec(bwt_cursor);
let mut bv_bwt: BitVectorBWT = BitVectorBWT::new();
bv_bwt.load_vector(vec);
bv_bwt
}
#[test]
fn test_bridge() {
let bwt = get_constant_bwt();
let query = convert_stoi(&"AACGGATCAAGCTTACCAGTATTTACGT");
assert_eq!(bwt.count_kmer(&query), 30);
let seed = convert_stoi(&"AACGGAT");
let target = convert_stoi(&"TTTACGT");
let min_count = 15;
let branch_lim = 20;
let max_branch_len = 40;
let bridges = bridge_kmers(&bwt, &seed, &target, min_count, branch_lim, max_branch_len);
assert_eq!(bridges.len(), 1);
assert_eq!(bridges[0], query);
let rev_seed = reverse_complement_i(&target);
let rev_target = reverse_complement_i(&seed);
let rev_bridges = bridge_kmers(&bwt, &rev_seed, &rev_target, min_count, branch_lim, max_branch_len);
assert_eq!(rev_bridges.len(), 1);
assert_eq!(rev_bridges[0], reverse_complement_i(&query));
}
#[test]
fn test_expansion_bridge() {
let bwt = get_expansion_bwt();
let query = convert_stoi(&"AACGGATACACACACACACACTTTACGT");
assert_eq!(bwt.count_kmer(&query), 30);
let seed = convert_stoi(&"AACGGAT");
let target = convert_stoi(&"TTTACGT");
let min_count = 15;
let branch_lim = 20;
let max_branch_len = query.len();
let bridges = bridge_kmers(&bwt, &seed, &target, min_count, branch_lim, max_branch_len);
assert_eq!(bridges.len(), 5);
for bridge in bridges.iter() {
assert_eq!(bridge[0..seed.len()], seed[..]);
assert_eq!(bridge[bridge.len()-target.len()..bridge.len()], target[..]);
}
}
#[test]
fn test_assemble() {
let bwt = get_constant_bwt();
let query = convert_stoi(&"AACGGATCAAGCTTACCAGTATTTACGT");
assert_eq!(bwt.count_kmer(&query), 30);
let seed = convert_stoi(&"AACGGAT");
let min_count = 15;
let branch_lim = 20;
let max_branch_len = query.len();
let bridges = assemble_from_kmer(&bwt, &seed, min_count, branch_lim, max_branch_len);
assert_eq!(bridges.len(), 1);
assert_eq!(bridges[0], query);
}
#[test]
fn test_triple_accel() {
let query = convert_stoi(&"AACGGATCAAGCTTACCAGTATTTACGT");
let short_seq = convert_stoi(&"AACGGATGAA");
let long_seq = convert_stoi(&"AACGGATCATAGCTTACCAGTATATACGT");
let l_dist = levenshtein_exp(&long_seq, &query);
assert_eq!(l_dist, 2);
let matches: Vec<Match> = levenshtein_search(&short_seq, &query).collect();
assert_eq!(matches.len(), 1);
assert_eq!(matches[0], Match{start: 0, end: 10, k: 1})
}
#[test]
fn test_correction_pass() {
let bwt = get_constant_bwt();
let query = convert_stoi(&"AACGGATCAAGCTTACCAGTATTTACGT");
assert_eq!(bwt.count_kmer(&query), 30);
let params = CorrectionParameters {
kmer_sizes: vec![9],
min_count: 5,
max_branch_attempt_length: 10000,
branch_limit_factor: 4.0,
branch_buffer_factor: 1.3,
midpoint_ed_factor: 0.4,
tail_buffer_factor: 1.00,
frac: 0.1,
verbose: true
};
let change_head_seq = convert_stoi(&"AATGGATCAAGCTTACCAGTATTTACGT");
let corrected = correction_pass(&bwt, &change_head_seq, ¶ms, 9);
assert_eq!(corrected, query);
let change_head_seq = convert_stoi(&"TGGATCAAGCTTACCAGTATTTACGT");
let corrected = correction_pass(&bwt, &change_head_seq, ¶ms, 9);
assert_eq!(corrected[..], query[2..]);
let change_mid_seq = convert_stoi(&"AACGGATCAAGCTTCCAGTATTTACGT");
let corrected = correction_pass(&bwt, &change_mid_seq, ¶ms, 9);
assert_eq!(corrected, query);
let change_tail_seq = convert_stoi(&"AACGGATCAAGCTTACCAGTATTTGCGT");
let corrected = correction_pass(&bwt, &change_tail_seq, ¶ms, 9);
assert_eq!(corrected, query);
let change_tail_seq = convert_stoi(&"AACGGATCAAGCTTACCAGTATTTG");
let corrected = correction_pass(&bwt, &change_tail_seq, ¶ms, 9);
assert_eq!(corrected[..], query[..query.len()-3]);
}
#[test]
fn test_diploid_correction_pass() {
let bwt = get_diploid_bwt();
let const_string = convert_stoi(&"AACGGATCAAGCTTACCAGTATTTACGT");
let lesser_string = convert_stoi(&"AACTGATCAAGCTGACCAGTATTTACTT");
let params = CorrectionParameters {
kmer_sizes: vec![9],
min_count: 5,
max_branch_attempt_length: 10000,
branch_limit_factor: 4.0,
branch_buffer_factor: 1.3,
midpoint_ed_factor: 0.4,
tail_buffer_factor: 1.00,
frac: 0.1,
verbose: true
};
let change_head_seq = convert_stoi(&"ATCGGATCAAGCT");
let corrected = correction_pass(&bwt, &change_head_seq, ¶ms, 9);
assert_eq!(&corrected[..], &const_string[0..corrected.len()]);
let change_head_seq = convert_stoi(&"AAGTGATCAAGCT");
let corrected = correction_pass(&bwt, &change_head_seq, ¶ms, 9);
assert_eq!(&corrected[..], &lesser_string[0..corrected.len()]);
let change_head_seq = convert_stoi(&"AACAGATCAAGCT");
let corrected = correction_pass(&bwt, &change_head_seq, ¶ms, 9);
assert_eq!(&corrected[..], &const_string[0..corrected.len()]);
let change_mid_seq = convert_stoi(&"AACGGATCAAGCTTAGCAGTATTTACGT");
let corrected = correction_pass(&bwt, &change_mid_seq, ¶ms, 9);
assert_eq!(&corrected, &const_string);
let change_mid_seq = convert_stoi(&"AACTGATCAAGCTGAGCAGTATTTACTT");
let corrected = correction_pass(&bwt, &change_mid_seq, ¶ms, 9);
assert_eq!(&corrected, &lesser_string);
let change_mid_seq = convert_stoi(&"GATCAAGCTAACCAGTATT");
let corrected = correction_pass(&bwt, &change_mid_seq, ¶ms, 9);
assert_eq!(&corrected, &convert_stoi(&"GATCAAGCTTACCAGTATT"));
let change_tail_seq = convert_stoi(&"CAGTATTTAGGT");
let corrected = correction_pass(&bwt, &change_tail_seq, ¶ms, 9);
assert_eq!(&corrected[..], &const_string[const_string.len()-corrected.len()..]);
let change_tail_seq = convert_stoi(&"CAGTATTTAGTT");
let corrected = correction_pass(&bwt, &change_tail_seq, ¶ms, 9);
assert_eq!(&corrected[..], &lesser_string[lesser_string.len()-corrected.len()..]);
let change_tail_seq = convert_stoi(&"CAGTATTTACAT");
let corrected = correction_pass(&bwt, &change_tail_seq, ¶ms, 9);
assert_eq!(&corrected[..], &const_string[const_string.len()-corrected.len()..]);
}
#[test]
fn test_correction_job() {
let bwt: BitVectorBWT = get_constant_bwt();
let arc_bwt: Arc<BitVectorBWT> = Arc::new(bwt);
let params = CorrectionParameters {
kmer_sizes: vec![9],
min_count: 5,
max_branch_attempt_length: 10000,
branch_limit_factor: 4.0,
branch_buffer_factor: 1.3,
midpoint_ed_factor: 0.4,
tail_buffer_factor: 1.00,
frac: 0.1,
verbose: true
};
let arc_params: Arc<CorrectionParameters> = Arc::new(params);
let const_string: String = "AACGGATCAAGCTTACCAGTATTTACGT".to_string();
let long_read_const_mod: LongReadFA = LongReadFA {
read_index: 0,
label: "test".to_string(),
seq: "TACGGATCAAGCATACCAGTATGTACGT".to_string()
};
let corr_results = correction_job(arc_bwt, long_read_const_mod.clone(), arc_params);
assert_eq!(corr_results.label, long_read_const_mod.label);
assert_eq!(corr_results.original_seq, long_read_const_mod.seq);
assert_eq!(corr_results.corrected_seq, const_string);
assert_eq!(corr_results.avg_before, 6.0);
assert_eq!(corr_results.avg_after, 30.0);
}
#[test]
fn test_correction_deletion_special() {
let bwt: BitVectorBWT = get_constant_bwt();
let arc_bwt: Arc<BitVectorBWT> = Arc::new(bwt);
let params = CorrectionParameters {
kmer_sizes: vec![9],
min_count: 5,
max_branch_attempt_length: 10000,
branch_limit_factor: 4.0,
branch_buffer_factor: 1.3,
midpoint_ed_factor: 0.4,
tail_buffer_factor: 1.00,
frac: 0.1,
verbose: true
};
let arc_params: Arc<CorrectionParameters> = Arc::new(params);
let const_string: String = "AACGGATCAAGCTTACCAGTATTTACGT".to_string();
let long_read_const_mod: LongReadFA = LongReadFA {
read_index: 10,
label: "test".to_string(),
seq: "AACGGATCAAGCTTTACCAGTATTTACGT".to_string()
};
let corr_results = correction_job(arc_bwt, long_read_const_mod.clone(), arc_params);
assert_eq!(corr_results.read_index, long_read_const_mod.read_index);
assert_eq!(corr_results.label, long_read_const_mod.label);
assert_eq!(corr_results.original_seq, long_read_const_mod.seq);
assert_eq!(corr_results.corrected_seq, const_string);
}
}