#[cfg(not(feature = "simd_avx2"))]
fn main() {}
#[cfg(feature = "simd_avx2")]
fn test(file_name: &str, max_size: usize, name: &str, verbose: bool, writer: &mut impl std::io::Write) -> (usize, usize, usize, f64, usize, f64, f64, usize, usize) {
use parasailors::{Matrix, *};
use rust_wfa2::aligner::*;
use bio::alignment::distance::simd::levenshtein;
use block_aligner::percent_len;
use block_aligner::scan_block::*;
use block_aligner::scores::*;
use std::fs::File;
use std::io::{BufRead, BufReader};
let mut wrong = 0usize;
let mut min_size_wrong = 0usize;
let mut wfa_wrong = 0usize;
let mut wrong_avg = 0f64;
let mut count = 0usize;
let mut seq_id_avg = 0f64;
let mut largest_gap_avg = 0f64;
let mut largest_gap_max = 0usize;
let mut largest_gap_max_correct = 0usize;
let reader = BufReader::new(File::open(file_name).unwrap());
let all_lines = reader.lines().collect::<Vec<_>>();
for lines in all_lines.chunks(2) {
let r = lines[0].as_ref().unwrap().to_ascii_uppercase();
let q = lines[1].as_ref().unwrap().to_ascii_uppercase();
let correct_score;
if r.len().max(q.len()) < 15000 {
let matrix = Matrix::create("ACGNT", 2, -4);
let profile = parasailors::Profile::new(q.as_bytes(), &matrix);
let parasail_score = global_alignment_score(&profile, r.as_bytes(), 6, 2);
correct_score = parasail_score;
} else {
let len = 8192;
let r_padded = PaddedBytes::from_bytes::<NucMatrix>(r.as_bytes(), len);
let q_padded = PaddedBytes::from_bytes::<NucMatrix>(q.as_bytes(), len);
let run_gaps = Gaps { open: -6, extend: -2 };
let matrix = NucMatrix::new_simple(2, -4);
let mut block_aligner = Block::<false, false>::new(q.len(), r.len(), len);
block_aligner.align(&q_padded, &r_padded, &matrix, run_gaps, len..=len, 0);
let scan_score = block_aligner.res().score;
correct_score = scan_score;
}
let r_padded = PaddedBytes::from_bytes::<NucMatrix>(r.as_bytes(), max_size);
let q_padded = PaddedBytes::from_bytes::<NucMatrix>(q.as_bytes(), max_size);
let run_gaps = Gaps { open: -6, extend: -2 };
let matrix = NucMatrix::new_simple(2, -4);
let mut block_aligner = Block::<false, false>::new(q.len(), r.len(), max_size);
let max_len = q.len().max(r.len());
block_aligner.align(&q_padded, &r_padded, &matrix, run_gaps, percent_len(max_len, 0.01)..=percent_len(max_len, 0.1), 0);
let scan_score = block_aligner.res().score;
if correct_score != scan_score {
wrong += 1;
wrong_avg += ((correct_score - scan_score) as f64) / (correct_score as f64);
if verbose {
let edit_dist = levenshtein(q.as_bytes(), r.as_bytes());
println!(
"parasail: {}, ours: {}, edit dist: {}\nq (len = {}): {}\nr (len = {}): {}",
correct_score,
scan_score,
edit_dist,
q.len(),
q,
r.len(),
r
);
}
}
block_aligner.align(&q_padded, &r_padded, &matrix, run_gaps, percent_len(max_len, 0.01)..=percent_len(max_len, 0.01), 0);
let min_size_score = block_aligner.res().score;
if min_size_score != correct_score {
min_size_wrong += 1;
}
let wfa_adaptive_score = {
let mut wfa = WFAlignerGapAffine::new(4, 4, 2, AlignmentScope::Score, MemoryModel::MemoryHigh);
wfa.set_heuristic(Heuristic::WFadaptive(10, percent_len(q.len().max(r.len()), 0.01) as i32, 1));
wfa.align_end_to_end(q.as_bytes(), r.as_bytes());
wfa.score()
};
let largest_gap;
let wfa_score = {
let mut wfa = WFAlignerGapAffine::new(4, 4, 2, AlignmentScope::Alignment, MemoryModel::MemoryHigh);
wfa.set_heuristic(Heuristic::None);
wfa.align_end_to_end(q.as_bytes(), r.as_bytes());
let cigar = wfa.cigar();
let matches = cigar.bytes().filter(|&c| c == b'M').count();
let seq_id = (matches as f64) / (cigar.len() as f64);
seq_id_avg += seq_id;
largest_gap = get_largest_gap(cigar.as_bytes());
largest_gap_avg += largest_gap as f64;
largest_gap_max = largest_gap_max.max(largest_gap);
if scan_score == correct_score {
largest_gap_max_correct = largest_gap_max_correct.max(largest_gap);
}
wfa.score()
};
if wfa_adaptive_score != wfa_score {
wfa_wrong += 1;
}
write!(
writer,
"{}, {}, {}, {}, {}, {}\n",
name,
q.len(),
r.len(),
largest_gap,
scan_score,
correct_score
).unwrap();
count += 1;
}
(wrong, min_size_wrong, wfa_wrong, wrong_avg / (wrong as f64), count, seq_id_avg / (count as f64), largest_gap_avg / (count as f64), largest_gap_max, largest_gap_max_correct)
}
#[cfg(feature = "simd_avx2")]
fn get_largest_gap(cigar: &[u8]) -> usize {
let mut prev = b'?';
let mut curr = 0;
let mut max = 0;
for &op in cigar {
if op != prev {
prev = op;
curr = 0;
}
if op == b'I' || op == b'D' {
curr += 1;
max = max.max(curr);
}
}
max
}
#[cfg(feature = "simd_avx2")]
fn main() {
use std::env;
use std::fs::File;
use std::io::{Write, BufWriter};
let arg1 = env::args().skip(1).next();
let verbose = arg1.is_some() && arg1.unwrap() == "-v";
let paths = ["data/real.illumina.b10M.txt", "data/real.ont.b10M.txt", "data/seq_pairs.10kbps.5000.txt", "data/seq_pairs.50kbps.10000.txt"];
let names = ["illumina", "nanopore 1kbp", "nanopore <10kbp", "nanopore <50kbp"];
let max_size = [32, 128, 1024, 8192];
let out_file_name = "data/nanopore_accuracy.csv";
let mut writer = BufWriter::new(File::create(out_file_name).unwrap());
write!(writer, "dataset, query len, reference len, largest gap, pred score, true score\n").unwrap();
println!("\ndataset, total, wrong, wrong % error, min size wrong, wfa wrong");
for ((path, name), &max_size) in paths.iter().zip(&names).zip(&max_size) {
let (wrong, min_size_wrong, wfa_wrong, wrong_avg, count, seq_id_avg, largest_gap_avg, largest_gap_max, largest_gap_max_correct) = test(path, max_size, name, verbose, &mut writer);
println!("\n{}, {}, {}, {}, {}, {}", name, count, wrong, wrong_avg, min_size_wrong, wfa_wrong);
println!("# {} seq id avg: {}", name, seq_id_avg);
println!("# {} largest gap avg: {}, largest gap max: {}, largest gap max for correct: {}", name, largest_gap_avg, largest_gap_max, largest_gap_max_correct);
}
println!("# Done!");
}