use bitvec::prelude::*;
use itertools::Itertools;
use std::clone::Clone;
use std::collections::HashMap;
use std::fmt::Debug;
use std::hash::Hash;
pub fn pairwise_merge_iter<T: Ord + Hash + Clone + Debug>(
current_interleave: &BitVec<u64, Msb0>,
bwt0: &[T],
bwt1: &[T],
offsets: &HashMap<T, usize>,
) -> BitVec<u64, Msb0> {
let mut next_interleave = bitvec![u64, Msb0; 0; current_interleave.len()];
let mut current_pos0 = 0usize;
let mut current_pos1 = 0usize;
let mut temp_index = offsets.clone();
for b in current_interleave.iter() {
let c: &T = if *b {
let v = &bwt0[current_pos0];
current_pos0 += 1;
v
} else {
let v = &bwt1[current_pos1];
current_pos1 += 1;
v
};
let next_position = temp_index
.get_mut(c)
.expect("Unknown character encountered in merging BWTs");
next_interleave.set(*next_position, *b);
*next_position += 1;
}
next_interleave
}
pub fn generate_offset_hashmap<T: Ord + Hash + Clone + Debug>(bwts: &[&[T]]) -> HashMap<T, usize> {
let mut num_occurrences: HashMap<&T, usize> = HashMap::new();
for &bwt in bwts.iter() {
for c in bwt {
num_occurrences.entry(c).and_modify(|counter| *counter += 1).or_insert(1);
}
}
let ordered_chars = num_occurrences.keys().sorted().map(|&c| c.clone()).collect::<Vec<T>>();
let mut total = 0usize;
let mut offset_map = HashMap::with_capacity(num_occurrences.len());
for c in ordered_chars {
offset_map.insert(c.clone(), total);
total += num_occurrences[&c];
}
offset_map
}
pub fn pairwise_bwt_merge<T: Ord + Hash + Clone + Debug>(bwt0: &[T], bwt1: &[T]) -> Vec<T> {
let total_len = bwt0.len() + bwt1.len();
let initial_offsets = generate_offset_hashmap(&[bwt0, bwt1]);
let mut interleave = bitvec![u64, Msb0; 0; total_len];
let mut final_interleave = bitvec![u64, Msb0; 0; total_len];
for i in 0..bwt0.len() {
final_interleave.set(i, true)
}
while interleave != final_interleave {
interleave = final_interleave;
final_interleave = pairwise_merge_iter(&interleave, bwt0, bwt1, &initial_offsets);
}
let mut return_val = Vec::with_capacity(total_len);
let mut current_pos0 = 0usize;
let mut current_pos1 = 0usize;
for val in final_interleave.into_iter() {
if val {
return_val.push(bwt0[current_pos0].clone());
current_pos0 += 1;
} else {
return_val.push(bwt1[current_pos1].clone());
current_pos1 += 1;
}
}
return_val
}
pub fn naive_bwt(inputs: &[&str]) -> String {
let mut rotations: Vec<String> = vec![];
for s in inputs.iter() {
let dollar_string = s.to_string() + "$";
for l in 0..dollar_string.len() {
rotations.push(
dollar_string[l..].to_string() + &dollar_string + &dollar_string[..l],
);
}
}
rotations.sort();
let mut ret: String = String::with_capacity(rotations.len());
for r in rotations.iter() {
ret.push(r.as_bytes()[r.len() - 1] as char);
}
ret
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_basic() {
let data: Vec<&str> = vec!["CCGT", "N", "ACG"];
let bwt_stream = naive_bwt(&data);
assert_eq!(bwt_stream, "GTN$$ACCC$G");
}
#[test]
fn test_diff_len() {
let data: Vec<&str> = vec!["A", "AA", "AAA"];
let bwt_stream = naive_bwt(&data);
assert_eq!(bwt_stream, "AAA$AA$A$");
}
#[test]
fn test_cycle_breaker() {
let data: Vec<&str> = vec!["ACA", "CA"];
let bwt_stream = naive_bwt(&data);
assert_eq!(bwt_stream, "AACC$A$");
}
fn test_recursive_merging_result_vs_naive_result(data: Vec<&str>) {
let bwt_stream = naive_bwt(&data);
let mut bwts: Vec<String> = data.iter().map(|s| naive_bwt(&[s])).collect::<Vec<_>>();
let mut current_bwt = bwts.pop().unwrap().as_bytes().to_vec();
while let Some(next_bwt) = bwts.pop() {
current_bwt = pairwise_bwt_merge(¤t_bwt, next_bwt.as_bytes());
}
assert_eq!(bwt_stream.as_bytes(), current_bwt);
}
#[test]
fn merging_paper_example_works() {
let data: Vec<&str> = vec!["ACCA", "CAAA"];
test_recursive_merging_result_vs_naive_result(data)
}
#[test]
fn merging_samples_of_different_size_example_works() {
let data: Vec<&str> = vec!["ACCA", "CA"];
test_recursive_merging_result_vs_naive_result(data)
}
#[test]
fn merging_samples_of_high_similarity_works() {
let data1: Vec<&str> = vec!["A", "AA", "AAA", "AAAA", "AAAAA"];
test_recursive_merging_result_vs_naive_result(data1);
let data2: Vec<&str> = vec!["AAAAA", "AAAA", "AAA", "AA", "A"];
test_recursive_merging_result_vs_naive_result(data2);
}
}