use std::iter::repeat;
use crate::alphabets::Alphabet;
use crate::data_structures::suffix_array::RawSuffixArraySlice;
use crate::utils::prescan;
pub type BWT = Vec<u8>;
pub type BWTSlice = [u8];
pub type Less = Vec<usize>;
pub type BWTFind = Vec<usize>;
pub fn bwt(text: &[u8], pos: RawSuffixArraySlice) -> BWT {
assert_eq!(text.len(), pos.len());
let n = text.len();
let mut bwt: BWT = repeat(0).take(n).collect();
for r in 0..n {
let p = pos[r];
bwt[r] = if p > 0 { text[p - 1] } else { text[n - 1] };
}
bwt
}
pub fn invert_bwt(bwt: &BWTSlice) -> Vec<u8> {
let alphabet = Alphabet::new(bwt);
let n = bwt.len();
let bwtfind = bwtfind(bwt, &alphabet);
let mut inverse = Vec::with_capacity(n);
let mut r = bwtfind[0];
for _ in 0..n {
r = bwtfind[r];
inverse.push(bwt[r]);
}
inverse
}
#[derive(Default, Clone, Eq, PartialEq, Ord, PartialOrd, Hash, Debug, Serialize, Deserialize)]
pub struct Occ {
occ: Vec<Vec<usize>>,
k: u32,
}
impl Occ {
pub fn new(bwt: &BWTSlice, k: u32, alphabet: &Alphabet) -> Self {
let n = bwt.len();
let m = alphabet
.max_symbol()
.expect("Expecting non-empty alphabet.") as usize
+ 1;
let mut alpha = alphabet.symbols.iter().collect::<Vec<usize>>();
if (b'$' as usize) < m && !alphabet.is_word(b"$") {
alpha.push(b'$' as usize);
}
let mut occ: Vec<Vec<usize>> = vec![Vec::new(); m];
let mut curr_occ = vec![0usize; m];
for &a in &alpha {
occ[a].reserve(n / k as usize);
}
for (i, &c) in bwt.iter().enumerate() {
curr_occ[c as usize] += 1;
if i % k as usize == 0 {
for &a in &alpha {
occ[a].push(curr_occ[a]);
}
}
}
Occ { occ, k }
}
pub fn get(&self, bwt: &BWTSlice, r: usize, a: u8) -> usize {
let lo_checkpoint = r / self.k as usize;
let lo_occ = self.occ[a as usize][lo_checkpoint];
if self.k > 64 {
let hi_checkpoint = lo_checkpoint + 1;
if let Some(&hi_occ) = self.occ[a as usize].get(hi_checkpoint) {
if lo_occ == hi_occ {
return lo_occ;
}
let hi_idx = hi_checkpoint * self.k as usize;
if (hi_idx - r) < (self.k as usize / 2) {
return hi_occ - bytecount::count(&bwt[r + 1..=hi_idx], a);
}
}
}
let lo_idx = lo_checkpoint * self.k as usize;
bytecount::count(&bwt[lo_idx + 1..=r], a) + lo_occ
}
}
pub fn less(bwt: &BWTSlice, alphabet: &Alphabet) -> Less {
let m = alphabet
.max_symbol()
.expect("Expecting non-empty alphabet.") as usize
+ 2;
let mut less: Less = repeat(0).take(m).collect();
for &c in bwt.iter() {
less[c as usize] += 1;
}
prescan(&mut less[..], 0, |a, b| a + b);
less
}
pub fn bwtfind(bwt: &BWTSlice, alphabet: &Alphabet) -> BWTFind {
let n = bwt.len();
let mut less = less(bwt, alphabet);
let mut bwtfind: BWTFind = repeat(0).take(n).collect();
for (r, &c) in bwt.iter().enumerate() {
bwtfind[less[c as usize]] = r;
less[c as usize] += 1;
}
bwtfind
}
#[cfg(test)]
mod tests {
use super::{bwt, bwtfind, invert_bwt, Occ};
use crate::alphabets::dna;
use crate::alphabets::Alphabet;
use crate::data_structures::suffix_array::suffix_array;
use crate::data_structures::wavelet_matrix::WaveletMatrix;
#[test]
fn test_bwtfind() {
let text = b"cabca$";
let alphabet = Alphabet::new(b"abc$");
let pos = suffix_array(text);
let bwt = bwt(text, &pos);
let bwtfind = bwtfind(&bwt, &alphabet);
assert_eq!(bwtfind, vec![5, 0, 3, 4, 1, 2]);
}
#[test]
fn test_invert_bwt() {
let text = b"cabca$";
let pos = suffix_array(text);
let bwt = bwt(text, &pos);
let inverse = invert_bwt(&bwt);
assert_eq!(inverse, text);
}
#[test]
fn test_occ() {
let bwt = vec![1u8, 3u8, 3u8, 1u8, 2u8, 0u8];
let alphabet = Alphabet::new([0u8, 1u8, 2u8, 3u8]);
let occ = Occ::new(&bwt, 3, &alphabet);
assert_eq!(occ.occ, [[0, 0], [1, 2], [0, 0], [0, 2]]);
assert_eq!(occ.get(&bwt, 4, 2u8), 1);
assert_eq!(occ.get(&bwt, 4, 3u8), 2);
}
#[test]
fn test_occwm() {
let text = b"GCCTTAACATTATTACGCCTA$";
let alphabet = {
let mut a = dna::n_alphabet();
a.insert(b'$');
a
};
let sa = suffix_array(text);
let bwt = bwt(text, &sa);
let occ = Occ::new(&bwt, 3, &alphabet);
let wm = WaveletMatrix::new(&bwt);
for c in [b'A', b'C', b'G', b'T', b'$'] {
for p in 0..text.len() {
assert_eq!(occ.get(&bwt, p, c) as u64, wm.rank(c, p as u64));
}
}
}
}