use sa::{insert, suffix_array};
use std::ops::Index;
pub fn bwt(input: &[u8]) -> Vec<u8> {
suffix_array(input).into_iter().map(|i| {
if i == 0 { 0 } else { input[(i - 1) as usize] }
}).collect()
}
fn generate_occurrence_index(map: &mut Vec<u32>) {
let mut idx = 0;
for i in 0..map.len() {
let c = map[i];
map[i] = idx;
idx += c;
}
}
pub fn ibwt(input: &[u8]) -> Vec<u8> {
let mut map = Vec::new();
for i in input {
insert(&mut map, *i);
}
generate_occurrence_index(&mut map);
let mut lf = vec![0; input.len()];
for (i, c) in input.iter().enumerate() {
let byte = *c as usize;
let val = map[byte];
lf[i] = val;
map[byte] = val + 1;
}
let mut idx = 0;
let mut output = vec![0; input.len()];
for i in (0..(input.len() - 1)).rev() {
output[i] = input[idx];
idx = lf[idx] as usize;
}
output.pop();
output
}
#[derive(Clone, Debug)]
pub struct FMIndex {
data: Vec<u8>,
cache: Vec<u32>,
occ_map: Vec<u32>,
lf_vec: Vec<u32>,
}
impl FMIndex {
#[inline]
pub fn new(data: &[u8]) -> FMIndex {
FMIndex::new_from_bwt(bwt(data))
}
pub fn bwt(&self) -> &[u8] {
&self.data
}
pub fn new_from_bwt(bwt_data: Vec<u8>) -> FMIndex {
let mut map = Vec::new();
let mut count = vec![0u32; bwt_data.len()];
let mut idx = 0;
for i in &bwt_data {
let value = insert(&mut map, *i);
count[idx] = value;
idx += 1;
}
generate_occurrence_index(&mut map);
let mut lf_vec = count.clone();
let mut lf_occ_map = map.clone();
for (i, c) in bwt_data.iter().enumerate() {
let idx = *c as usize;
lf_vec[i] = lf_occ_map[idx];
lf_occ_map[idx] += 1;
}
let mut i = lf_vec[0] as usize;
lf_vec[0] = 0;
let mut counter = bwt_data.len() as u32 - 1;
for _ in 0..(bwt_data.len() - 1) {
let next = lf_vec[i];
lf_vec[i] = counter;
i = next as usize;
counter -= 1;
}
FMIndex {
data: bwt_data,
cache: count,
occ_map: map,
lf_vec: lf_vec,
}
}
pub fn nearest(&self, idx: usize, ch: u8) -> usize {
match self.occ_map.get(ch as usize) {
Some(res) if *res > 0 => {
*res as usize + (0..idx).rev()
.find(|&i| self.data[i] == ch)
.map(|i| self.cache[i] as usize)
.unwrap_or(0)
},
_ => 0,
}
}
fn get_range(&self, query: &str) -> Option<(usize, usize)> {
let mut top = 0;
let mut bottom = self.data.len();
for ch in query.as_bytes().iter().rev() {
top = self.nearest(top, *ch);
bottom = self.nearest(bottom, *ch);
if top >= bottom {
return None
}
}
if top >= bottom {
None
} else {
Some((top, bottom))
}
}
pub fn count(&self, query: &str) -> usize {
match self.get_range(query) {
Some((top, bottom)) => bottom - top,
None => 0,
}
}
pub fn search(&self, query: &str) -> Vec<usize> {
match self.get_range(query) {
Some((top, bottom)) => (top..bottom).map(|idx| {
let i = self.nearest(idx, self.data[idx]);
self.lf_vec[i] as usize
}).collect(),
None => Vec::new(),
}
}
}
impl Index<usize> for FMIndex {
type Output = u32;
fn index(&self, i: usize) -> &u32 {
self.lf_vec.get(i).expect("index out of range")
}
}
#[cfg(test)]
mod tests {
use super::{FMIndex, bwt, ibwt};
#[test]
fn test_bwt_and_ibwt() {
let text = String::from("ATCTAGGAGATCTGAATCTAGTTCAACTAGCTAGATCTAGAGACAGCTAA");
let bw = bwt(text.as_bytes());
let ibw = ibwt(&bw);
assert_eq!(String::from("AATCGGAGTTGCTTTG\u{0}AGTAGTGATTTTAAGAAAAAACCCCCCTAAAACG"),
String::from_utf8(bw).unwrap());
assert_eq!(text, String::from_utf8(ibw).unwrap());
}
#[test]
fn test_fm_index() {
let text = String::from("GCGTGCCCAGGGCACTGCCGCTGCAGGCGTAGGCATCGCATCACACGCGT");
let index = FMIndex::new(text.as_bytes());
assert_eq!(0, index.count("CCCCC"));
let mut result = index.search("TG");
result.sort();
assert_eq!(result, vec![3, 15, 21]);
let mut result = index.search("GCGT");
result.sort();
assert_eq!(result, vec![0, 26, 46]);
assert_eq!(vec![1], index.search("CGTGCCC"));
}
}