use bit_vec::BitVec;
use num_traits::{Num, NumCast, cast};
use std::u32::MAX as MARKER;
fn induced_sort_large<T>(input: &[T], approx_sa: &mut [u32],
mut bucket_heads: Vec<u32>, type_map: &BitVec)
where T: Num + NumCast + PartialOrd + Copy
{
for i in 0..approx_sa.len() {
let byte = approx_sa[i];
if byte == MARKER || byte == 0 {
continue
}
let j = (byte - 1) as usize;
if !type_map.get(j).unwrap() {
continue }
let bucket_idx: usize = cast(input[j]).unwrap();
let bucket_value = bucket_heads[bucket_idx];
approx_sa[bucket_value as usize] = byte - 1;
bucket_heads[bucket_idx] += 1;
}
}
fn induced_sort_small<T>(input: &[T], approx_sa: &mut [u32],
mut bucket_tails: Vec<u32>, type_map: &BitVec)
where T: Num + NumCast + PartialOrd + Copy
{
for i in (0..approx_sa.len()).rev() {
let byte = approx_sa[i];
if byte == MARKER || byte == 0 {
continue
}
let j = (byte - 1) as usize;
if type_map.get(j).unwrap() {
continue }
let bucket_idx: usize = cast(input[j]).unwrap();
let bucket_value = bucket_tails[bucket_idx];
approx_sa[bucket_value as usize] = byte - 1;
bucket_tails[bucket_idx] -= 1;
}
}
fn is_equal_lms<T>(input: &[T], lms_map: &BitVec, j: usize, k: usize) -> bool
where T: Num + NumCast + PartialOrd + Copy
{
let length = input.len();
if j == length || k == length {
return false }
for i in 0..(length + 1) {
let first_lms = lms_map.get(i + j).unwrap();
let second_lms = lms_map.get(i + k).unwrap();
if first_lms && second_lms && i > 0 {
return true
} else if (first_lms != second_lms) || (input[i + j] != input[i + k]) {
return false
}
}
false
}
pub fn insert<T>(vec: &mut Vec<u32>, value: T) -> u32
where T: Num + NumCast + PartialOrd + Copy
{
let idx = cast(value).unwrap();
if vec.len() <= idx {
vec.resize(idx + 1, 0);
}
vec[idx] += 1;
vec[idx]
}
pub fn suffix_array<T>(input: &[T]) -> Vec<u32>
where T: Num + NumCast + PartialOrd + Copy
{
let length = input.len();
let length_32 = length as u32;
let mut type_map = BitVec::from_elem(length + 1, false); let mut lms_map = BitVec::from_elem(length + 1, false); let mut bucket_sizes = Vec::new();
lms_map.set(length, true); type_map.set(length - 1, true); insert(&mut bucket_sizes, input[length - 1]);
for i in (0..length - 1).rev() {
let prev_type = type_map.get(i + 1).unwrap();
insert(&mut bucket_sizes, input[i]);
if input[i] > input[i + 1] ||
(input[i] == input[i + 1] && prev_type ) {
if !prev_type { lms_map.set(i + 1, true);
}
type_map.set(i, true);
}
}
let mut idx = 1;
let bytes = bucket_sizes.iter().enumerate().filter_map(|(i, c)| {
if *c == 0 { None } else { Some(i) }
}).collect::<Vec<_>>();
let max_byte = bytes[bytes.len() - 1];
let mut bucket_tails = vec![0u32; max_byte + 1];
let mut bucket_heads = vec![0u32; max_byte + 1];
let mut j = 0;
for i in 0..(max_byte + 1) {
bucket_heads[i] = idx;
if i == bytes[j] {
idx += bucket_sizes[bytes[j]];
j += 1;
}
bucket_tails[i] = idx - 1;
}
drop(bytes);
drop(bucket_sizes);
let mut approx_sa = {
let mut vec = vec![MARKER; length + 1];
let mut bucket_tails = bucket_tails.clone();
for (i, byte) in input.iter().enumerate() {
if !lms_map.get(i).unwrap() {
continue }
let bucket_idx: usize = cast(*byte).unwrap();
let bucket_value = bucket_tails[bucket_idx];
vec[bucket_value as usize] = i as u32;
bucket_tails[bucket_idx] -= 1;
}
vec[0] = length_32; vec
};
induced_sort_large(&input, &mut approx_sa, bucket_heads.clone(), &type_map);
induced_sort_small(&input, &mut approx_sa, bucket_tails.clone(), &type_map);
let mut label = 0;
let mut lms_bytes = {
let mut approx_sa = approx_sa;
let mut last_idx = approx_sa[0] as usize;
let mut lms_vec = vec![length_32; length + 1];
lms_vec[last_idx] = 0;
for count in approx_sa.drain(1..) {
let idx = if count == MARKER { length } else { count as usize };
if !lms_map.get(idx).unwrap() {
continue
}
if !is_equal_lms(&input, &lms_map, last_idx, idx) {
label += 1;
}
last_idx = idx;
lms_vec[idx] = label;
}
lms_vec
};
drop(lms_map);
let mut summary_index_idx = Vec::with_capacity(lms_bytes.len() / 2);
let mut summary_index_val = Vec::with_capacity(lms_bytes.len() / 2);
for (i, b) in lms_bytes.drain(..).enumerate() {
if b != length_32 {
summary_index_idx.push(i as u32);
summary_index_val.push(b);
}
}
let summary_len = summary_index_idx.len() as u32;
drop(lms_bytes);
let mut final_sa = {
let summary_sa = if label + 1 < summary_len {
let array = suffix_array(&summary_index_val);
drop(summary_index_val);
array
} else {
let summary_index = summary_index_val;
let mut sum_sa = vec![0u32; summary_index.len() + 1];
sum_sa[0] = summary_len;
for (i, idx) in summary_index.into_iter().enumerate() {
sum_sa[idx as usize + 1] = i as u32;
}
sum_sa };
let mut bucket_tails = bucket_tails.clone();
let mut suffix_idx = vec![MARKER; length + 1];
let summary_index = summary_index_idx;
for i in (2..summary_sa.len()).rev() {
let idx = summary_index[summary_sa[i] as usize];
let bucket_idx: usize = cast(input[idx as usize]).unwrap();
let bucket_value = bucket_tails[bucket_idx];
suffix_idx[bucket_value as usize] = idx as u32;
bucket_tails[bucket_idx] -= 1;
}
suffix_idx[0] = length_32;
suffix_idx
};
induced_sort_large(&input, &mut final_sa, bucket_heads, &type_map);
induced_sort_small(&input, &mut final_sa, bucket_tails, &type_map);
final_sa }
#[cfg(test)]
mod tests {
use super::suffix_array;
#[test]
fn test_suffix_array() {
let text = b"ATCGAATCGAGAGATCATCGAATCGAGATCATCGAAATCATCGAATCGTC";
let sa = suffix_array(text as &[u8]);
let mut rotations = (0..text.len()).map(|i| &text[i..]).collect::<Vec<_>>();
rotations.sort();
assert_eq!(sa.into_iter().skip(1).map(|i| &text[i as usize..]).collect::<Vec<_>>(),
rotations);
}
}