// Copyright 2014-2016 Johannes Köster, Taylor Cramer.
// Licensed under the MIT license (http://opensource.org/licenses/MIT)
// This file may not be copied, modified, or distributed
// except according to those terms.
//! Suffix arrays and related algorithms.
//! The implementation is based on the lecture notes
//! "Algorithmen auf Sequenzen", Kopczynski, Marschall, Martin and Rahmann, 2008 - 2015.
//! The original algorithm desciption can be found in:
//! [Ge Nong, Sen Zhang, Wai Hong Chan: Two Efficient Algorithms for Linear Time Suffix Array Construction. IEEE Trans. Computers 60(10): 1471–1484 (2011)](https://doi.org/10.1109/TC.2010.188)
//!
//! # Examples
//!
//! ```
//! use bio::data_structures::suffix_array::suffix_array;
//! let text = b"GCCTTAACATTATTACGCCTA$";
//! let pos = suffix_array(text);
//! assert_eq!(
//! pos,
//! vec![21, 20, 5, 6, 14, 11, 8, 7, 17, 1, 15, 18, 2, 16, 0, 19, 4, 13, 10, 3, 12, 9]
//! );
//! ```
use std::borrow::Borrow;
use std::cmp;
use std::collections::HashMap;
use std::fmt::Debug;
use std::hash::BuildHasherDefault;
use std::iter;
use std::ops::Deref;
use num_integer::Integer;
use num_traits::{cast, NumCast, Unsigned};
use bv::{BitVec, Bits, BitsMut};
use vec_map::VecMap;
use fxhash::FxHasher;
use crate::alphabets::{Alphabet, RankTransform};
use crate::data_structures::bwt::{Less, Occ, BWT};
use crate::data_structures::smallints::SmallInts;
pub type LCPArray = SmallInts<i8, isize>;
pub type RawSuffixArray = Vec<usize>;
pub type RawSuffixArraySlice<'a> = &'a [usize];
type HashMapFx<K, V> = HashMap<K, V, BuildHasherDefault<FxHasher>>;
/// A trait exposing general functionality of suffix arrays.
pub trait SuffixArray {
fn get(&self, index: usize) -> Option<usize>;
fn len(&self) -> usize;
fn is_empty(&self) -> bool;
/// Sample the suffix array with the given sample rate.
///
/// # Arguments
///
/// * `text` - text that the suffix array is built on
/// * `bwt` - the corresponding BWT
/// * `less` - the corresponding less array
/// * `occ` - the corresponding occ table
/// * `sampling_rate` - if sampling rate is k, every k-th entry will be kept
///
/// # Example
///
/// ```
/// use bio::alphabets::dna;
/// use bio::data_structures::bwt::{bwt, less, Occ};
/// use bio::data_structures::suffix_array::{suffix_array, SuffixArray};
///
/// let text = b"ACGCGAT$";
/// let alphabet = dna::n_alphabet();
/// let sa = suffix_array(text);
/// let bwt = bwt(text, &sa);
/// let less = less(&bwt, &alphabet);
/// let occ = Occ::new(&bwt, 3, &alphabet);
/// let sampled = sa.sample(text, &bwt, &less, &occ, 2);
///
/// for i in 0..sa.len() {
/// assert_eq!(sa.get(i), sampled.get(i));
/// }
/// ```
fn sample<DBWT: Borrow<BWT>, DLess: Borrow<Less>, DOcc: Borrow<Occ>>(
&self,
text: &[u8],
bwt: DBWT,
less: DLess,
occ: DOcc,
sampling_rate: usize,
) -> SampledSuffixArray<DBWT, DLess, DOcc> {
let mut sample =
Vec::with_capacity((self.len() as f32 / sampling_rate as f32).ceil() as usize);
let mut extra_rows = HashMapFx::default();
let sentinel = sentinel(text);
for i in 0..self.len() {
let idx = self.get(i).unwrap();
if (i % sampling_rate) == 0 {
sample.push(idx);
} else if bwt.borrow()[i] == sentinel {
// If bwt lookup will return a sentinel
// Text suffixes that begin right after a sentinel are always saved as extra rows
// to help deal with FM index last to front inaccuracy when there are many sentinels
extra_rows.insert(i, idx);
}
}
SampledSuffixArray {
bwt,
less,
occ,
sample,
s: sampling_rate,
extra_rows,
sentinel,
}
}
}
/// A sampled suffix array.
#[derive(Default, Clone, Eq, PartialEq, Debug, Serialize, Deserialize)]
pub struct SampledSuffixArray<DBWT: Borrow<BWT>, DLess: Borrow<Less>, DOcc: Borrow<Occ>> {
bwt: DBWT,
less: DLess,
occ: DOcc,
sample: Vec<usize>,
s: usize, // Rate of sampling
extra_rows: HashMapFx<usize, usize>,
sentinel: u8,
}
impl SuffixArray for RawSuffixArray {
fn get(&self, index: usize) -> Option<usize> {
// Explicitly written out because Vec::get(index) generates a recursion warning
if index < self.len() {
Some(self[index])
} else {
None
}
}
fn len(&self) -> usize {
Vec::len(self)
}
fn is_empty(&self) -> bool {
Vec::is_empty(self)
}
}
impl<DBWT: Borrow<BWT>, DLess: Borrow<Less>, DOcc: Borrow<Occ>> SuffixArray
for SampledSuffixArray<DBWT, DLess, DOcc>
{
fn get(&self, index: usize) -> Option<usize> {
if index < self.len() {
let mut pos = index;
let mut offset = 0;
loop {
if pos % self.s == 0 {
return Some(self.sample[pos / self.s] + offset);
}
let c = self.bwt.borrow()[pos];
if c == self.sentinel {
// Check if next character in the bwt is the sentinel
// If so, there must be a cached result to workaround FM index last to front
// mapping inaccuracy when there are multiple sentinels
// This branch should rarely be triggered so the performance impact
// of hashmap lookups would be low
return Some(self.extra_rows[&pos] + offset);
}
pos = self.less.borrow()[c as usize]
+ self.occ.borrow().get(self.bwt.borrow(), pos - 1, c);
offset += 1;
}
} else {
None
}
}
fn len(&self) -> usize {
self.bwt.borrow().len()
}
fn is_empty(&self) -> bool {
self.bwt.borrow().is_empty()
}
}
impl<DBWT: Borrow<BWT>, DLess: Borrow<Less>, DOcc: Borrow<Occ>>
SampledSuffixArray<DBWT, DLess, DOcc>
{
/// Get the sampling rate of the suffix array.
pub fn sampling_rate(&self) -> usize {
self.s
}
/// Get a reference to the internal BWT.
pub fn bwt(&self) -> &BWT {
self.bwt.borrow()
}
/// Get a reference to the internal Less.
pub fn less(&self) -> &Less {
self.less.borrow()
}
/// Get a reference to the internal Occ.
pub fn occ(&self) -> &Occ {
self.occ.borrow()
}
}
/// Construct suffix array for given text of length n.
/// Complexity: O(n).
/// This is an implementation of the induced sorting as presented by
/// Ge Nong, Sen Zhang und Wai Hong Chan (2009), also known as SAIS.
/// The implementation is based on the following lecture notes:
/// http://ls11-www.cs.tu-dortmund.de/people/rahmann/algoseq.pdf
///
/// The idea is to first mark positions as L or S, with L being a position
/// the suffix of which is lexicographically larger than that of the next position.
/// Then, LMS-positions (leftmost S) are S-positions right to an L-position.
/// An LMS substring is the substring from one LMS position to the next (inclusive).
/// The algorithm works as follows:
///
/// 1. Sort LMS positions: first step 2 is applied to the unsorted sequence
/// of positions. Surprisingly, this sorts the LMS substrings. If all substrings
/// are different, LMS positions (and their suffixes) are sorted. Else, a reduced
/// text is build (at most half the size of the original text) and we recurse into
/// suffix array construction on the reduced text, yielding the sorted LMS positions.
/// 2. Derive the order of the other positions/suffixes from the (sorted) LMS positions.
/// For this, the (still empty) suffix array is partitioned into buckets.
/// Each bucket denotes an interval of suffixes with the same first symbol.
/// We know that the L-suffixes have to occur first in the buckets, because they
/// have to be lexicographically smaller than the S-suffixes with the same first letter.
/// The LMS-positions can now be used to insert the L-positions in the correct order
/// into the buckets.
/// Then, the S-positions can be inserted, again using the already existing entries
/// in the array.
///
/// # Arguments
///
/// * `text` - the text, ended by sentinel symbol (being lexicographically smallest). The text may
/// also contain multiple sentinel symbols, used to concatenate multiple sequences without mixing
/// their suffixes together.
///
/// # Example
///
/// ```
/// use bio::data_structures::suffix_array::suffix_array;
/// let text = b"GCCTTAACATTATTACGCCTA$";
/// let pos = suffix_array(text);
/// assert_eq!(
/// pos,
/// vec![21, 20, 5, 6, 14, 11, 8, 7, 17, 1, 15, 18, 2, 16, 0, 19, 4, 13, 10, 3, 12, 9]
/// );
/// ```
pub fn suffix_array(text: &[u8]) -> RawSuffixArray {
let n = text.len();
let alphabet = Alphabet::new(text);
let sentinel_count = sentinel_count(text);
let mut sais = Sais::new(n);
match alphabet.len() + sentinel_count {
a if a <= u8::MAX as usize => {
sais.construct(&transform_text::<u8>(text, &alphabet, sentinel_count))
}
a if a <= u16::MAX as usize => {
sais.construct(&transform_text::<u16>(text, &alphabet, sentinel_count))
}
a if a <= u32::MAX as usize => {
sais.construct(&transform_text::<u32>(text, &alphabet, sentinel_count))
}
_ => sais.construct(&transform_text::<u64>(text, &alphabet, sentinel_count)),
}
sais.pos
}
/// Construct suffix array for given text from integer alphabet.
/// Complexity: O(n).
/// # Arguments
///
/// * `text` - the text, ended by sentinel symbol (being lexicographically smallest).
/// All symbols, from lexicographically smallest to largest, need to be present in the text,
/// otherwise SAIS algorithm panics on 'index out of bounds' error.
/// The text may also contain multiple sentinel symbols, used to concatenate
/// multiple sequences without mixing their suffixes together.
///
/// # Example
///
/// ```
/// use bio::data_structures::suffix_array::suffix_array_int;
/// let text: Vec<usize> = vec![3, 2, 2, 4, 4, 1, 2, 1, 0];
/// let sa = suffix_array_int(&text);
/// assert_eq!(sa, vec![8, 7, 5, 6, 1, 2, 0, 4, 3]);
/// ```
pub fn suffix_array_int<T>(text: &[T]) -> RawSuffixArray
where
T: Integer + Unsigned + NumCast + Copy + Debug,
{
let mut sais = Sais::new(text.len());
sais.construct(&text);
sais.pos
}
/// Construct lcp array for given text and suffix array of length n.
/// Complexity: O(n).
///
/// # Arguments
///
/// * `text` - the text ended by sentinel symbol (being lexicographically smallest)
/// * `pos` - the suffix array for the text
///
/// # Example
///
/// ```
/// use bio::data_structures::suffix_array::{lcp, suffix_array};
/// let text = b"GCCTTAACATTATTACGCCTA$";
/// let pos = suffix_array(text);
///
/// // obtain compressed LCP array
/// let lcp = lcp(text, &pos);
///
/// // get most values in O(1).
/// assert_eq!(lcp.get(6).unwrap(), 4);
///
/// // obtain uncompressed LCP array.
/// let uncompressed = lcp.decompress();
/// assert_eq!(
/// uncompressed,
/// [-1, 0, 1, 1, 2, 1, 4, 0, 1, 3, 1, 1, 2, 0, 4, 0, 2, 2, 2, 1, 3, 3, -1]
/// )
/// ```
pub fn lcp<SA: Deref<Target = RawSuffixArray>>(text: &[u8], pos: SA) -> LCPArray {
assert_eq!(text.len(), pos.len());
let n = text.len();
// provide the lexicographical rank for each suffix
let mut rank: Vec<usize> = iter::repeat(0).take(n).collect();
for (r, p) in pos.iter().enumerate() {
rank[*p] = r;
}
let mut lcp = SmallInts::from_elem(-1, n + 1);
let mut l = 0usize;
for (p, &r) in rank.iter().enumerate().take(n - 1) {
// since the sentinel has rank 0 and is excluded above,
// we will never have a negative index below
let pred = pos[r - 1];
while pred + l < n && p + l < n && text[p + l] == text[pred + l] {
l += 1;
}
lcp.set(r, l as isize);
l = if l > 0 { l - 1 } else { 0 };
}
lcp
}
/// Calculate all locally shortest unique substrings from a given suffix and lcp array
/// (Ohlebusch (2013). "Bioinformatics Algorithms". ISBN 978-3-00-041316-2).
/// Complexity: O(n)
///
/// # Arguments
///
/// * `pos` - the suffix array
/// * `lcp` - the lcp array
///
/// # Returns
///
/// An vector of the length of the shortest unique substring for each position of the text.
/// Suffixes are excluded. If no unique substring starts at a given position, the entry is `None`.
///
/// # Example
///
/// ```
/// use bio::data_structures::suffix_array::{lcp, shortest_unique_substrings, suffix_array};
/// let text = b"GCTGCTA$";
/// let pos = suffix_array(text);
///
/// // obtain compressed LCP array
/// let lcp = lcp(text, &pos);
///
/// // calculate shortest unique substrings
/// let sus = shortest_unique_substrings(&pos, &lcp);
/// assert_eq!(
/// sus,
/// [
/// Some(4),
/// Some(3),
/// Some(2),
/// Some(4),
/// Some(3),
/// Some(2),
/// Some(1),
/// Some(1)
/// ]
/// );
/// ```
pub fn shortest_unique_substrings<SA: SuffixArray>(pos: &SA, lcp: &LCPArray) -> Vec<Option<usize>> {
let n = pos.len();
// Initialize array representing the length of the shortest unique substring starting at position i
let mut sus = vec![None; n];
for i in 0..n {
// The longest common prefixes (LCP) of suffix pos[i] with its predecessor and successor are not unique.
// In turn the their maximum + 1 is the length of the shortest unique substring starting at pos[i].
let len = 1 + cmp::max(lcp.get(i).unwrap(), lcp.get(i + 1).unwrap_or(0)) as usize;
let p = pos.get(i).unwrap();
// Check if the suffix pos[i] is a prefix of pos[i+1]. In that case, there is no unique substring
// at this position.
if n - p >= len {
sus[p] = Some(len);
}
}
sus
}
/// Return last character of the text (expected to be the sentinel).
fn sentinel(text: &[u8]) -> u8 {
text[text.len() - 1]
}
/// Count the sentinels occurring in the text given that the last character is the sentinel.
fn sentinel_count(text: &[u8]) -> usize {
let sentinel = sentinel(text);
assert!(
text.iter().all(|&a| a >= sentinel),
"Expecting extra sentinel symbol being lexicographically smallest at the end of the \
text."
);
text.iter()
.fold(0, |count, &a| count + (a == sentinel) as usize)
}
/// Transform the given text into integers for usage in `SAIS`.
fn transform_text<T: Integer + Unsigned + NumCast + Copy + Debug>(
text: &[u8],
alphabet: &Alphabet,
sentinel_count: usize,
) -> Vec<T> {
let sentinel = sentinel(text);
let transform = RankTransform::new(alphabet);
let offset = sentinel_count - 1;
let mut transformed: Vec<T> = Vec::with_capacity(text.len());
let mut s = sentinel_count;
for &a in text {
if a == sentinel {
s -= 1;
transformed.push(cast(s).unwrap());
} else {
transformed
.push(cast(*(transform.ranks.get(a as usize)).unwrap() as usize + offset).unwrap());
}
}
transformed
}
/// SAIS implementation (see function `suffix_array` for description).
struct Sais {
pos: Vec<usize>,
lms_pos: Vec<usize>,
reduced_text_pos: Vec<usize>,
bucket_sizes: VecMap<usize>,
bucket_start: Vec<usize>,
bucket_end: Vec<usize>,
}
impl Sais {
/// Create a new instance.
fn new(n: usize) -> Self {
Sais {
pos: Vec::with_capacity(n),
lms_pos: Vec::with_capacity(n),
reduced_text_pos: vec![0; n],
bucket_sizes: VecMap::new(),
bucket_start: Vec::with_capacity(n),
bucket_end: Vec::with_capacity(n),
}
}
/// Init buckets.
fn init_bucket_start<T: Integer + Unsigned + NumCast + Copy>(&mut self, text: &[T]) {
self.bucket_sizes.clear();
self.bucket_start.clear();
for &c in text {
if !self.bucket_sizes.contains_key(cast(c).unwrap()) {
self.bucket_sizes.insert(cast(c).unwrap(), 0);
}
*(self.bucket_sizes.get_mut(cast(c).unwrap()).unwrap()) += 1;
}
let mut sum = 0;
for &size in self.bucket_sizes.values() {
self.bucket_start.push(sum);
sum += size;
}
}
/// Initialize pointers to the last element of the buckets.
fn init_bucket_end<T: Integer + Unsigned + NumCast + Copy>(&mut self, text: &[T]) {
self.bucket_end.clear();
for &r in &self.bucket_start[1..] {
self.bucket_end.push(r - 1);
}
self.bucket_end.push(text.len() - 1);
}
/// Check if two LMS substrings are equal.
fn lms_substring_eq<T: Integer + Unsigned + NumCast + Copy>(
&self,
text: &[T],
pos_types: &PosTypes,
i: usize,
j: usize,
) -> bool {
for k in 0.. {
let lmsi = pos_types.is_lms_pos(i + k);
let lmsj = pos_types.is_lms_pos(j + k);
if text[i + k] != text[j + k] {
// different symbols
return false;
}
if lmsi != lmsj {
// different length
return false;
}
if k > 0 && lmsi && lmsj {
// same symbols and same length
return true;
}
}
false
}
/// Sort LMS suffixes.
fn sort_lms_suffixes<
T: Integer + Unsigned + NumCast + Copy + Debug,
S: Integer + Unsigned + NumCast + Copy + Debug,
>(
&mut self,
text: &[T],
pos_types: &PosTypes,
lms_substring_count: usize,
) {
// if less than 2 LMS substrings are present, no further sorting is needed
if lms_substring_count > 1 {
// sort LMS suffixes by recursively building SA on reduced text
let mut reduced_text: Vec<S> = vec![cast(0).unwrap(); lms_substring_count];
let mut label = 0;
reduced_text[self.reduced_text_pos[self.pos[0]]] = cast(label).unwrap();
let mut prev = None;
for &p in &self.pos {
if pos_types.is_lms_pos(p) {
// choose same label if substrings are equal
if prev.is_some() && !self.lms_substring_eq(text, pos_types, prev.unwrap(), p) {
label += 1;
}
reduced_text[self.reduced_text_pos[p]] = cast(label).unwrap();
prev = Some(p);
}
}
// if we have less labels than substrings, we have to sort by recursion
// because two or more substrings are equal
if label + 1 < lms_substring_count {
// backup lms_pos
let lms_pos = self.lms_pos.clone();
// recurse SA construction for reduced text
self.construct(&reduced_text);
// obtain sorted lms suffixes
self.lms_pos.clear();
for &p in &self.pos {
self.lms_pos.push(lms_pos[p]);
}
} else {
// otherwise, lms_pos is updated with the sorted suffixes from pos
// obtain sorted lms suffixes
self.lms_pos.clear();
for &p in &self.pos {
if pos_types.is_lms_pos(p) {
self.lms_pos.push(p);
}
}
}
}
}
/// Construct the suffix array.
fn construct<T: Integer + Unsigned + NumCast + Copy + Debug>(&mut self, text: &[T]) {
let pos_types = PosTypes::new(text);
self.calc_lms_pos(text, &pos_types);
self.calc_pos(text, &pos_types);
}
/// Step 1 of the SAIS algorithm.
fn calc_lms_pos<T: Integer + Unsigned + NumCast + Copy + Debug>(
&mut self,
text: &[T],
pos_types: &PosTypes,
) {
let n = text.len();
// collect LMS positions
self.lms_pos.clear();
let mut i = 0;
for r in 0..n {
if pos_types.is_lms_pos(r) {
self.lms_pos.push(r);
self.reduced_text_pos[r] = i;
i += 1;
}
}
// sort LMS substrings by applying step 2 with unsorted LMS positions
self.calc_pos(text, pos_types);
let lms_substring_count = self.lms_pos.len();
if lms_substring_count <= u8::MAX as usize {
self.sort_lms_suffixes::<T, u8>(text, pos_types, lms_substring_count);
} else if lms_substring_count <= u16::MAX as usize {
self.sort_lms_suffixes::<T, u16>(text, pos_types, lms_substring_count);
} else if lms_substring_count <= u32::MAX as usize {
self.sort_lms_suffixes::<T, u32>(text, pos_types, lms_substring_count);
} else {
self.sort_lms_suffixes::<T, u64>(text, pos_types, lms_substring_count);
}
}
/// Step 2 of the SAIS algorithm.
fn calc_pos<T: Integer + Unsigned + NumCast + Copy>(
&mut self,
text: &[T],
pos_types: &PosTypes,
) {
let n = text.len();
self.pos.clear();
self.init_bucket_start(text);
self.init_bucket_end(text);
// init all positions as unknown (n-1 is max position)
self.pos.resize(n, n);
// insert LMS positions to the end of their buckets
for &p in self.lms_pos.iter().rev() {
let c: usize = cast(text[p]).unwrap();
self.pos[self.bucket_end[c]] = p;
// subtract without overflow: last -1 will cause overflow, but it does not matter
self.bucket_end[c] = self.bucket_end[c].wrapping_sub(1);
}
// reset bucket ends
self.init_bucket_end(text);
// insert L-positions into buckets
for r in 0..n {
let p = self.pos[r];
// ignore undefined positions and the zero since it has no predecessor
if p == n || p == 0 {
continue;
}
let pred = p - 1;
if pos_types.is_l_pos(pred) {
let c: usize = cast(text[pred]).unwrap();
self.pos[self.bucket_start[c]] = pred;
self.bucket_start[c] += 1;
}
}
// insert S-positions into buckets
for r in (0..n).rev() {
let p = self.pos[r];
if p == 0 {
continue;
}
let pred = p - 1;
if pos_types.is_s_pos(pred) {
let c: usize = cast(text[pred]).unwrap();
self.pos[self.bucket_end[c]] = pred;
// subtract without overflow: last -1 will cause overflow, but it won't be used
self.bucket_end[c] = self.bucket_end[c].wrapping_sub(1);
}
}
}
}
/// Position types (L or S).
#[derive(Debug)]
struct PosTypes {
pos_types: BitVec,
}
impl PosTypes {
/// Calculate the text position type.
/// L-type marks suffixes being lexicographically larger than their successor,
/// S-type marks the others.
/// This function fills a BitVec, with 1-bits denoting S-type
/// and 0-bits denoting L-type.
///
/// # Arguments
///
/// * `text` - the text, ending with a sentinel.
fn new<T: Integer + Unsigned + NumCast + Copy>(text: &[T]) -> Self {
let n = text.len();
let mut pos_types = BitVec::new_fill(false, n as u64);
pos_types.set_bit(n as u64 - 1, true);
for p in (0..n - 1).rev() {
if text[p] == text[p + 1] {
// if the characters are equal, the next position determines
// the lexicographical order
let v = pos_types.get_bit(p as u64 + 1);
pos_types.set_bit(p as u64, v);
} else {
pos_types.set_bit(p as u64, text[p] < text[p + 1]);
}
}
PosTypes { pos_types }
}
/// Check if p is S-position.
fn is_s_pos(&self, p: usize) -> bool {
self.pos_types.get_bit(p as u64)
}
/// Check if p is L-position.
fn is_l_pos(&self, p: usize) -> bool {
!self.pos_types.get_bit(p as u64)
}
/// Check if p is LMS-position.
fn is_lms_pos(&self, p: usize) -> bool {
p != 0 && self.is_s_pos(p) && self.is_l_pos(p - 1)
}
}
#[cfg(test)]
mod tests {
use super::*;
use super::{transform_text, PosTypes, Sais};
use crate::alphabets::{dna, Alphabet};
use crate::data_structures::bwt::{bwt, less};
use bv::{BitVec, BitsPush};
use rand;
use rand::prelude::*;
use std::str;
#[test]
fn test_pos_types() {
let orig_text = b"GCCTTAACATTATTACGCCTA$";
let alphabet = Alphabet::new(orig_text);
let text: Vec<u8> = transform_text(orig_text, &alphabet, 1);
let n = text.len();
let pos_types = PosTypes::new(&text);
//let mut test = BitSlice::from_slice(&[0b01100110, 0b10010011, 0b01100100]).to_owned();
let mut test = BitVec::new();
test.push_block(0b001001101100100101100110);
test.truncate(n as u64);
assert_eq!(pos_types.pos_types, test);
let lms_pos: Vec<usize> = (0..n).filter(|&p| pos_types.is_lms_pos(p)).collect();
assert_eq!(lms_pos, vec![1, 5, 8, 11, 14, 17, 21]);
}
#[test]
fn test_buckets() {
let orig_text = b"GCCTTAACATTATTACGCCTA$";
let alphabet = Alphabet::new(orig_text);
let text: Vec<u8> = transform_text(orig_text, &alphabet, 1);
let n = text.len();
let mut sais = Sais::new(n);
sais.init_bucket_start(&text);
assert_eq!(sais.bucket_start, vec![0, 1, 7, 13, 15]);
sais.init_bucket_end(&text);
assert_eq!(sais.bucket_end, vec![0, 6, 12, 14, 21]);
}
#[test]
fn test_pos() {
let orig_text = b"GCCTTAACATTATTACGCCTA$";
let alphabet = Alphabet::new(orig_text);
let text: Vec<u8> = transform_text(orig_text, &alphabet, 1);
let n = text.len();
let mut sais = Sais::new(n);
let pos_types = PosTypes::new(&text);
sais.lms_pos = vec![21, 5, 14, 8, 11, 17, 1];
sais.calc_pos(&text, &pos_types);
assert_eq!(
sais.pos,
vec![21, 20, 5, 6, 14, 11, 8, 7, 17, 1, 15, 18, 2, 16, 0, 19, 4, 13, 10, 3, 12, 9,]
);
}
#[test]
fn test_lms_pos() {
let orig_text = b"GCCTTAACATTATTACGCCTA$";
let alphabet = Alphabet::new(orig_text);
let text: Vec<u8> = transform_text(orig_text, &alphabet, 1);
let n = text.len();
let mut sais = Sais::new(n);
let pos_types = PosTypes::new(&text);
sais.calc_lms_pos(&text, &pos_types);
}
#[test]
fn test_issue10_1() {
let text = b"TGTGTGTGTG$";
let pos = suffix_array(text);
assert_eq!(pos, [10, 9, 7, 5, 3, 1, 8, 6, 4, 2, 0]);
}
#[test]
fn test_issue10_2() {
let text = b"TGTGTGTG$";
let pos = suffix_array(text);
assert_eq!(pos, [8, 7, 5, 3, 1, 6, 4, 2, 0]);
}
#[test]
fn test_handles_sentinels_properly() {
let reads = b"TACTCCGCTAGGGACACCTAAATAGATACTCGCAAAGGCGACTGATATATCCTTAGGTCGAAGAGATACCAGAGAAATAGTAGGTCTTAGGCTAGTCCTT$AAGGACTAGCCTAAGACCTACTATTTCTCTGGTATCTCTTCGACCTAAGGATATATCAGTCGCCTTTGCGAGTATCTATTTAGGTGTCCCTAGCGGAGTA$TAGGGACACCTAAATAGATACTCGCAAAGGCGACTGATATATCCTTAGGTCGAAGAGATACCAGAGAAATAGTAGGTCTTAGGCTAGTCCTTGTCCAGTA$TACTGGACAAGGACTAGCCTAAGACCTACTATTTCTCTGGTATCTCTTCGACCTAAGGATATATCAGTCGCCTTTGCGAGTATCTATTTAGGTGTCCCTA$ACGCACCCCGGCATTCGTCGACTCTACACTTAGTGGAACATACAAATTCGCTCGCAGGAGCGCCTCATACATTCTAACGCAGTGATCTTCGGCTGAGACT$AGTCTCAGCCGAAGATCACTGCGTTAGAATGTATGAGGCGCTCCTGCGAGCGAATTTGTATGTTCCACTAAGTGTAGAGTCGACGAATGCCGGGGTGCGT$";
suffix_array(reads);
}
fn str_from_pos(sa: &[usize], text: &[u8], index: usize) -> String {
String::from(
str::from_utf8(&text[sa[index]..])
.unwrap()
.split('$')
.next()
.unwrap_or(""),
) + "$"
}
fn rand_seqs(num_seqs: usize, seq_len: usize) -> Vec<u8> {
let mut rng = rand::rng();
let alpha = [b'A', b'T', b'C', b'G', b'N'];
let seqs = (0..num_seqs)
.map(|_| {
let len = rng.random_range((seq_len / 2)..=seq_len);
(0..len)
.map(|_| *alpha.choose(&mut rng).unwrap())
.collect::<Vec<u8>>()
})
.collect::<Vec<_>>();
let mut res = seqs.join(&b'$');
res.push(b'$');
res
}
#[test]
fn test_sorts_lexically() {
let mut test_cases = vec![(&b"A$C$G$T$"[..], "simple"),
(&b"A$A$T$T$"[..], "duplicates"),
(&b"AA$GA$CA$TA$TC$TG$GT$GC$"[..], "two letter"),
(&b"AGCCAT$CAGCC$"[..], "substring"),
(&b"GTAGGCCTAATTATAATCAGCGGACATTTCGTATTGCTCGGGCTGCCAGGATTTTAGCATCAGTAGCCGGGTAATGGAACCTCAAGAGGTCAGCGTCGAA$\
AATCAGCGGACATTTCGTATTGCTCGGGCTGCCAGGATTTTAGCATCAGTAGCCGGGTAATGGAACCTCAAGAGGTCAGCGTCGAATGGCTATTCCAATA$"[..],
"complex"),
(&b"GTAGGCCTAATTATAATCAGCGGACATTTCGTATTGCTCGGGCTGCCAGGATTTTAGCATCAGTAGCCGGGTAATGGAACCTCAAGAGGTCAGCGTCGAA$\
TTCGACGCTGACCTCTTGAGGTTCCATTACCCGGCTACTGATGCTAAAATCCTGGCAGCCCGAGCAATACGAAATGTCCGCTGATTATAATTAGGCCTAC$\
AATCAGCGGACATTTCGTATTGCTCGGGCTGCCAGGATTTTAGCATCAGTAGCCGGGTAATGGAACCTCAAGAGGTCAGCGTCGAATGGCTATTCCAATA$\
TATTGGAATAGCCATTCGACGCTGACCTCTTGAGGTTCCATTACCCGGCTACTGATGCTAAAATCCTGGCAGCCCGAGCAATACGAAATGTCCGCTGATT$"[..],
"complex with revcomps"),
];
let num_rand = 100;
let rand_cases: Vec<_> = (0..num_rand).map(|i| rand_seqs(10, i * 10)).collect();
test_cases.extend(
rand_cases
.iter()
.map(|case| (case.as_ref(), "rand test case")),
);
for &(text, test_name) in &test_cases {
let pos = suffix_array(text);
for i in 0..(pos.len() - 2) {
// Check that every element in the suffix array is lexically <= the next elem
let cur = str_from_pos(&pos, text, i);
let next = str_from_pos(&pos, text, i + 1);
assert!(
cur <= next,
"Failed:\n{}\n{}\nat positions {} and {} are out of order in \
test: {}",
cur,
next,
pos[i],
pos[i + 1],
test_name
);
}
}
}
#[test]
fn test_sampled_matches() {
let mut test_cases = vec![(&b"A$C$G$T$"[..], "simple"),
(&b"A$A$T$T$"[..], "duplicates"),
(&b"AA$GA$CA$TA$TC$TG$GT$GC$"[..], "two letter"),
(&b"AGCCAT$\
CAGCC$"[..],
"substring"),
(&b"GTAGGCCTAATTATAATCAGCGGACATTTCGTATTGCTCGGGCTGCCAGGATTTTAGCATCAGTAGCCGGGTAATGGAACCTCAAGAGGTCAGCGTCGAA$\
AATCAGCGGACATTTCGTATTGCTCGGGCTGCCAGGATTTTAGCATCAGTAGCCGGGTAATGGAACCTCAAGAGGTCAGCGTCGAATGGCTATTCCAATA$"[..],
"complex"),
(&b"GTAGGCCTAATTATAATCAGCGGACATTTCGTATTGCTCGGGCTGCCAGGATTTTAGCATCAGTAGCCGGGTAATGGAACCTCAAGAGGTCAGCGTCGAA$\
TTCGACGCTGACCTCTTGAGGTTCCATTACCCGGCTACTGATGCTAAAATCCTGGCAGCCCGAGCAATACGAAATGTCCGCTGATTATAATTAGGCCTAC$\
AATCAGCGGACATTTCGTATTGCTCGGGCTGCCAGGATTTTAGCATCAGTAGCCGGGTAATGGAACCTCAAGAGGTCAGCGTCGAATGGCTATTCCAATA$\
TATTGGAATAGCCATTCGACGCTGACCTCTTGAGGTTCCATTACCCGGCTACTGATGCTAAAATCCTGGCAGCCCGAGCAATACGAAATGTCCGCTGATT$"[..],
"complex with revcomps"),
(&b"GTAG$GCCTAAT$TATAATCAG$"[..], "issue70"),
(&b"TGTGTGTGTG$"[..], "repeating"),
(&b"TACTCCGCTAGGGACACCTAAATAGATACTCGCAAAGGCGACTGATATATCCTTAGGTCGAAGAGATACCAGAGAAATAGTAGGTCTTAGGCTAGTCCTT$AAGGACTAGCCTAAGACCTACTATTTCTCTGGTATCTCTTCGACCTAAGGATATATCAGTCGCCTTTGCGAGTATCTATTTAGGTGTCCCTAGCGGAGTA$TAGGGACACCTAAATAGATACTCGCAAAGGCGACTGATATATCCTTAGGTCGAAGAGATACCAGAGAAATAGTAGGTCTTAGGCTAGTCCTTGTCCAGTA$TACTGGACAAGGACTAGCCTAAGACCTACTATTTCTCTGGTATCTCTTCGACCTAAGGATATATCAGTCGCCTTTGCGAGTATCTATTTAGGTGTCCCTA$ACGCACCCCGGCATTCGTCGACTCTACACTTAGTGGAACATACAAATTCGCTCGCAGGAGCGCCTCATACATTCTAACGCAGTGATCTTCGGCTGAGACT$AGTCTCAGCCGAAGATCACTGCGTTAGAATGTATGAGGCGCTCCTGCGAGCGAATTTGTATGTTCCACTAAGTGTAGAGTCGACGAATGCCGGGGTGCGT$"[..], "complex sentinels"),
];
let num_rand = 100;
let rand_cases: Vec<_> = (0..num_rand).map(|i| rand_seqs(10, i * 10)).collect();
test_cases.extend(
rand_cases
.iter()
.map(|case| (case.as_ref(), "rand test case")),
);
for &(text, test_name) in test_cases.iter() {
for &sample_rate in &[2, 3, 5, 16] {
let alphabet = dna::n_alphabet();
let sa = suffix_array(text);
let bwt = bwt(text, &sa);
let less = less(&bwt, &alphabet);
let occ = Occ::new(&bwt, 3, &alphabet);
let sampled = sa.sample(text, &bwt, &less, &occ, sample_rate);
for i in 0..sa.len() {
let sa_idx = sa.get(i).unwrap();
let sampled_idx = sampled.get(i).unwrap();
assert_eq!(
sa_idx,
sampled_idx,
"Failed:\n{}\n{}\nat index {} do not match in test: {} (sample rate: {})",
str::from_utf8(&text[sa_idx..]).unwrap(),
str::from_utf8(&text[sampled_idx..]).unwrap(),
i,
test_name,
sample_rate
);
}
}
}
}
#[test]
fn can_construct_sa_for_usize() {
let text: Vec<usize> = vec![3, 2, 2, 4, 4, 1, 2, 1, 0];
let sa = suffix_array_int(&text);
assert_eq!(sa, vec![8, 7, 5, 6, 1, 2, 0, 4, 3]);
}
}