bio/data_structures/suffix_array.rs

// Copyright 2014-2016 Johannes Köster, Taylor Cramer.
// Licensed under the MIT license (http://opensource.org/licenses/MIT)
// This file may not be copied, modified, or distributed
// except according to those terms.

//! Suffix arrays and related algorithms.
//! The implementation is based on the lecture notes
//! "Algorithmen auf Sequenzen", Kopczynski, Marschall, Martin and Rahmann, 2008 - 2015.
//! The original algorithm description can be found in:
//! [Ge Nong, Sen Zhang, Wai Hong Chan: Two Efficient Algorithms for Linear Time Suffix Array Construction. IEEE Trans. Computers 60(10): 1471–1484 (2011)](https://doi.org/10.1109/TC.2010.188)
//!
//! # Examples
//!
//! ```
//! use bio::data_structures::suffix_array::suffix_array;
//! let text = b"GCCTTAACATTATTACGCCTA$";
//! let pos = suffix_array(text);
//! assert_eq!(
//!     pos,
//!     vec![21, 20, 5, 6, 14, 11, 8, 7, 17, 1, 15, 18, 2, 16, 0, 19, 4, 13, 10, 3, 12, 9]
//! );
//! ```

use std::borrow::Borrow;
use std::cmp;
use std::collections::HashMap;
use std::fmt::Debug;
use std::hash::BuildHasherDefault;
use std::iter;
use std::ops::Deref;

use num_integer::Integer;
use num_traits::{cast, NumCast, Unsigned};

use bv::{BitVec, Bits, BitsMut};
use vec_map::VecMap;

use fxhash::FxHasher;

use crate::alphabets::{Alphabet, RankTransform};
use crate::data_structures::bwt::{Less, Occ, BWT};
use crate::data_structures::smallints::SmallInts;

pub type LCPArray = SmallInts<i8, isize>;
pub type RawSuffixArray = Vec<usize>;
pub type RawSuffixArraySlice<'a> = &'a [usize];

type HashMapFx<K, V> = HashMap<K, V, BuildHasherDefault<FxHasher>>;

/// A trait exposing general functionality of suffix arrays.
pub trait SuffixArray {
    fn get(&self, index: usize) -> Option<usize>;
    fn len(&self) -> usize;
    fn is_empty(&self) -> bool;

    /// Sample the suffix array with the given sample rate.
    ///
    /// # Arguments
    ///
    /// * `text` - text that the suffix array is built on
    /// * `bwt` - the corresponding BWT
    /// * `less` - the corresponding less array
    /// * `occ` - the corresponding occ table
    /// * `sampling_rate` - if sampling rate is k, every k-th entry will be kept
    ///
    /// # Example
    ///
    /// ```
    /// use bio::alphabets::dna;
    /// use bio::data_structures::bwt::{bwt, less, Occ};
    /// use bio::data_structures::suffix_array::{suffix_array, SuffixArray};
    ///
    /// let text = b"ACGCGAT$";
    /// let alphabet = dna::n_alphabet();
    /// let sa = suffix_array(text);
    /// let bwt = bwt(text, &sa);
    /// let less = less(&bwt, &alphabet);
    /// let occ = Occ::new(&bwt, 3, &alphabet);
    /// let sampled = sa.sample(text, &bwt, &less, &occ, 2);
    ///
    /// for i in 0..sa.len() {
    ///     assert_eq!(sa.get(i), sampled.get(i));
    /// }
    /// ```
    fn sample<DBWT: Borrow<BWT>, DLess: Borrow<Less>, DOcc: Borrow<Occ>>(
        &self,
        text: &[u8],
        bwt: DBWT,
        less: DLess,
        occ: DOcc,
        sampling_rate: usize,
    ) -> SampledSuffixArray<DBWT, DLess, DOcc> {
        let mut sample =
            Vec::with_capacity((self.len() as f32 / sampling_rate as f32).ceil() as usize);
        let mut extra_rows = HashMapFx::default();
        let sentinel = sentinel(text);

        for i in 0..self.len() {
            let idx = self.get(i).unwrap();
            if (i % sampling_rate) == 0 {
                sample.push(idx);
            } else if bwt.borrow()[i] == sentinel {
                // The BWT lookup at this row returns a sentinel.
                // Text suffixes that begin right after a sentinel are always saved as extra rows
                // to help deal with the FM index last-to-front mapping inaccuracy when there are many sentinels.
                extra_rows.insert(i, idx);
            }
        }

        SampledSuffixArray {
            bwt,
            less,
            occ,
            sample,
            s: sampling_rate,
            extra_rows,
            sentinel,
        }
    }
}

/// A sampled suffix array.
#[derive(Default, Clone, Eq, PartialEq, Debug, Serialize, Deserialize)]
pub struct SampledSuffixArray<DBWT: Borrow<BWT>, DLess: Borrow<Less>, DOcc: Borrow<Occ>> {
    bwt: DBWT,
    less: DLess,
    occ: DOcc,
    sample: Vec<usize>,
    s: usize, // Rate of sampling
    extra_rows: HashMapFx<usize, usize>,
    sentinel: u8,
}

impl SuffixArray for RawSuffixArray {
    fn get(&self, index: usize) -> Option<usize> {
        // Explicitly written out because Vec::get(index) generates a recursion warning
        if index < self.len() {
            Some(self[index])
        } else {
            None
        }
    }

    fn len(&self) -> usize {
        Vec::len(self)
    }

    fn is_empty(&self) -> bool {
        Vec::is_empty(self)
    }
}

impl<DBWT: Borrow<BWT>, DLess: Borrow<Less>, DOcc: Borrow<Occ>> SuffixArray
    for SampledSuffixArray<DBWT, DLess, DOcc>
{
    fn get(&self, index: usize) -> Option<usize> {
        if index < self.len() {
            let mut pos = index;
            let mut offset = 0;
            loop {
                if pos % self.s == 0 {
                    return Some(self.sample[pos / self.s] + offset);
                }

                let c = self.bwt.borrow()[pos];

                if c == self.sentinel {
                    // The character in the BWT at this row is the sentinel, so there must be a
                    // cached result to work around the last-to-front mapping inaccuracy of the
                    // FM index when there are multiple sentinels.
                    // This branch should rarely be triggered, so the performance impact
                    // of the hashmap lookups is low.
                    return Some(self.extra_rows[&pos] + offset);
                }

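                // Follow the LF-mapping: this row corresponds to the suffix that starts one
                // position earlier in the text; `offset` counts how many steps we have walked back.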
                pos = self.less.borrow()[c as usize]
                    + self.occ.borrow().get(self.bwt.borrow(), pos - 1, c);
                offset += 1;
            }
        } else {
            None
        }
    }

    fn len(&self) -> usize {
        self.bwt.borrow().len()
    }

    fn is_empty(&self) -> bool {
        self.bwt.borrow().is_empty()
    }
}

impl<DBWT: Borrow<BWT>, DLess: Borrow<Less>, DOcc: Borrow<Occ>>
    SampledSuffixArray<DBWT, DLess, DOcc>
{
    /// Get the sampling rate of the suffix array.
    pub fn sampling_rate(&self) -> usize {
        self.s
    }

    /// Get a reference to the internal BWT.
    pub fn bwt(&self) -> &BWT {
        self.bwt.borrow()
    }

    /// Get a reference to the internal Less.
    pub fn less(&self) -> &Less {
        self.less.borrow()
    }

    /// Get a reference to the internal Occ.
    pub fn occ(&self) -> &Occ {
        self.occ.borrow()
    }
}

/// Construct suffix array for given text of length n.
/// Complexity: O(n).
/// This is an implementation of the induced sorting as presented by
/// Ge Nong, Sen Zhang and Wai Hong Chan (2009), also known as SAIS.
/// The implementation is based on the following lecture notes:
/// http://ls11-www.cs.tu-dortmund.de/people/rahmann/algoseq.pdf
///
/// The idea is to first mark positions as L or S, with L being a position
/// whose suffix is lexicographically larger than that of the next position.
/// LMS-positions (leftmost S) are then S-positions directly to the right of an L-position.
/// An LMS substring is the substring from one LMS position to the next (inclusive).
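/// For example (an illustration of these definitions, not taken from the lecture notes):
/// for the text `GTGTG$`, the position types are `S L S L L S`, so positions 2 and 5 are
/// LMS-positions and `GTG$` (positions 2 through 5) is an LMS substring.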
/// The algorithm works as follows:
///
/// 1. Sort LMS positions: first, step 2 is applied to the unsorted sequence
///    of positions. Surprisingly, this sorts the LMS substrings. If all substrings
///    are different, the LMS positions (and their suffixes) are sorted. Otherwise, a reduced
///    text is built (at most half the size of the original text) and we recurse into
///    suffix array construction on the reduced text, yielding the sorted LMS positions.
/// 2. Derive the order of the other positions/suffixes from the (sorted) LMS positions.
///    For this, the (still empty) suffix array is partitioned into buckets.
///    Each bucket denotes an interval of suffixes with the same first symbol.
///    We know that the L-suffixes have to occur first in the buckets, because they
///    have to be lexicographically smaller than the S-suffixes with the same first letter.
///    The LMS-positions can now be used to insert the L-positions in the correct order
///    into the buckets.
///    Then, the S-positions can be inserted, again using the already existing entries
///    in the array.
///
/// # Arguments
///
/// * `text` - the text, ended by sentinel symbol (being lexicographically smallest). The text may
///   also contain multiple sentinel symbols, used to concatenate multiple sequences without mixing
///   their suffixes together.
///
/// # Example
///
/// ```
/// use bio::data_structures::suffix_array::suffix_array;
/// let text = b"GCCTTAACATTATTACGCCTA$";
/// let pos = suffix_array(text);
/// assert_eq!(
///     pos,
///     vec![21, 20, 5, 6, 14, 11, 8, 7, 17, 1, 15, 18, 2, 16, 0, 19, 4, 13, 10, 3, 12, 9]
/// );
/// ```
pub fn suffix_array(text: &[u8]) -> RawSuffixArray {
    let n = text.len();
    let alphabet = Alphabet::new(text);
    let sentinel_count = sentinel_count(text);
    let mut sais = Sais::new(n);

    match alphabet.len() + sentinel_count {
        a if a <= u8::MAX as usize => {
            sais.construct(&transform_text::<u8>(text, &alphabet, sentinel_count))
        }
        a if a <= u16::MAX as usize => {
            sais.construct(&transform_text::<u16>(text, &alphabet, sentinel_count))
        }
        a if a <= u32::MAX as usize => {
            sais.construct(&transform_text::<u32>(text, &alphabet, sentinel_count))
        }
        _ => sais.construct(&transform_text::<u64>(text, &alphabet, sentinel_count)),
    }

    sais.pos
}

/// Construct suffix array for given text from integer alphabet.
/// Complexity: O(n).
///
/// # Arguments
///
/// * `text` - the text, ended by sentinel symbol (being lexicographically smallest).
///   All symbols, from lexicographically smallest to largest, need to be present in the text,
///   otherwise the SAIS algorithm panics with an 'index out of bounds' error.
///   The text may also contain multiple sentinel symbols, used to concatenate
///   multiple sequences without mixing their suffixes together.
///
/// # Example
///
/// ```
/// use bio::data_structures::suffix_array::suffix_array_int;
/// let text: Vec<usize> = vec![3, 2, 2, 4, 4, 1, 2, 1, 0];
/// let sa = suffix_array_int(&text);
/// assert_eq!(sa, vec![8, 7, 5, 6, 1, 2, 0, 4, 3]);
/// ```
pub fn suffix_array_int<T>(text: &[T]) -> RawSuffixArray
where
    T: Integer + Unsigned + NumCast + Copy + Debug,
{
    let mut sais = Sais::new(text.len());
    sais.construct(&text);
    sais.pos
}

/// Construct lcp array for given text and suffix array of length n.
/// Complexity: O(n).
///
/// # Arguments
///
/// * `text` - the text ended by sentinel symbol (being lexicographically smallest)
/// * `pos` - the suffix array for the text
///
/// # Example
///
/// ```
/// use bio::data_structures::suffix_array::{lcp, suffix_array};
/// let text = b"GCCTTAACATTATTACGCCTA$";
/// let pos = suffix_array(text);
///
/// // obtain compressed LCP array
/// let lcp = lcp(text, &pos);
///
/// // get most values in O(1).
/// assert_eq!(lcp.get(6).unwrap(), 4);
///
/// // obtain uncompressed LCP array.
/// let uncompressed = lcp.decompress();
/// assert_eq!(
///     uncompressed,
///     [-1, 0, 1, 1, 2, 1, 4, 0, 1, 3, 1, 1, 2, 0, 4, 0, 2, 2, 2, 1, 3, 3, -1]
/// )
/// ```
pub fn lcp<SA: Deref<Target = RawSuffixArray>>(text: &[u8], pos: SA) -> LCPArray {
    assert_eq!(text.len(), pos.len());
    let n = text.len();

    // provide the lexicographical rank for each suffix
    let mut rank: Vec<usize> = iter::repeat(0).take(n).collect();
    for (r, p) in pos.iter().enumerate() {
        rank[*p] = r;
    }

    let mut lcp = SmallInts::from_elem(-1, n + 1);
    let mut l = 0usize;
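    // Kasai's algorithm: iterate over suffixes in text order. When moving from text position p
    // to p + 1, the LCP with the lexicographic predecessor can shrink by at most one,
    // so `l` only has to be decremented by one between iterations.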
    for (p, &r) in rank.iter().enumerate().take(n - 1) {
        // since the sentinel has rank 0 and is excluded above,
        // we will never have a negative index below
        let pred = pos[r - 1];
        while pred + l < n && p + l < n && text[p + l] == text[pred + l] {
            l += 1;
        }
        lcp.set(r, l as isize);
        l = if l > 0 { l - 1 } else { 0 };
    }

    lcp
}

/// Calculate all locally shortest unique substrings from a given suffix and lcp array
/// (Ohlebusch (2013). "Bioinformatics Algorithms". ISBN 978-3-00-041316-2).
/// Complexity: O(n)
///
/// # Arguments
///
/// * `pos` - the suffix array
/// * `lcp` - the lcp array
///
/// # Returns
///
/// A vector of the length of the shortest unique substring for each position of the text.
/// Suffixes are excluded. If no unique substring starts at a given position, the entry is `None`.
///
/// # Example
///
/// ```
/// use bio::data_structures::suffix_array::{lcp, shortest_unique_substrings, suffix_array};
/// let text = b"GCTGCTA$";
/// let pos = suffix_array(text);
///
/// // obtain compressed LCP array
/// let lcp = lcp(text, &pos);
///
/// // calculate shortest unique substrings
/// let sus = shortest_unique_substrings(&pos, &lcp);
/// assert_eq!(
///     sus,
///     [
///         Some(4),
///         Some(3),
///         Some(2),
///         Some(4),
///         Some(3),
///         Some(2),
///         Some(1),
///         Some(1)
///     ]
/// );
/// ```
pub fn shortest_unique_substrings<SA: SuffixArray>(pos: &SA, lcp: &LCPArray) -> Vec<Option<usize>> {
    let n = pos.len();
    // Initialize array representing the length of the shortest unique substring starting at position i
    let mut sus = vec![None; n];
    for i in 0..n {
        // The longest common prefixes (LCP) of suffix pos[i] with its predecessor and successor are not unique.
        // Hence, their maximum + 1 is the length of the shortest unique substring starting at pos[i].
        let len = 1 + cmp::max(lcp.get(i).unwrap(), lcp.get(i + 1).unwrap_or(0)) as usize;
        let p = pos.get(i).unwrap();
        // Check if the suffix pos[i] is a prefix of pos[i+1]. In that case, there is no unique substring
        // at this position.
        if n - p >= len {
            sus[p] = Some(len);
        }
    }
    sus
}

/// Return last character of the text (expected to be the sentinel).
fn sentinel(text: &[u8]) -> u8 {
    text[text.len() - 1]
}

/// Count the sentinels occurring in the text given that the last character is the sentinel.
fn sentinel_count(text: &[u8]) -> usize {
    let sentinel = sentinel(text);
    assert!(
        text.iter().all(|&a| a >= sentinel),
        "Expecting extra sentinel symbol being lexicographically smallest at the end of the \
         text."
    );

    text.iter()
        .fold(0, |count, &a| count + (a == sentinel) as usize)
}

/// Transform the given text into integers for usage in `SAIS`.
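/// Each sentinel occurrence is mapped to its own value (`sentinel_count - 1` down to `0`, in
/// order of occurrence) so that the final sentinel remains the smallest symbol, while all other
/// symbols are shifted up by `sentinel_count - 1`. For example (an illustrative sketch of this
/// behavior): `b"AC$GT$"` would be transformed to `[2, 3, 1, 4, 5, 0]`.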
fn transform_text<T: Integer + Unsigned + NumCast + Copy + Debug>(
    text: &[u8],
    alphabet: &Alphabet,
    sentinel_count: usize,
) -> Vec<T> {
    let sentinel = sentinel(text);
    let transform = RankTransform::new(alphabet);
    let offset = sentinel_count - 1;

    let mut transformed: Vec<T> = Vec::with_capacity(text.len());
    let mut s = sentinel_count;
    for &a in text {
        if a == sentinel {
            s -= 1;
            transformed.push(cast(s).unwrap());
        } else {
            transformed
                .push(cast(*(transform.ranks.get(a as usize)).unwrap() as usize + offset).unwrap());
        }
    }

    transformed
}

/// SAIS implementation (see function `suffix_array` for description).
struct Sais {
    pos: Vec<usize>,
    lms_pos: Vec<usize>,
    reduced_text_pos: Vec<usize>,
    bucket_sizes: VecMap<usize>,
    bucket_start: Vec<usize>,
    bucket_end: Vec<usize>,
}

impl Sais {
    /// Create a new instance.
    fn new(n: usize) -> Self {
        Sais {
            pos: Vec::with_capacity(n),
            lms_pos: Vec::with_capacity(n),
            reduced_text_pos: vec![0; n],
            bucket_sizes: VecMap::new(),
            bucket_start: Vec::with_capacity(n),
            bucket_end: Vec::with_capacity(n),
        }
    }

    /// Initialize the bucket start positions (cumulative symbol counts).
    fn init_bucket_start<T: Integer + Unsigned + NumCast + Copy>(&mut self, text: &[T]) {
        self.bucket_sizes.clear();
        self.bucket_start.clear();

        for &c in text {
            if !self.bucket_sizes.contains_key(cast(c).unwrap()) {
                self.bucket_sizes.insert(cast(c).unwrap(), 0);
            }
            *(self.bucket_sizes.get_mut(cast(c).unwrap()).unwrap()) += 1;
        }

        let mut sum = 0;
        for &size in self.bucket_sizes.values() {
            self.bucket_start.push(sum);
            sum += size;
        }
    }

    /// Initialize pointers to the last element of the buckets.
    fn init_bucket_end<T: Integer + Unsigned + NumCast + Copy>(&mut self, text: &[T]) {
        self.bucket_end.clear();
        for &r in &self.bucket_start[1..] {
            self.bucket_end.push(r - 1);
        }
        self.bucket_end.push(text.len() - 1);
    }

    /// Check if two LMS substrings are equal.
    fn lms_substring_eq<T: Integer + Unsigned + NumCast + Copy>(
        &self,
        text: &[T],
        pos_types: &PosTypes,
        i: usize,
        j: usize,
    ) -> bool {
        for k in 0.. {
            let lmsi = pos_types.is_lms_pos(i + k);
            let lmsj = pos_types.is_lms_pos(j + k);
            if text[i + k] != text[j + k] {
                // different symbols
                return false;
            }
            if lmsi != lmsj {
                // different length
                return false;
            }
            if k > 0 && lmsi && lmsj {
                // same symbols and same length
                return true;
            }
        }
        false
    }

    /// Sort LMS suffixes.
    fn sort_lms_suffixes<
        T: Integer + Unsigned + NumCast + Copy + Debug,
        S: Integer + Unsigned + NumCast + Copy + Debug,
    >(
        &mut self,
        text: &[T],
        pos_types: &PosTypes,
        lms_substring_count: usize,
    ) {
        // if less than 2 LMS substrings are present, no further sorting is needed
        if lms_substring_count > 1 {
            // sort LMS suffixes by recursively building SA on reduced text
            let mut reduced_text: Vec<S> = vec![cast(0).unwrap(); lms_substring_count];
            let mut label = 0;
            reduced_text[self.reduced_text_pos[self.pos[0]]] = cast(label).unwrap();
            let mut prev = None;
            for &p in &self.pos {
                if pos_types.is_lms_pos(p) {
                    // choose same label if substrings are equal
                    if prev.is_some() && !self.lms_substring_eq(text, pos_types, prev.unwrap(), p) {
                        label += 1;
                    }
                    reduced_text[self.reduced_text_pos[p]] = cast(label).unwrap();
                    prev = Some(p);
                }
            }

            // if we have fewer labels than substrings, we have to sort by recursion
            // because two or more substrings are equal
            if label + 1 < lms_substring_count {
                // backup lms_pos
                let lms_pos = self.lms_pos.clone();
                // recurse SA construction for reduced text
                self.construct(&reduced_text);
                // obtain sorted lms suffixes
                self.lms_pos.clear();
                for &p in &self.pos {
                    self.lms_pos.push(lms_pos[p]);
                }
            } else {
                // otherwise, lms_pos is updated with the sorted suffixes from pos
                // obtain sorted lms suffixes
                self.lms_pos.clear();
                for &p in &self.pos {
                    if pos_types.is_lms_pos(p) {
                        self.lms_pos.push(p);
                    }
                }
            }
        }
    }

    /// Construct the suffix array.
    fn construct<T: Integer + Unsigned + NumCast + Copy + Debug>(&mut self, text: &[T]) {
        let pos_types = PosTypes::new(text);
        self.calc_lms_pos(text, &pos_types);
        self.calc_pos(text, &pos_types);
    }

    /// Step 1 of the SAIS algorithm.
    fn calc_lms_pos<T: Integer + Unsigned + NumCast + Copy + Debug>(
        &mut self,
        text: &[T],
        pos_types: &PosTypes,
    ) {
        let n = text.len();

        // collect LMS positions
        self.lms_pos.clear();
        let mut i = 0;
        for r in 0..n {
            if pos_types.is_lms_pos(r) {
                self.lms_pos.push(r);
                self.reduced_text_pos[r] = i;
                i += 1;
            }
        }

        // sort LMS substrings by applying step 2 with unsorted LMS positions
        self.calc_pos(text, pos_types);

        let lms_substring_count = self.lms_pos.len();

        if lms_substring_count <= u8::MAX as usize {
            self.sort_lms_suffixes::<T, u8>(text, pos_types, lms_substring_count);
        } else if lms_substring_count <= u16::MAX as usize {
            self.sort_lms_suffixes::<T, u16>(text, pos_types, lms_substring_count);
        } else if lms_substring_count <= u32::MAX as usize {
            self.sort_lms_suffixes::<T, u32>(text, pos_types, lms_substring_count);
        } else {
            self.sort_lms_suffixes::<T, u64>(text, pos_types, lms_substring_count);
        }
    }

    /// Step 2 of the SAIS algorithm.
    fn calc_pos<T: Integer + Unsigned + NumCast + Copy>(
        &mut self,
        text: &[T],
        pos_types: &PosTypes,
    ) {
        let n = text.len();
        self.pos.clear();

        self.init_bucket_start(text);
        self.init_bucket_end(text);

        // init all positions as unknown (n-1 is max position)
        self.pos.resize(n, n);

        // insert LMS positions to the end of their buckets
        for &p in self.lms_pos.iter().rev() {
            let c: usize = cast(text[p]).unwrap();
            self.pos[self.bucket_end[c]] = p;
            // wrapping subtraction: the final decrement may underflow, but that value is never used again
            self.bucket_end[c] = self.bucket_end[c].wrapping_sub(1);
        }

        // reset bucket ends
        self.init_bucket_end(text);

        // insert L-positions into buckets
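        // Scan left to right: the order of the L-positions within a bucket is induced from the
        // order in which their successor suffixes (the entries pos[r]) are encountered.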
        for r in 0..n {
            let p = self.pos[r];
            // ignore undefined positions and the zero since it has no predecessor
            if p == n || p == 0 {
                continue;
            }
            let pred = p - 1;
            if pos_types.is_l_pos(pred) {
                let c: usize = cast(text[pred]).unwrap();
                self.pos[self.bucket_start[c]] = pred;
                self.bucket_start[c] += 1;
            }
        }

        // insert S-positions into buckets
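        // Scan right to left: S-positions are filled in from the ends of their buckets,
        // their order again induced from the entries that are already in place.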
        for r in (0..n).rev() {
            let p = self.pos[r];
            if p == 0 {
                continue;
            }
            let pred = p - 1;
            if pos_types.is_s_pos(pred) {
                let c: usize = cast(text[pred]).unwrap();
                self.pos[self.bucket_end[c]] = pred;
                // wrapping subtraction: the final decrement may underflow, but that value is never used again
                self.bucket_end[c] = self.bucket_end[c].wrapping_sub(1);
            }
        }
    }
}

/// Position types (L or S).
#[derive(Debug)]
struct PosTypes {
    pos_types: BitVec,
}

impl PosTypes {
    /// Calculate the text position type.
    /// L-type marks suffixes being lexicographically larger than their successor,
    /// S-type marks the others.
    /// This function fills a BitVec, with 1-bits denoting S-type
    /// and 0-bits denoting L-type.
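    /// For example (an illustration, assuming the rank-transformed text of `GTGTG$`):
    /// the bits at positions 0 through 5 would be `1 0 1 0 0 1`, i.e. positions 0, 2 and 5
    /// are S-type and the remaining positions are L-type.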
    ///
    /// # Arguments
    ///
    /// * `text` - the text, ending with a sentinel.
    fn new<T: Integer + Unsigned + NumCast + Copy>(text: &[T]) -> Self {
        let n = text.len();
        let mut pos_types = BitVec::new_fill(false, n as u64);
        pos_types.set_bit(n as u64 - 1, true);

        for p in (0..n - 1).rev() {
            if text[p] == text[p + 1] {
                // if the characters are equal, the next position determines
                // the lexicographical order
                let v = pos_types.get_bit(p as u64 + 1);
                pos_types.set_bit(p as u64, v);
            } else {
                pos_types.set_bit(p as u64, text[p] < text[p + 1]);
            }
        }

        PosTypes { pos_types }
    }

    /// Check if p is S-position.
    fn is_s_pos(&self, p: usize) -> bool {
        self.pos_types.get_bit(p as u64)
    }

    /// Check if p is L-position.
    fn is_l_pos(&self, p: usize) -> bool {
        !self.pos_types.get_bit(p as u64)
    }

    /// Check if p is LMS-position.
    fn is_lms_pos(&self, p: usize) -> bool {
        p != 0 && self.is_s_pos(p) && self.is_l_pos(p - 1)
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use super::{transform_text, PosTypes, Sais};
    use crate::alphabets::{dna, Alphabet};
    use crate::data_structures::bwt::{bwt, less};
    use bv::{BitVec, BitsPush};
    use rand;
    use rand::prelude::*;
    use std::str;

    #[test]
    fn test_pos_types() {
        let orig_text = b"GCCTTAACATTATTACGCCTA$";
        let alphabet = Alphabet::new(orig_text);
        let text: Vec<u8> = transform_text(orig_text, &alphabet, 1);
        let n = text.len();

        let pos_types = PosTypes::new(&text);
        //let mut test = BitSlice::from_slice(&[0b01100110, 0b10010011, 0b01100100]).to_owned();
        let mut test = BitVec::new();
        test.push_block(0b001001101100100101100110);
        test.truncate(n as u64);
        assert_eq!(pos_types.pos_types, test);
        let lms_pos: Vec<usize> = (0..n).filter(|&p| pos_types.is_lms_pos(p)).collect();
        assert_eq!(lms_pos, vec![1, 5, 8, 11, 14, 17, 21]);
    }

    #[test]
    fn test_buckets() {
        let orig_text = b"GCCTTAACATTATTACGCCTA$";
        let alphabet = Alphabet::new(orig_text);
        let text: Vec<u8> = transform_text(orig_text, &alphabet, 1);
        let n = text.len();

        let mut sais = Sais::new(n);
        sais.init_bucket_start(&text);
        assert_eq!(sais.bucket_start, vec![0, 1, 7, 13, 15]);
        sais.init_bucket_end(&text);
        assert_eq!(sais.bucket_end, vec![0, 6, 12, 14, 21]);
    }

    #[test]
    fn test_pos() {
        let orig_text = b"GCCTTAACATTATTACGCCTA$";
        let alphabet = Alphabet::new(orig_text);
        let text: Vec<u8> = transform_text(orig_text, &alphabet, 1);
        let n = text.len();

        let mut sais = Sais::new(n);
        let pos_types = PosTypes::new(&text);
        sais.lms_pos = vec![21, 5, 14, 8, 11, 17, 1];
        sais.calc_pos(&text, &pos_types);
        assert_eq!(
            sais.pos,
            vec![21, 20, 5, 6, 14, 11, 8, 7, 17, 1, 15, 18, 2, 16, 0, 19, 4, 13, 10, 3, 12, 9,]
        );
    }

    #[test]
    fn test_lms_pos() {
        let orig_text = b"GCCTTAACATTATTACGCCTA$";
        let alphabet = Alphabet::new(orig_text);
        let text: Vec<u8> = transform_text(orig_text, &alphabet, 1);
        let n = text.len();

        let mut sais = Sais::new(n);
        let pos_types = PosTypes::new(&text);
        sais.calc_lms_pos(&text, &pos_types);
    }

    #[test]
    fn test_issue10_1() {
        let text = b"TGTGTGTGTG$";
        let pos = suffix_array(text);
        assert_eq!(pos, [10, 9, 7, 5, 3, 1, 8, 6, 4, 2, 0]);
    }

    #[test]
    fn test_issue10_2() {
        let text = b"TGTGTGTG$";
        let pos = suffix_array(text);
        assert_eq!(pos, [8, 7, 5, 3, 1, 6, 4, 2, 0]);
    }

    #[test]
    fn test_handles_sentinels_properly() {
        let reads = b"TACTCCGCTAGGGACACCTAAATAGATACTCGCAAAGGCGACTGATATATCCTTAGGTCGAAGAGATACCAGAGAAATAGTAGGTCTTAGGCTAGTCCTT$AAGGACTAGCCTAAGACCTACTATTTCTCTGGTATCTCTTCGACCTAAGGATATATCAGTCGCCTTTGCGAGTATCTATTTAGGTGTCCCTAGCGGAGTA$TAGGGACACCTAAATAGATACTCGCAAAGGCGACTGATATATCCTTAGGTCGAAGAGATACCAGAGAAATAGTAGGTCTTAGGCTAGTCCTTGTCCAGTA$TACTGGACAAGGACTAGCCTAAGACCTACTATTTCTCTGGTATCTCTTCGACCTAAGGATATATCAGTCGCCTTTGCGAGTATCTATTTAGGTGTCCCTA$ACGCACCCCGGCATTCGTCGACTCTACACTTAGTGGAACATACAAATTCGCTCGCAGGAGCGCCTCATACATTCTAACGCAGTGATCTTCGGCTGAGACT$AGTCTCAGCCGAAGATCACTGCGTTAGAATGTATGAGGCGCTCCTGCGAGCGAATTTGTATGTTCCACTAAGTGTAGAGTCGACGAATGCCGGGGTGCGT$";
        suffix_array(reads);
    }

    fn str_from_pos(sa: &[usize], text: &[u8], index: usize) -> String {
        String::from(
            str::from_utf8(&text[sa[index]..])
                .unwrap()
                .split('$')
                .next()
                .unwrap_or(""),
        ) + "$"
    }

    fn rand_seqs(num_seqs: usize, seq_len: usize) -> Vec<u8> {
        let mut rng = rand::rng();
        let alpha = [b'A', b'T', b'C', b'G', b'N'];
        let seqs = (0..num_seqs)
            .map(|_| {
                let len = rng.random_range((seq_len / 2)..=seq_len);
                (0..len)
                    .map(|_| *alpha.choose(&mut rng).unwrap())
                    .collect::<Vec<u8>>()
            })
            .collect::<Vec<_>>();
        let mut res = seqs.join(&b'$');
        res.push(b'$');
        res
    }

    #[test]
    fn test_sorts_lexically() {
        let mut test_cases = vec![(&b"A$C$G$T$"[..], "simple"),
             (&b"A$A$T$T$"[..], "duplicates"),
             (&b"AA$GA$CA$TA$TC$TG$GT$GC$"[..], "two letter"),
             (&b"AGCCAT$CAGCC$"[..], "substring"),
             (&b"GTAGGCCTAATTATAATCAGCGGACATTTCGTATTGCTCGGGCTGCCAGGATTTTAGCATCAGTAGCCGGGTAATGGAACCTCAAGAGGTCAGCGTCGAA$\
                AATCAGCGGACATTTCGTATTGCTCGGGCTGCCAGGATTTTAGCATCAGTAGCCGGGTAATGGAACCTCAAGAGGTCAGCGTCGAATGGCTATTCCAATA$"[..],
                "complex"),
             (&b"GTAGGCCTAATTATAATCAGCGGACATTTCGTATTGCTCGGGCTGCCAGGATTTTAGCATCAGTAGCCGGGTAATGGAACCTCAAGAGGTCAGCGTCGAA$\
                TTCGACGCTGACCTCTTGAGGTTCCATTACCCGGCTACTGATGCTAAAATCCTGGCAGCCCGAGCAATACGAAATGTCCGCTGATTATAATTAGGCCTAC$\
                AATCAGCGGACATTTCGTATTGCTCGGGCTGCCAGGATTTTAGCATCAGTAGCCGGGTAATGGAACCTCAAGAGGTCAGCGTCGAATGGCTATTCCAATA$\
                TATTGGAATAGCCATTCGACGCTGACCTCTTGAGGTTCCATTACCCGGCTACTGATGCTAAAATCCTGGCAGCCCGAGCAATACGAAATGTCCGCTGATT$"[..],
                "complex with revcomps"),
             ];
        let num_rand = 100;
        let rand_cases: Vec<_> = (0..num_rand).map(|i| rand_seqs(10, i * 10)).collect();
        test_cases.extend(
            rand_cases
                .iter()
                .map(|case| (case.as_ref(), "rand test case")),
        );

        for &(text, test_name) in &test_cases {
            let pos = suffix_array(text);
            for i in 0..(pos.len() - 2) {
                // Check that every element in the suffix array is lexically <= the next elem
                let cur = str_from_pos(&pos, text, i);
                let next = str_from_pos(&pos, text, i + 1);

                assert!(
                    cur <= next,
                    "Failed:\n{}\n{}\nat positions {} and {} are out of order in \
                         test: {}",
                    cur,
                    next,
                    pos[i],
                    pos[i + 1],
                    test_name
                );
            }
        }
    }

    #[test]
    fn test_sampled_matches() {
        let mut test_cases = vec![(&b"A$C$G$T$"[..], "simple"),
             (&b"A$A$T$T$"[..], "duplicates"),
             (&b"AA$GA$CA$TA$TC$TG$GT$GC$"[..], "two letter"),
             (&b"AGCCAT$\
                CAGCC$"[..],
                "substring"),
             (&b"GTAGGCCTAATTATAATCAGCGGACATTTCGTATTGCTCGGGCTGCCAGGATTTTAGCATCAGTAGCCGGGTAATGGAACCTCAAGAGGTCAGCGTCGAA$\
                AATCAGCGGACATTTCGTATTGCTCGGGCTGCCAGGATTTTAGCATCAGTAGCCGGGTAATGGAACCTCAAGAGGTCAGCGTCGAATGGCTATTCCAATA$"[..],
                "complex"),
             (&b"GTAGGCCTAATTATAATCAGCGGACATTTCGTATTGCTCGGGCTGCCAGGATTTTAGCATCAGTAGCCGGGTAATGGAACCTCAAGAGGTCAGCGTCGAA$\
                TTCGACGCTGACCTCTTGAGGTTCCATTACCCGGCTACTGATGCTAAAATCCTGGCAGCCCGAGCAATACGAAATGTCCGCTGATTATAATTAGGCCTAC$\
                AATCAGCGGACATTTCGTATTGCTCGGGCTGCCAGGATTTTAGCATCAGTAGCCGGGTAATGGAACCTCAAGAGGTCAGCGTCGAATGGCTATTCCAATA$\
                TATTGGAATAGCCATTCGACGCTGACCTCTTGAGGTTCCATTACCCGGCTACTGATGCTAAAATCCTGGCAGCCCGAGCAATACGAAATGTCCGCTGATT$"[..],
                "complex with revcomps"),
             (&b"GTAG$GCCTAAT$TATAATCAG$"[..], "issue70"),
             (&b"TGTGTGTGTG$"[..], "repeating"),
             (&b"TACTCCGCTAGGGACACCTAAATAGATACTCGCAAAGGCGACTGATATATCCTTAGGTCGAAGAGATACCAGAGAAATAGTAGGTCTTAGGCTAGTCCTT$AAGGACTAGCCTAAGACCTACTATTTCTCTGGTATCTCTTCGACCTAAGGATATATCAGTCGCCTTTGCGAGTATCTATTTAGGTGTCCCTAGCGGAGTA$TAGGGACACCTAAATAGATACTCGCAAAGGCGACTGATATATCCTTAGGTCGAAGAGATACCAGAGAAATAGTAGGTCTTAGGCTAGTCCTTGTCCAGTA$TACTGGACAAGGACTAGCCTAAGACCTACTATTTCTCTGGTATCTCTTCGACCTAAGGATATATCAGTCGCCTTTGCGAGTATCTATTTAGGTGTCCCTA$ACGCACCCCGGCATTCGTCGACTCTACACTTAGTGGAACATACAAATTCGCTCGCAGGAGCGCCTCATACATTCTAACGCAGTGATCTTCGGCTGAGACT$AGTCTCAGCCGAAGATCACTGCGTTAGAATGTATGAGGCGCTCCTGCGAGCGAATTTGTATGTTCCACTAAGTGTAGAGTCGACGAATGCCGGGGTGCGT$"[..], "complex sentinels"),
             ];
        let num_rand = 100;
        let rand_cases: Vec<_> = (0..num_rand).map(|i| rand_seqs(10, i * 10)).collect();
        test_cases.extend(
            rand_cases
                .iter()
                .map(|case| (case.as_ref(), "rand test case")),
        );

        for &(text, test_name) in test_cases.iter() {
            for &sample_rate in &[2, 3, 5, 16] {
                let alphabet = dna::n_alphabet();
                let sa = suffix_array(text);
                let bwt = bwt(text, &sa);
                let less = less(&bwt, &alphabet);
                let occ = Occ::new(&bwt, 3, &alphabet);
                let sampled = sa.sample(text, &bwt, &less, &occ, sample_rate);

                for i in 0..sa.len() {
                    let sa_idx = sa.get(i).unwrap();
                    let sampled_idx = sampled.get(i).unwrap();
                    assert_eq!(
                        sa_idx,
                        sampled_idx,
                        "Failed:\n{}\n{}\nat index {} do not match in test: {} (sample rate: {})",
                        str::from_utf8(&text[sa_idx..]).unwrap(),
                        str::from_utf8(&text[sampled_idx..]).unwrap(),
                        i,
                        test_name,
                        sample_rate
                    );
                }
            }
        }
    }

    #[test]
    fn can_construct_sa_for_usize() {
        let text: Vec<usize> = vec![3, 2, 2, 4, 4, 1, 2, 1, 0];
        let sa = suffix_array_int(&text);
        assert_eq!(sa, vec![8, 7, 5, 6, 1, 2, 0, 4, 3]);
    }
    }
}
971}