1use std::borrow::Borrow;
25use std::cmp;
26use std::collections::HashMap;
27use std::fmt::Debug;
28use std::hash::BuildHasherDefault;
29use std::iter;
30use std::ops::Deref;
31
32use num_integer::Integer;
33use num_traits::{cast, NumCast, Unsigned};
34
35use bv::{BitVec, Bits, BitsMut};
36use vec_map::VecMap;
37
38use fxhash::FxHasher;
39
40use crate::alphabets::{Alphabet, RankTransform};
41use crate::data_structures::bwt::{Less, Occ, BWT};
42use crate::data_structures::smallints::SmallInts;
43
44pub type LCPArray = SmallInts<i8, isize>;
45pub type RawSuffixArray = Vec<usize>;
46pub type RawSuffixArraySlice<'a> = &'a [usize];
47
48type HashMapFx<K, V> = HashMap<K, V, BuildHasherDefault<FxHasher>>;
49
50pub trait SuffixArray {
52 fn get(&self, index: usize) -> Option<usize>;
53 fn len(&self) -> usize;
54 fn is_empty(&self) -> bool;
55
56 fn sample<DBWT: Borrow<BWT>, DLess: Borrow<Less>, DOcc: Borrow<Occ>>(
86 &self,
87 text: &[u8],
88 bwt: DBWT,
89 less: DLess,
90 occ: DOcc,
91 sampling_rate: usize,
92 ) -> SampledSuffixArray<DBWT, DLess, DOcc> {
93 let mut sample =
94 Vec::with_capacity((self.len() as f32 / sampling_rate as f32).ceil() as usize);
95 let mut extra_rows = HashMapFx::default();
96 let sentinel = sentinel(text);
97
98 for i in 0..self.len() {
99 let idx = self.get(i).unwrap();
100 if (i % sampling_rate) == 0 {
101 sample.push(idx);
102 } else if bwt.borrow()[i] == sentinel {
103 extra_rows.insert(i, idx);
107 }
108 }
109
110 SampledSuffixArray {
111 bwt,
112 less,
113 occ,
114 sample,
115 s: sampling_rate,
116 extra_rows,
117 sentinel,
118 }
119 }
120}
121
122#[derive(Default, Clone, Eq, PartialEq, Debug, Serialize, Deserialize)]
124pub struct SampledSuffixArray<DBWT: Borrow<BWT>, DLess: Borrow<Less>, DOcc: Borrow<Occ>> {
125 bwt: DBWT,
126 less: DLess,
127 occ: DOcc,
128 sample: Vec<usize>,
129 s: usize, extra_rows: HashMapFx<usize, usize>,
131 sentinel: u8,
132}
133
134impl SuffixArray for RawSuffixArray {
135 fn get(&self, index: usize) -> Option<usize> {
136 if index < self.len() {
138 Some(self[index])
139 } else {
140 None
141 }
142 }
143
144 fn len(&self) -> usize {
145 Vec::len(self)
146 }
147
148 fn is_empty(&self) -> bool {
149 Vec::is_empty(self)
150 }
151}
152
153impl<DBWT: Borrow<BWT>, DLess: Borrow<Less>, DOcc: Borrow<Occ>> SuffixArray
154 for SampledSuffixArray<DBWT, DLess, DOcc>
155{
156 fn get(&self, index: usize) -> Option<usize> {
157 if index < self.len() {
158 let mut pos = index;
159 let mut offset = 0;
160 loop {
161 if pos % self.s == 0 {
162 return Some(self.sample[pos / self.s] + offset);
163 }
164
165 let c = self.bwt.borrow()[pos];
166
167 if c == self.sentinel {
168 return Some(self.extra_rows[&pos] + offset);
174 }
175
176 pos = self.less.borrow()[c as usize]
177 + self.occ.borrow().get(self.bwt.borrow(), pos - 1, c);
178 offset += 1;
179 }
180 } else {
181 None
182 }
183 }
184
185 fn len(&self) -> usize {
186 self.bwt.borrow().len()
187 }
188
189 fn is_empty(&self) -> bool {
190 self.bwt.borrow().is_empty()
191 }
192}
193
194impl<DBWT: Borrow<BWT>, DLess: Borrow<Less>, DOcc: Borrow<Occ>>
195 SampledSuffixArray<DBWT, DLess, DOcc>
196{
197 pub fn sampling_rate(&self) -> usize {
199 self.s
200 }
201
202 pub fn bwt(&self) -> &BWT {
204 self.bwt.borrow()
205 }
206
207 pub fn less(&self) -> &Less {
209 self.less.borrow()
210 }
211
212 pub fn occ(&self) -> &Occ {
214 self.occ.borrow()
215 }
216}
217
218pub fn suffix_array(text: &[u8]) -> RawSuffixArray {
264 let n = text.len();
265 let alphabet = Alphabet::new(text);
266 let sentinel_count = sentinel_count(text);
267 let mut sais = Sais::new(n);
268
269 match alphabet.len() + sentinel_count {
270 a if a <= u8::MAX as usize => {
271 sais.construct(&transform_text::<u8>(text, &alphabet, sentinel_count))
272 }
273 a if a <= u16::MAX as usize => {
274 sais.construct(&transform_text::<u16>(text, &alphabet, sentinel_count))
275 }
276 a if a <= u32::MAX as usize => {
277 sais.construct(&transform_text::<u32>(text, &alphabet, sentinel_count))
278 }
279 _ => sais.construct(&transform_text::<u64>(text, &alphabet, sentinel_count)),
280 }
281
282 sais.pos
283}
284
285pub fn suffix_array_int<T>(text: &[T]) -> RawSuffixArray
304where
305 T: Integer + Unsigned + NumCast + Copy + Debug,
306{
307 let mut sais = Sais::new(text.len());
308 sais.construct(&text);
309 sais.pos
310}
311
312pub fn lcp<SA: Deref<Target = RawSuffixArray>>(text: &[u8], pos: SA) -> LCPArray {
341 assert_eq!(text.len(), pos.len());
342 let n = text.len();
343
344 let mut rank: Vec<usize> = iter::repeat(0).take(n).collect();
346 for (r, p) in pos.iter().enumerate() {
347 rank[*p] = r;
348 }
349
350 let mut lcp = SmallInts::from_elem(-1, n + 1);
351 let mut l = 0usize;
352 for (p, &r) in rank.iter().enumerate().take(n - 1) {
353 let pred = pos[r - 1];
356 while pred + l < n && p + l < n && text[p + l] == text[pred + l] {
357 l += 1;
358 }
359 lcp.set(r, l as isize);
360 l = if l > 0 { l - 1 } else { 0 };
361 }
362
363 lcp
364}
365
366pub fn shortest_unique_substrings<SA: SuffixArray>(pos: &SA, lcp: &LCPArray) -> Vec<Option<usize>> {
407 let n = pos.len();
408 let mut sus = vec![None; n];
410 for i in 0..n {
411 let len = 1 + cmp::max(lcp.get(i).unwrap(), lcp.get(i + 1).unwrap_or(0)) as usize;
414 let p = pos.get(i).unwrap();
415 if n - p >= len {
418 sus[p] = Some(len);
419 }
420 }
421 sus
422}
423
424fn sentinel(text: &[u8]) -> u8 {
426 text[text.len() - 1]
427}
428
429fn sentinel_count(text: &[u8]) -> usize {
431 let sentinel = sentinel(text);
432 assert!(
433 text.iter().all(|&a| a >= sentinel),
434 "Expecting extra sentinel symbol being lexicographically smallest at the end of the \
435 text."
436 );
437
438 text.iter()
439 .fold(0, |count, &a| count + (a == sentinel) as usize)
440}
441
442fn transform_text<T: Integer + Unsigned + NumCast + Copy + Debug>(
444 text: &[u8],
445 alphabet: &Alphabet,
446 sentinel_count: usize,
447) -> Vec<T> {
448 let sentinel = sentinel(text);
449 let transform = RankTransform::new(alphabet);
450 let offset = sentinel_count - 1;
451
452 let mut transformed: Vec<T> = Vec::with_capacity(text.len());
453 let mut s = sentinel_count;
454 for &a in text {
455 if a == sentinel {
456 s -= 1;
457 transformed.push(cast(s).unwrap());
458 } else {
459 transformed
460 .push(cast(*(transform.ranks.get(a as usize)).unwrap() as usize + offset).unwrap());
461 }
462 }
463
464 transformed
465}
466
467struct Sais {
469 pos: Vec<usize>,
470 lms_pos: Vec<usize>,
471 reduced_text_pos: Vec<usize>,
472 bucket_sizes: VecMap<usize>,
473 bucket_start: Vec<usize>,
474 bucket_end: Vec<usize>,
475}
476
477impl Sais {
478 fn new(n: usize) -> Self {
480 Sais {
481 pos: Vec::with_capacity(n),
482 lms_pos: Vec::with_capacity(n),
483 reduced_text_pos: vec![0; n],
484 bucket_sizes: VecMap::new(),
485 bucket_start: Vec::with_capacity(n),
486 bucket_end: Vec::with_capacity(n),
487 }
488 }
489
490 fn init_bucket_start<T: Integer + Unsigned + NumCast + Copy>(&mut self, text: &[T]) {
492 self.bucket_sizes.clear();
493 self.bucket_start.clear();
494
495 for &c in text {
496 if !self.bucket_sizes.contains_key(cast(c).unwrap()) {
497 self.bucket_sizes.insert(cast(c).unwrap(), 0);
498 }
499 *(self.bucket_sizes.get_mut(cast(c).unwrap()).unwrap()) += 1;
500 }
501
502 let mut sum = 0;
503 for &size in self.bucket_sizes.values() {
504 self.bucket_start.push(sum);
505 sum += size;
506 }
507 }
508
509 fn init_bucket_end<T: Integer + Unsigned + NumCast + Copy>(&mut self, text: &[T]) {
511 self.bucket_end.clear();
512 for &r in &self.bucket_start[1..] {
513 self.bucket_end.push(r - 1);
514 }
515 self.bucket_end.push(text.len() - 1);
516 }
517
518 fn lms_substring_eq<T: Integer + Unsigned + NumCast + Copy>(
520 &self,
521 text: &[T],
522 pos_types: &PosTypes,
523 i: usize,
524 j: usize,
525 ) -> bool {
526 for k in 0.. {
527 let lmsi = pos_types.is_lms_pos(i + k);
528 let lmsj = pos_types.is_lms_pos(j + k);
529 if text[i + k] != text[j + k] {
530 return false;
532 }
533 if lmsi != lmsj {
534 return false;
536 }
537 if k > 0 && lmsi && lmsj {
538 return true;
540 }
541 }
542 false
543 }
544
545 fn sort_lms_suffixes<
547 T: Integer + Unsigned + NumCast + Copy + Debug,
548 S: Integer + Unsigned + NumCast + Copy + Debug,
549 >(
550 &mut self,
551 text: &[T],
552 pos_types: &PosTypes,
553 lms_substring_count: usize,
554 ) {
555 if lms_substring_count > 1 {
557 let mut reduced_text: Vec<S> = vec![cast(0).unwrap(); lms_substring_count];
559 let mut label = 0;
560 reduced_text[self.reduced_text_pos[self.pos[0]]] = cast(label).unwrap();
561 let mut prev = None;
562 for &p in &self.pos {
563 if pos_types.is_lms_pos(p) {
564 if prev.is_some() && !self.lms_substring_eq(text, pos_types, prev.unwrap(), p) {
566 label += 1;
567 }
568 reduced_text[self.reduced_text_pos[p]] = cast(label).unwrap();
569 prev = Some(p);
570 }
571 }
572
573 if label + 1 < lms_substring_count {
576 let lms_pos = self.lms_pos.clone();
578 self.construct(&reduced_text);
580 self.lms_pos.clear();
582 for &p in &self.pos {
583 self.lms_pos.push(lms_pos[p]);
584 }
585 } else {
586 self.lms_pos.clear();
589 for &p in &self.pos {
590 if pos_types.is_lms_pos(p) {
591 self.lms_pos.push(p);
592 }
593 }
594 }
595 }
596 }
597
598 fn construct<T: Integer + Unsigned + NumCast + Copy + Debug>(&mut self, text: &[T]) {
600 let pos_types = PosTypes::new(text);
601 self.calc_lms_pos(text, &pos_types);
602 self.calc_pos(text, &pos_types);
603 }
604
605 fn calc_lms_pos<T: Integer + Unsigned + NumCast + Copy + Debug>(
607 &mut self,
608 text: &[T],
609 pos_types: &PosTypes,
610 ) {
611 let n = text.len();
612
613 self.lms_pos.clear();
615 let mut i = 0;
616 for r in 0..n {
617 if pos_types.is_lms_pos(r) {
618 self.lms_pos.push(r);
619 self.reduced_text_pos[r] = i;
620 i += 1;
621 }
622 }
623
624 self.calc_pos(text, pos_types);
626
627 let lms_substring_count = self.lms_pos.len();
628
629 if lms_substring_count <= u8::MAX as usize {
630 self.sort_lms_suffixes::<T, u8>(text, pos_types, lms_substring_count);
631 } else if lms_substring_count <= u16::MAX as usize {
632 self.sort_lms_suffixes::<T, u16>(text, pos_types, lms_substring_count);
633 } else if lms_substring_count <= u32::MAX as usize {
634 self.sort_lms_suffixes::<T, u32>(text, pos_types, lms_substring_count);
635 } else {
636 self.sort_lms_suffixes::<T, u64>(text, pos_types, lms_substring_count);
637 }
638 }
639
640 fn calc_pos<T: Integer + Unsigned + NumCast + Copy>(
642 &mut self,
643 text: &[T],
644 pos_types: &PosTypes,
645 ) {
646 let n = text.len();
647 self.pos.clear();
648
649 self.init_bucket_start(text);
650 self.init_bucket_end(text);
651
652 self.pos.resize(n, n);
654
655 for &p in self.lms_pos.iter().rev() {
657 let c: usize = cast(text[p]).unwrap();
658 self.pos[self.bucket_end[c]] = p;
659 self.bucket_end[c] = self.bucket_end[c].wrapping_sub(1);
661 }
662
663 self.init_bucket_end(text);
665
666 for r in 0..n {
668 let p = self.pos[r];
669 if p == n || p == 0 {
671 continue;
672 }
673 let pred = p - 1;
674 if pos_types.is_l_pos(pred) {
675 let c: usize = cast(text[pred]).unwrap();
676 self.pos[self.bucket_start[c]] = pred;
677 self.bucket_start[c] += 1;
678 }
679 }
680
681 for r in (0..n).rev() {
683 let p = self.pos[r];
684 if p == 0 {
685 continue;
686 }
687 let pred = p - 1;
688 if pos_types.is_s_pos(pred) {
689 let c: usize = cast(text[pred]).unwrap();
690 self.pos[self.bucket_end[c]] = pred;
691 self.bucket_end[c] = self.bucket_end[c].wrapping_sub(1);
693 }
694 }
695 }
696}
697
698#[derive(Debug)]
700struct PosTypes {
701 pos_types: BitVec,
702}
703
704impl PosTypes {
705 fn new<T: Integer + Unsigned + NumCast + Copy>(text: &[T]) -> Self {
715 let n = text.len();
716 let mut pos_types = BitVec::new_fill(false, n as u64);
717 pos_types.set_bit(n as u64 - 1, true);
718
719 for p in (0..n - 1).rev() {
720 if text[p] == text[p + 1] {
721 let v = pos_types.get_bit(p as u64 + 1);
724 pos_types.set_bit(p as u64, v);
725 } else {
726 pos_types.set_bit(p as u64, text[p] < text[p + 1]);
727 }
728 }
729
730 PosTypes { pos_types }
731 }
732
733 fn is_s_pos(&self, p: usize) -> bool {
735 self.pos_types.get_bit(p as u64)
736 }
737
738 fn is_l_pos(&self, p: usize) -> bool {
740 !self.pos_types.get_bit(p as u64)
741 }
742
743 fn is_lms_pos(&self, p: usize) -> bool {
745 p != 0 && self.is_s_pos(p) && self.is_l_pos(p - 1)
746 }
747}
748
749#[cfg(test)]
750mod tests {
751 use super::*;
752 use super::{transform_text, PosTypes, Sais};
753 use crate::alphabets::{dna, Alphabet};
754 use crate::data_structures::bwt::{bwt, less};
755 use bv::{BitVec, BitsPush};
756 use rand;
757 use rand::prelude::*;
758 use std::str;
759
760 #[test]
761 fn test_pos_types() {
762 let orig_text = b"GCCTTAACATTATTACGCCTA$";
763 let alphabet = Alphabet::new(orig_text);
764 let text: Vec<u8> = transform_text(orig_text, &alphabet, 1);
765 let n = text.len();
766
767 let pos_types = PosTypes::new(&text);
768 let mut test = BitVec::new();
770 test.push_block(0b001001101100100101100110);
771 test.truncate(n as u64);
772 assert_eq!(pos_types.pos_types, test);
773 let lms_pos: Vec<usize> = (0..n).filter(|&p| pos_types.is_lms_pos(p)).collect();
774 assert_eq!(lms_pos, vec![1, 5, 8, 11, 14, 17, 21]);
775 }
776
777 #[test]
778 fn test_buckets() {
779 let orig_text = b"GCCTTAACATTATTACGCCTA$";
780 let alphabet = Alphabet::new(orig_text);
781 let text: Vec<u8> = transform_text(orig_text, &alphabet, 1);
782 let n = text.len();
783
784 let mut sais = Sais::new(n);
785 sais.init_bucket_start(&text);
786 assert_eq!(sais.bucket_start, vec![0, 1, 7, 13, 15]);
787 sais.init_bucket_end(&text);
788 assert_eq!(sais.bucket_end, vec![0, 6, 12, 14, 21]);
789 }
790
791 #[test]
792 fn test_pos() {
793 let orig_text = b"GCCTTAACATTATTACGCCTA$";
794 let alphabet = Alphabet::new(orig_text);
795 let text: Vec<u8> = transform_text(orig_text, &alphabet, 1);
796 let n = text.len();
797
798 let mut sais = Sais::new(n);
799 let pos_types = PosTypes::new(&text);
800 sais.lms_pos = vec![21, 5, 14, 8, 11, 17, 1];
801 sais.calc_pos(&text, &pos_types);
802 assert_eq!(
803 sais.pos,
804 vec![21, 20, 5, 6, 14, 11, 8, 7, 17, 1, 15, 18, 2, 16, 0, 19, 4, 13, 10, 3, 12, 9,]
805 );
806 }
807
808 #[test]
809 fn test_lms_pos() {
810 let orig_text = b"GCCTTAACATTATTACGCCTA$";
811 let alphabet = Alphabet::new(orig_text);
812 let text: Vec<u8> = transform_text(orig_text, &alphabet, 1);
813 let n = text.len();
814
815 let mut sais = Sais::new(n);
816 let pos_types = PosTypes::new(&text);
817 sais.calc_lms_pos(&text, &pos_types);
818 }
819
820 #[test]
821 fn test_issue10_1() {
822 let text = b"TGTGTGTGTG$";
823 let pos = suffix_array(text);
824 assert_eq!(pos, [10, 9, 7, 5, 3, 1, 8, 6, 4, 2, 0]);
825 }
826
827 #[test]
828 fn test_issue10_2() {
829 let text = b"TGTGTGTG$";
830 let pos = suffix_array(text);
831 assert_eq!(pos, [8, 7, 5, 3, 1, 6, 4, 2, 0]);
832 }
833
834 #[test]
835 fn test_handles_sentinels_properly() {
836 let reads = b"TACTCCGCTAGGGACACCTAAATAGATACTCGCAAAGGCGACTGATATATCCTTAGGTCGAAGAGATACCAGAGAAATAGTAGGTCTTAGGCTAGTCCTT$AAGGACTAGCCTAAGACCTACTATTTCTCTGGTATCTCTTCGACCTAAGGATATATCAGTCGCCTTTGCGAGTATCTATTTAGGTGTCCCTAGCGGAGTA$TAGGGACACCTAAATAGATACTCGCAAAGGCGACTGATATATCCTTAGGTCGAAGAGATACCAGAGAAATAGTAGGTCTTAGGCTAGTCCTTGTCCAGTA$TACTGGACAAGGACTAGCCTAAGACCTACTATTTCTCTGGTATCTCTTCGACCTAAGGATATATCAGTCGCCTTTGCGAGTATCTATTTAGGTGTCCCTA$ACGCACCCCGGCATTCGTCGACTCTACACTTAGTGGAACATACAAATTCGCTCGCAGGAGCGCCTCATACATTCTAACGCAGTGATCTTCGGCTGAGACT$AGTCTCAGCCGAAGATCACTGCGTTAGAATGTATGAGGCGCTCCTGCGAGCGAATTTGTATGTTCCACTAAGTGTAGAGTCGACGAATGCCGGGGTGCGT$";
837 suffix_array(reads);
838 }
839
840 fn str_from_pos(sa: &[usize], text: &[u8], index: usize) -> String {
841 String::from(
842 str::from_utf8(&text[sa[index]..])
843 .unwrap()
844 .split('$')
845 .next()
846 .unwrap_or(""),
847 ) + "$"
848 }
849
850 fn rand_seqs(num_seqs: usize, seq_len: usize) -> Vec<u8> {
851 let mut rng = rand::rng();
852 let alpha = [b'A', b'T', b'C', b'G', b'N'];
853 let seqs = (0..num_seqs)
854 .map(|_| {
855 let len = rng.random_range((seq_len / 2)..=seq_len);
856 (0..len)
857 .map(|_| *alpha.choose(&mut rng).unwrap())
858 .collect::<Vec<u8>>()
859 })
860 .collect::<Vec<_>>();
861 let mut res = seqs.join(&b'$');
862 res.push(b'$');
863 res
864 }
865
866 #[test]
867 fn test_sorts_lexically() {
868 let mut test_cases = vec![(&b"A$C$G$T$"[..], "simple"),
869 (&b"A$A$T$T$"[..], "duplicates"),
870 (&b"AA$GA$CA$TA$TC$TG$GT$GC$"[..], "two letter"),
871 (&b"AGCCAT$CAGCC$"[..], "substring"),
872 (&b"GTAGGCCTAATTATAATCAGCGGACATTTCGTATTGCTCGGGCTGCCAGGATTTTAGCATCAGTAGCCGGGTAATGGAACCTCAAGAGGTCAGCGTCGAA$\
873 AATCAGCGGACATTTCGTATTGCTCGGGCTGCCAGGATTTTAGCATCAGTAGCCGGGTAATGGAACCTCAAGAGGTCAGCGTCGAATGGCTATTCCAATA$"[..],
874 "complex"),
875 (&b"GTAGGCCTAATTATAATCAGCGGACATTTCGTATTGCTCGGGCTGCCAGGATTTTAGCATCAGTAGCCGGGTAATGGAACCTCAAGAGGTCAGCGTCGAA$\
876 TTCGACGCTGACCTCTTGAGGTTCCATTACCCGGCTACTGATGCTAAAATCCTGGCAGCCCGAGCAATACGAAATGTCCGCTGATTATAATTAGGCCTAC$\
877 AATCAGCGGACATTTCGTATTGCTCGGGCTGCCAGGATTTTAGCATCAGTAGCCGGGTAATGGAACCTCAAGAGGTCAGCGTCGAATGGCTATTCCAATA$\
878 TATTGGAATAGCCATTCGACGCTGACCTCTTGAGGTTCCATTACCCGGCTACTGATGCTAAAATCCTGGCAGCCCGAGCAATACGAAATGTCCGCTGATT$"[..],
879 "complex with revcomps"),
880 ];
881 let num_rand = 100;
882 let rand_cases: Vec<_> = (0..num_rand).map(|i| rand_seqs(10, i * 10)).collect();
883 test_cases.extend(
884 rand_cases
885 .iter()
886 .map(|case| (case.as_ref(), "rand test case")),
887 );
888
889 for &(text, test_name) in &test_cases {
890 let pos = suffix_array(text);
891 for i in 0..(pos.len() - 2) {
892 let cur = str_from_pos(&pos, text, i);
894 let next = str_from_pos(&pos, text, i + 1);
895
896 assert!(
897 cur <= next,
898 "Failed:\n{}\n{}\nat positions {} and {} are out of order in \
899 test: {}",
900 cur,
901 next,
902 pos[i],
903 pos[i + 1],
904 test_name
905 );
906 }
907 }
908 }
909
910 #[test]
911 fn test_sampled_matches() {
912 let mut test_cases = vec![(&b"A$C$G$T$"[..], "simple"),
913 (&b"A$A$T$T$"[..], "duplicates"),
914 (&b"AA$GA$CA$TA$TC$TG$GT$GC$"[..], "two letter"),
915 (&b"AGCCAT$\
916 CAGCC$"[..],
917 "substring"),
918 (&b"GTAGGCCTAATTATAATCAGCGGACATTTCGTATTGCTCGGGCTGCCAGGATTTTAGCATCAGTAGCCGGGTAATGGAACCTCAAGAGGTCAGCGTCGAA$\
919 AATCAGCGGACATTTCGTATTGCTCGGGCTGCCAGGATTTTAGCATCAGTAGCCGGGTAATGGAACCTCAAGAGGTCAGCGTCGAATGGCTATTCCAATA$"[..],
920 "complex"),
921 (&b"GTAGGCCTAATTATAATCAGCGGACATTTCGTATTGCTCGGGCTGCCAGGATTTTAGCATCAGTAGCCGGGTAATGGAACCTCAAGAGGTCAGCGTCGAA$\
922 TTCGACGCTGACCTCTTGAGGTTCCATTACCCGGCTACTGATGCTAAAATCCTGGCAGCCCGAGCAATACGAAATGTCCGCTGATTATAATTAGGCCTAC$\
923 AATCAGCGGACATTTCGTATTGCTCGGGCTGCCAGGATTTTAGCATCAGTAGCCGGGTAATGGAACCTCAAGAGGTCAGCGTCGAATGGCTATTCCAATA$\
924 TATTGGAATAGCCATTCGACGCTGACCTCTTGAGGTTCCATTACCCGGCTACTGATGCTAAAATCCTGGCAGCCCGAGCAATACGAAATGTCCGCTGATT$"[..],
925 "complex with revcomps"),
926 (&b"GTAG$GCCTAAT$TATAATCAG$"[..], "issue70"),
927 (&b"TGTGTGTGTG$"[..], "repeating"),
928 (&b"TACTCCGCTAGGGACACCTAAATAGATACTCGCAAAGGCGACTGATATATCCTTAGGTCGAAGAGATACCAGAGAAATAGTAGGTCTTAGGCTAGTCCTT$AAGGACTAGCCTAAGACCTACTATTTCTCTGGTATCTCTTCGACCTAAGGATATATCAGTCGCCTTTGCGAGTATCTATTTAGGTGTCCCTAGCGGAGTA$TAGGGACACCTAAATAGATACTCGCAAAGGCGACTGATATATCCTTAGGTCGAAGAGATACCAGAGAAATAGTAGGTCTTAGGCTAGTCCTTGTCCAGTA$TACTGGACAAGGACTAGCCTAAGACCTACTATTTCTCTGGTATCTCTTCGACCTAAGGATATATCAGTCGCCTTTGCGAGTATCTATTTAGGTGTCCCTA$ACGCACCCCGGCATTCGTCGACTCTACACTTAGTGGAACATACAAATTCGCTCGCAGGAGCGCCTCATACATTCTAACGCAGTGATCTTCGGCTGAGACT$AGTCTCAGCCGAAGATCACTGCGTTAGAATGTATGAGGCGCTCCTGCGAGCGAATTTGTATGTTCCACTAAGTGTAGAGTCGACGAATGCCGGGGTGCGT$"[..], "complex sentinels"),
929 ];
930 let num_rand = 100;
931 let rand_cases: Vec<_> = (0..num_rand).map(|i| rand_seqs(10, i * 10)).collect();
932 test_cases.extend(
933 rand_cases
934 .iter()
935 .map(|case| (case.as_ref(), "rand test case")),
936 );
937
938 for &(text, test_name) in test_cases.iter() {
939 for &sample_rate in &[2, 3, 5, 16] {
940 let alphabet = dna::n_alphabet();
941 let sa = suffix_array(text);
942 let bwt = bwt(text, &sa);
943 let less = less(&bwt, &alphabet);
944 let occ = Occ::new(&bwt, 3, &alphabet);
945 let sampled = sa.sample(text, &bwt, &less, &occ, sample_rate);
946
947 for i in 0..sa.len() {
948 let sa_idx = sa.get(i).unwrap();
949 let sampled_idx = sampled.get(i).unwrap();
950 assert_eq!(
951 sa_idx,
952 sampled_idx,
953 "Failed:\n{}\n{}\nat index {} do not match in test: {} (sample rate: {})",
954 str::from_utf8(&text[sa_idx..]).unwrap(),
955 str::from_utf8(&text[sampled_idx..]).unwrap(),
956 i,
957 test_name,
958 sample_rate
959 );
960 }
961 }
962 }
963 }
964
965 #[test]
966 fn can_construct_sa_for_usize() {
967 let text: Vec<usize> = vec![3, 2, 2, 4, 4, 1, 2, 1, 0];
968 let sa = suffix_array_int(&text);
969 assert_eq!(sa, vec![8, 7, 5, 6, 1, 2, 0, 4, 3]);
970 }
971}