1use crate::error::{SeqError, SeqResult};
50
51#[derive(Debug, Clone)]
68pub struct SuffixArray {
69 sa: Vec<usize>,
71 text: Vec<u8>,
73}
74
75impl SuffixArray {
76 pub fn new(s: &[u8]) -> SeqResult<Self> {
84 if s.is_empty() {
85 return Err(SeqError::EmptyInput);
86 }
87 let sa = build_sais(s);
88 Ok(Self {
89 sa,
90 text: s.to_vec(),
91 })
92 }
93
94 pub fn sa(&self) -> &[usize] {
96 &self.sa
97 }
98
99 pub fn text(&self) -> &[u8] {
101 &self.text
102 }
103
104 pub fn rank(&self) -> Vec<usize> {
106 let n = self.sa.len();
107 let mut rank = vec![0usize; n];
108 for (i, &p) in self.sa.iter().enumerate() {
109 rank[p] = i;
110 }
111 rank
112 }
113
114 pub fn lcp(&self) -> Vec<usize> {
120 let n = self.sa.len();
121 let mut lcp = vec![0usize; n];
122 if n == 0 {
123 return lcp;
124 }
125 let rank = self.rank();
126 let mut h = 0usize; for i in 0..n {
128 if rank[i] == 0 {
129 h = 0;
131 continue;
132 }
133 let j = self.sa[rank[i] - 1]; while i + h < n && j + h < n && self.text[i + h] == self.text[j + h] {
135 h += 1;
136 }
137 lcp[rank[i]] = h;
138 h = h.saturating_sub(1);
139 }
140 lcp
141 }
142
143 pub fn distinct_substring_count(&self) -> usize {
150 let n = self.sa.len();
151 let total = n * (n + 1) / 2;
152 let lcp_sum: usize = self.lcp().iter().sum();
153 total - lcp_sum
154 }
155
156 pub fn search(&self, pattern: &[u8]) -> Vec<usize> {
162 if pattern.is_empty() {
163 return Vec::new();
164 }
165 let n = self.sa.len();
166 let lo = self.lower_bound(pattern);
168 if lo == n {
169 return Vec::new();
170 }
171 let hi = self.upper_bound(pattern);
173 let mut out: Vec<usize> = self.sa[lo..hi].to_vec();
174 out.sort_unstable();
175 out
176 }
177
178 fn lower_bound(&self, pattern: &[u8]) -> usize {
182 let n = self.sa.len();
183 let (mut lo, mut hi) = (0usize, n);
184 while lo < hi {
185 let mid = lo + (hi - lo) / 2;
186 if self.suffix_lt_pattern(self.sa[mid], pattern) {
187 lo = mid + 1;
188 } else {
189 hi = mid;
190 }
191 }
192 lo
193 }
194
195 fn upper_bound(&self, pattern: &[u8]) -> usize {
199 let n = self.sa.len();
200 let (mut lo, mut hi) = (0usize, n);
201 while lo < hi {
202 let mid = lo + (hi - lo) / 2;
203 if self.suffix_le_pattern_prefix(self.sa[mid], pattern) {
204 lo = mid + 1;
205 } else {
206 hi = mid;
207 }
208 }
209 lo
210 }
211
212 fn suffix_lt_pattern(&self, start: usize, pattern: &[u8]) -> bool {
215 let suf = &self.text[start..];
216 let m = pattern.len().min(suf.len());
217 for k in 0..m {
218 if suf[k] != pattern[k] {
219 return suf[k] < pattern[k];
220 }
221 }
222 suf.len() < pattern.len()
224 }
225
226 fn suffix_le_pattern_prefix(&self, start: usize, pattern: &[u8]) -> bool {
229 let suf = &self.text[start..];
230 let m = pattern.len().min(suf.len());
231 for k in 0..m {
232 if suf[k] != pattern[k] {
233 return suf[k] < pattern[k];
234 }
235 }
236 true
239 }
240}
241
242fn build_sais(s: &[u8]) -> Vec<usize> {
245 let n = s.len();
246 let mut work: Vec<usize> = Vec::with_capacity(n + 1);
248 for &b in s {
249 work.push(b as usize + 1);
250 }
251 work.push(0); let sa_full = sais_core(&work, 257);
254 sa_full.into_iter().skip(1).collect()
256}
257
258type SType = bool;
260
261fn sais_core(s: &[usize], alphabet: usize) -> Vec<usize> {
264 let n = s.len();
265 let mut sa = vec![usize::MAX; n];
266 if n == 1 {
267 sa[0] = 0;
268 return sa;
269 }
270 if n == 2 {
271 sa[0] = 1;
273 sa[1] = 0;
274 return sa;
275 }
276
277 let t = classify_types(s);
279
280 let bucket_sizes = bucket_sizes(s, alphabet);
282
283 let lms_positions: Vec<usize> = (1..n).filter(|&i| is_lms(&t, i)).collect();
285
286 place_lms_at_bucket_ends(&mut sa, s, &bucket_sizes, &lms_positions);
287 induce_l(&mut sa, s, &t, &bucket_sizes);
288 induce_s(&mut sa, s, &t, &bucket_sizes);
289
290 let (reduced, names_count, lms_order) = name_lms_substrings(&sa, s, &t);
292
293 let lms_sorted: Vec<usize> = if names_count == lms_order.len() {
295 let mut sorted = vec![0usize; lms_order.len()];
297 for (k, &pos) in lms_order.iter().enumerate() {
298 sorted[reduced[k]] = pos;
299 }
300 sorted
301 } else {
302 let sub_sa = sais_core(&reduced, names_count + 1);
303 sub_sa.into_iter().map(|r| lms_order[r]).collect()
305 };
306
307 for slot in sa.iter_mut() {
309 *slot = usize::MAX;
310 }
311 place_lms_sorted_at_bucket_ends(&mut sa, s, &bucket_sizes, &lms_sorted);
312 induce_l(&mut sa, s, &t, &bucket_sizes);
313 induce_s(&mut sa, s, &t, &bucket_sizes);
314
315 sa
316}
317
318fn classify_types(s: &[usize]) -> Vec<SType> {
320 let n = s.len();
321 let mut t = vec![false; n];
322 t[n - 1] = true; for i in (0..n - 1).rev() {
324 t[i] = match s[i].cmp(&s[i + 1]) {
325 std::cmp::Ordering::Less => true,
326 std::cmp::Ordering::Greater => false,
327 std::cmp::Ordering::Equal => t[i + 1],
328 };
329 }
330 t
331}
332
333fn is_lms(t: &[SType], i: usize) -> bool {
335 i > 0 && t[i] && !t[i - 1]
336}
337
338fn bucket_sizes(s: &[usize], alphabet: usize) -> Vec<usize> {
340 let mut sizes = vec![0usize; alphabet];
341 for &c in s {
342 sizes[c] += 1;
343 }
344 sizes
345}
346
347fn bucket_heads(sizes: &[usize]) -> Vec<usize> {
349 let mut heads = vec![0usize; sizes.len()];
350 let mut sum = 0usize;
351 for (i, &sz) in sizes.iter().enumerate() {
352 heads[i] = sum;
353 sum += sz;
354 }
355 heads
356}
357
358fn bucket_tails(sizes: &[usize]) -> Vec<usize> {
360 let mut tails = vec![0usize; sizes.len()];
361 let mut sum = 0usize;
362 for (i, &sz) in sizes.iter().enumerate() {
363 sum += sz;
364 tails[i] = sum.wrapping_sub(1);
365 }
366 tails
367}
368
369fn place_lms_at_bucket_ends(
371 sa: &mut [usize],
372 s: &[usize],
373 sizes: &[usize],
374 lms_positions: &[usize],
375) {
376 let mut tails = bucket_tails(sizes);
377 for &p in lms_positions {
378 let c = s[p];
379 sa[tails[c]] = p;
380 tails[c] = tails[c].wrapping_sub(1);
381 }
382}
383
384fn place_lms_sorted_at_bucket_ends(
387 sa: &mut [usize],
388 s: &[usize],
389 sizes: &[usize],
390 lms_sorted: &[usize],
391) {
392 let mut tails = bucket_tails(sizes);
393 for &p in lms_sorted.iter().rev() {
394 let c = s[p];
395 sa[tails[c]] = p;
396 tails[c] = tails[c].wrapping_sub(1);
397 }
398}
399
400fn induce_l(sa: &mut [usize], s: &[usize], t: &[SType], sizes: &[usize]) {
402 let n = s.len();
403 let mut heads = bucket_heads(sizes);
404 for i in 0..n {
405 let p = sa[i];
406 if p == usize::MAX || p == 0 {
407 continue;
408 }
409 let j = p - 1;
410 if !t[j] {
411 let c = s[j];
413 sa[heads[c]] = j;
414 heads[c] += 1;
415 }
416 }
417}
418
419fn induce_s(sa: &mut [usize], s: &[usize], t: &[SType], sizes: &[usize]) {
421 let n = s.len();
422 let mut tails = bucket_tails(sizes);
423 for i in (0..n).rev() {
424 let p = sa[i];
425 if p == usize::MAX || p == 0 {
426 continue;
427 }
428 let j = p - 1;
429 if t[j] {
430 let c = s[j];
432 sa[tails[c]] = j;
433 tails[c] = tails[c].wrapping_sub(1);
434 }
435 }
436}
437
438fn name_lms_substrings(sa: &[usize], s: &[usize], t: &[SType]) -> (Vec<usize>, usize, Vec<usize>) {
445 let n = s.len();
446 let mut lms_in_sa: Vec<usize> = Vec::new();
448 for &p in sa {
449 if p != usize::MAX && is_lms(t, p) {
450 lms_in_sa.push(p);
451 }
452 }
453
454 let mut names = vec![usize::MAX; n];
456 let mut current_name = 0usize;
457 names[lms_in_sa[0]] = current_name;
458 let mut prev = lms_in_sa[0];
459 for &cur in lms_in_sa.iter().skip(1) {
460 if !lms_substrings_equal(s, t, prev, cur) {
461 current_name += 1;
462 }
463 names[cur] = current_name;
464 prev = cur;
465 }
466 let names_count = current_name + 1;
467
468 let mut lms_order: Vec<usize> = Vec::new();
471 let mut reduced: Vec<usize> = Vec::new();
472 for i in 0..n {
473 if is_lms(t, i) {
474 lms_order.push(i);
475 reduced.push(names[i]);
476 }
477 }
478 (reduced, names_count, lms_order)
479}
480
481fn lms_substrings_equal(s: &[usize], t: &[SType], a: usize, b: usize) -> bool {
484 let n = s.len();
485 if a == b {
486 return true;
487 }
488 let mut i = 0usize;
489 loop {
490 let ai = a + i;
491 let bi = b + i;
492 if ai >= n || bi >= n {
494 return false;
495 }
496 let a_is_lms = is_lms(t, ai);
497 let b_is_lms = is_lms(t, bi);
498 if i > 0 && a_is_lms && b_is_lms {
499 return true;
501 }
502 if a_is_lms != b_is_lms {
503 return false;
504 }
505 if s[ai] != s[bi] || t[ai] != t[bi] {
506 return false;
507 }
508 i += 1;
509 }
510}
511
512#[cfg(test)]
513mod tests {
514 use super::*;
515
516 fn brute_force_sa(s: &[u8]) -> Vec<usize> {
518 let n = s.len();
519 let mut idx: Vec<usize> = (0..n).collect();
520 idx.sort_by(|&a, &b| s[a..].cmp(&s[b..]));
521 idx
522 }
523
524 fn brute_force_lcp(s: &[u8], sa: &[usize]) -> Vec<usize> {
526 let n = sa.len();
527 let mut lcp = vec![0usize; n];
528 for i in 1..n {
529 let (a, b) = (sa[i - 1], sa[i]);
530 let mut k = 0;
531 while a + k < s.len() && b + k < s.len() && s[a + k] == s[b + k] {
532 k += 1;
533 }
534 lcp[i] = k;
535 }
536 lcp
537 }
538
539 fn brute_force_distinct(s: &[u8]) -> usize {
541 let n = s.len();
542 let mut set = std::collections::BTreeSet::new();
543 for i in 0..n {
544 for j in i + 1..=n {
545 set.insert(s[i..j].to_vec());
546 }
547 }
548 set.len()
549 }
550
551 fn naive_search(p: &[u8], t: &[u8]) -> Vec<usize> {
552 let (m, n) = (p.len(), t.len());
553 if m == 0 || m > n {
554 return Vec::new();
555 }
556 (0..=(n - m)).filter(|&i| &t[i..i + m] == p).collect()
557 }
558
559 fn random_bytes(rng: &mut crate::handle::LcgRng, alphabet: &[u8], len: usize) -> Vec<u8> {
560 (0..len)
561 .map(|_| alphabet[rng.next_usize(alphabet.len())])
562 .collect()
563 }
564
565 #[test]
567 fn sa_matches_brute_force() {
568 for s in [b"banana".as_slice(), b"mississippi", b"abracadabra"] {
570 let sa = SuffixArray::new(s).expect("non-empty");
571 assert_eq!(sa.sa(), brute_force_sa(s).as_slice(), "SA for {s:?}");
572 }
573 let mut rng = crate::handle::LcgRng::new(11);
575 for &alphabet in &[b"a".as_slice(), b"ab", b"abc", b"abcd"] {
576 for _ in 0..400 {
577 let len = 1 + rng.next_usize(40);
578 let s = random_bytes(&mut rng, alphabet, len);
579 let got = SuffixArray::new(&s).expect("non-empty");
580 assert_eq!(got.sa(), brute_force_sa(&s).as_slice(), "SA for {s:?}");
581 }
582 }
583 }
584
585 #[test]
587 fn sa_is_permutation() {
588 let mut rng = crate::handle::LcgRng::new(22);
589 for _ in 0..300 {
590 let len = 1 + rng.next_usize(40);
591 let s = random_bytes(&mut rng, b"abc", len);
592 let sa = SuffixArray::new(&s).expect("non-empty");
593 let n = s.len();
594 let mut seen = vec![false; n];
595 assert_eq!(sa.sa().len(), n);
596 for &p in sa.sa() {
597 assert!(p < n, "index out of range");
598 assert!(!seen[p], "duplicate index {p}");
599 seen[p] = true;
600 }
601 assert!(seen.iter().all(|&b| b), "not all indices present");
602 }
603 }
604
605 #[test]
607 fn lcp_matches_brute_force() {
608 for s in [b"banana".as_slice(), b"mississippi", b"aaaa"] {
609 let sa = SuffixArray::new(s).expect("non-empty");
610 let got = sa.lcp();
611 let want = brute_force_lcp(s, sa.sa());
612 assert_eq!(got, want, "LCP for {s:?}");
613 }
614 let mut rng = crate::handle::LcgRng::new(33);
615 for &alphabet in &[b"ab".as_slice(), b"abc"] {
616 for _ in 0..400 {
617 let len = 1 + rng.next_usize(40);
618 let s = random_bytes(&mut rng, alphabet, len);
619 let sa = SuffixArray::new(&s).expect("non-empty");
620 assert_eq!(sa.lcp(), brute_force_lcp(&s, sa.sa()), "LCP {s:?}");
621 }
622 }
623 }
624
625 #[test]
627 fn repeated_characters() {
628 let sa = SuffixArray::new(b"aaaa").expect("non-empty");
629 assert_eq!(sa.sa(), &[3, 2, 1, 0]);
631 assert_eq!(sa.lcp(), vec![0, 1, 2, 3]);
633
634 let sa = SuffixArray::new(b"aaaaaaaa").expect("non-empty");
635 assert_eq!(sa.sa(), brute_force_sa(b"aaaaaaaa").as_slice());
636 }
637
638 #[test]
640 fn search_matches_naive() {
641 let sa = SuffixArray::new(b"banana").expect("non-empty");
643 assert_eq!(sa.search(b"ana"), vec![1, 3]);
644 assert_eq!(sa.search(b"a"), vec![1, 3, 5]);
645 assert_eq!(sa.search(b"banana"), vec![0]);
646 assert!(sa.search(b"xyz").is_empty());
647 assert!(sa.search(b"").is_empty());
648
649 let mut rng = crate::handle::LcgRng::new(44);
651 for &alphabet in &[b"ab".as_slice(), b"abc"] {
652 for _ in 0..400 {
653 let tlen = 1 + rng.next_usize(40);
654 let plen = 1 + rng.next_usize(5);
655 let t = random_bytes(&mut rng, alphabet, tlen);
656 let p = random_bytes(&mut rng, alphabet, plen);
657 let sa = SuffixArray::new(&t).expect("non-empty");
658 let mut want = naive_search(&p, &t);
659 want.sort_unstable();
660 assert_eq!(sa.search(&p), want, "search p={p:?} t={t:?}");
661 }
662 }
663 }
664
665 #[test]
667 fn distinct_substring_count_matches_brute_force() {
668 for s in [b"banana".as_slice(), b"mississippi", b"aaaa", b"abcabc"] {
669 let sa = SuffixArray::new(s).expect("non-empty");
670 assert_eq!(
671 sa.distinct_substring_count(),
672 brute_force_distinct(s),
673 "distinct for {s:?}"
674 );
675 }
676 let mut rng = crate::handle::LcgRng::new(55);
677 for &alphabet in &[b"ab".as_slice(), b"abc"] {
678 for _ in 0..200 {
679 let len = 1 + rng.next_usize(24);
680 let s = random_bytes(&mut rng, alphabet, len);
681 let sa = SuffixArray::new(&s).expect("non-empty");
682 assert_eq!(
683 sa.distinct_substring_count(),
684 brute_force_distinct(&s),
685 "distinct for {s:?}"
686 );
687 }
688 }
689 }
690
691 #[test]
693 fn empty_input_errors() {
694 assert!(matches!(SuffixArray::new(b""), Err(SeqError::EmptyInput)));
695 }
696
697 #[test]
699 fn single_char() {
700 let sa = SuffixArray::new(b"x").expect("non-empty");
701 assert_eq!(sa.sa(), &[0]);
702 assert_eq!(sa.lcp(), vec![0]);
703 assert_eq!(sa.search(b"x"), vec![0]);
704 assert!(sa.search(b"y").is_empty());
705 assert_eq!(sa.distinct_substring_count(), 1);
706 }
707
708 #[test]
710 fn rank_is_inverse() {
711 let mut rng = crate::handle::LcgRng::new(66);
712 for _ in 0..200 {
713 let len = 1 + rng.next_usize(30);
714 let s = random_bytes(&mut rng, b"abc", len);
715 let sa = SuffixArray::new(&s).expect("non-empty");
716 let rank = sa.rank();
717 for (i, &p) in sa.sa().iter().enumerate() {
718 assert_eq!(rank[p], i);
719 }
720 }
721 }
722}