oxicuda_seq/string/
suffix_array.rs

1//! Suffix array (SA-IS) with Kasai's LCP array and SA binary-search.
2//!
3//! References:
4//! * Ge Nong, Sen Zhang & Wai Hong Chan, *"Linear Suffix Array Construction by
5//!   Almost Pure Induced-Sorting"*, Data Compression Conference (DCC), 2009,
6//!   pp. 193–202 — the **SA-IS** algorithm implemented here.
7//! * Toru Kasai, Gunho Lee, Hiroki Arimura, Setsuo Arikawa & Kunsoo Park,
8//!   *"Linear-Time Longest-Common-Prefix Computation in Suffix Arrays and Its
9//!   Applications"*, CPM 2001, LNCS 2089, pp. 181–192 — the **LCP** array.
10//!
11//! # Suffix array
12//!
13//! The suffix array of a string `s` of length `n` is the permutation
14//! `sa[0..n]` of `0..n` that lists the starting positions of all suffixes of
15//! `s` in lexicographic order: `s[sa[0]..] < s[sa[1]..] < … < s[sa[n-1]..]`.
16//!
17//! SA-IS builds it in `O(n)` time and `O(n)` space by:
18//!
19//! 1. classifying each position as **S-type** (its suffix is lexicographically
20//!    smaller than the next position's suffix) or **L-type** (larger);
21//! 2. identifying **LMS** ("left-most S") positions — an S-type position whose
22//!    predecessor is L-type — which split the string into LMS-substrings;
23//! 3. sorting the LMS-substrings by two **induced-sort** passes (sort L-types
24//!    from sorted S/LMS, then S-types from sorted L-types);
25//! 4. naming the sorted LMS-substrings and recursing on the shorter name string
26//!    if any two share a name, then inducing the final order from the recursive
27//!    answer.
28//!
29//! A unique sentinel smaller than every real byte is appended internally so the
30//! last suffix is always the unique smallest; the returned array excludes it.
31//!
32//! # LCP array (Kasai)
33//!
34//! `lcp[i]` (for `1 ≤ i < n`) is the length of the longest common prefix of the
35//! two adjacent suffixes `s[sa[i-1]..]` and `s[sa[i]..]`; `lcp[0] = 0` by
36//! convention. Kasai's algorithm computes all of them in `O(n)` using the rank
37//! (inverse SA) array and the observation that the LCP of consecutive *text*
38//! positions can only shrink by one as we advance the suffix start.
39//!
40//! # Pattern search
41//!
42//! Because suffixes are sorted, every occurrence of a pattern `p` corresponds to
43//! a contiguous range of the suffix array (the suffixes that have `p` as a
44//! prefix). [`SuffixArray::search`] locates that range with two binary searches
45//! and returns the sorted text positions.
46//!
47//! Inputs are raw bytes (`&[u8]`); the alphabet is the full byte range.
48
49use crate::error::{SeqError, SeqResult};
50
51/// A suffix array together with its source length and (lazily-built) LCP array.
52///
53/// Construct with [`SuffixArray::new`]. The stored `sa` is a permutation of
54/// `0..n` (`n = source length`); [`SuffixArray::lcp`] returns the Kasai LCP
55/// array, and [`SuffixArray::search`] performs SA binary search.
56///
57/// # Examples
58///
59/// ```
60/// use oxicuda_seq::string::SuffixArray;
61///
62/// let sa = SuffixArray::new(b"banana").expect("non-empty");
63/// // Suffixes of "banana" sorted: a, ana, anana, banana, na, nana.
64/// assert_eq!(sa.sa(), &[5, 3, 1, 0, 4, 2]);
65/// assert_eq!(sa.search(b"ana"), vec![1, 3]);
66/// ```
67#[derive(Debug, Clone)]
68pub struct SuffixArray {
69    /// The suffix array proper: a permutation of `0..source_len`.
70    sa: Vec<usize>,
71    /// The source bytes (owned so search/LCP need no external slice).
72    text: Vec<u8>,
73}
74
75impl SuffixArray {
76    /// Build the suffix array of `s` using SA-IS in linear time.
77    ///
78    /// # Errors
79    ///
80    /// Returns [`SeqError::EmptyInput`] for an empty `s`: the suffix array of
81    /// the empty string is empty and is rejected to keep the contract explicit
82    /// and consistent with the sibling string modules.
83    pub fn new(s: &[u8]) -> SeqResult<Self> {
84        if s.is_empty() {
85            return Err(SeqError::EmptyInput);
86        }
87        let sa = build_sais(s);
88        Ok(Self {
89            sa,
90            text: s.to_vec(),
91        })
92    }
93
94    /// Borrow the suffix array (a permutation of `0..n`).
95    pub fn sa(&self) -> &[usize] {
96        &self.sa
97    }
98
99    /// Borrow the source text.
100    pub fn text(&self) -> &[u8] {
101        &self.text
102    }
103
104    /// The rank (inverse suffix) array: `rank[sa[i]] == i`.
105    pub fn rank(&self) -> Vec<usize> {
106        let n = self.sa.len();
107        let mut rank = vec![0usize; n];
108        for (i, &p) in self.sa.iter().enumerate() {
109            rank[p] = i;
110        }
111        rank
112    }
113
114    /// Compute the Kasai LCP array in `O(n)`.
115    ///
116    /// The returned vector has length `n`; `lcp[0] = 0` and, for `1 ≤ i < n`,
117    /// `lcp[i]` is the length of the longest common prefix of the suffixes
118    /// `text[sa[i-1]..]` and `text[sa[i]..]`.
119    pub fn lcp(&self) -> Vec<usize> {
120        let n = self.sa.len();
121        let mut lcp = vec![0usize; n];
122        if n == 0 {
123            return lcp;
124        }
125        let rank = self.rank();
126        let mut h = 0usize; // running LCP length, shrinks by ≤1 per step
127        for i in 0..n {
128            if rank[i] == 0 {
129                // Suffix i is the smallest; no predecessor to compare with.
130                h = 0;
131                continue;
132            }
133            let j = self.sa[rank[i] - 1]; // text position of the predecessor
134            while i + h < n && j + h < n && self.text[i + h] == self.text[j + h] {
135                h += 1;
136            }
137            lcp[rank[i]] = h;
138            h = h.saturating_sub(1);
139        }
140        lcp
141    }
142
143    /// Number of **distinct** non-empty substrings of the source,
144    /// `n(n+1)/2 − Σ lcp`.
145    ///
146    /// Every substring is a prefix of exactly one suffix; summing suffix lengths
147    /// counts all substrings with multiplicity, and the LCP between adjacent
148    /// suffixes is precisely the number of duplicate prefixes to subtract.
149    pub fn distinct_substring_count(&self) -> usize {
150        let n = self.sa.len();
151        let total = n * (n + 1) / 2;
152        let lcp_sum: usize = self.lcp().iter().sum();
153        total - lcp_sum
154    }
155
156    /// Find all occurrences of `pattern` via two binary searches on the suffix
157    /// array, returning the matching text positions in ascending order.
158    ///
159    /// Returns an empty vector for an empty pattern or when there is no match.
160    /// Overlapping occurrences are all reported.
161    pub fn search(&self, pattern: &[u8]) -> Vec<usize> {
162        if pattern.is_empty() {
163            return Vec::new();
164        }
165        let n = self.sa.len();
166        // Lower bound: first suffix that is >= pattern (as a prefix-bounded cmp).
167        let lo = self.lower_bound(pattern);
168        if lo == n {
169            return Vec::new();
170        }
171        // Upper bound: first suffix that does NOT have `pattern` as a prefix.
172        let hi = self.upper_bound(pattern);
173        let mut out: Vec<usize> = self.sa[lo..hi].to_vec();
174        out.sort_unstable();
175        out
176    }
177
178    /// First index `i` in `0..=n` such that `text[sa[i]..]` is ≥ `pattern` when
179    /// compared up to `pattern.len()` bytes (treating a suffix shorter than the
180    /// pattern but matching so far as smaller).
181    fn lower_bound(&self, pattern: &[u8]) -> usize {
182        let n = self.sa.len();
183        let (mut lo, mut hi) = (0usize, n);
184        while lo < hi {
185            let mid = lo + (hi - lo) / 2;
186            if self.suffix_lt_pattern(self.sa[mid], pattern) {
187                lo = mid + 1;
188            } else {
189                hi = mid;
190            }
191        }
192        lo
193    }
194
195    /// First index `i` such that `text[sa[i]..]` does NOT start with `pattern`
196    /// (i.e. is strictly greater than every string having `pattern` as a
197    /// prefix).
198    fn upper_bound(&self, pattern: &[u8]) -> usize {
199        let n = self.sa.len();
200        let (mut lo, mut hi) = (0usize, n);
201        while lo < hi {
202            let mid = lo + (hi - lo) / 2;
203            if self.suffix_le_pattern_prefix(self.sa[mid], pattern) {
204                lo = mid + 1;
205            } else {
206                hi = mid;
207            }
208        }
209        lo
210    }
211
212    /// `true` if the suffix at `start` is strictly less than `pattern` in the
213    /// prefix-bounded order (shorter-but-equal suffix counts as less).
214    fn suffix_lt_pattern(&self, start: usize, pattern: &[u8]) -> bool {
215        let suf = &self.text[start..];
216        let m = pattern.len().min(suf.len());
217        for k in 0..m {
218            if suf[k] != pattern[k] {
219                return suf[k] < pattern[k];
220            }
221        }
222        // Equal on the overlap: the shorter is smaller.
223        suf.len() < pattern.len()
224    }
225
226    /// `true` while the suffix at `start` still has `pattern` as a prefix *or*
227    /// is smaller — used to find the exclusive upper bound of the match range.
228    fn suffix_le_pattern_prefix(&self, start: usize, pattern: &[u8]) -> bool {
229        let suf = &self.text[start..];
230        let m = pattern.len().min(suf.len());
231        for k in 0..m {
232            if suf[k] != pattern[k] {
233                return suf[k] < pattern[k];
234            }
235        }
236        // Suffix has pattern as a prefix (suf.len() >= pattern.len()) → still in
237        // range; a suffix shorter than the pattern but equal so far is < pattern.
238        true
239    }
240}
241
242/// SA-IS entry point over raw bytes. Maps bytes into `1..=256` and reserves `0`
243/// for the sentinel, then dispatches to the generic recursive core.
244fn build_sais(s: &[u8]) -> Vec<usize> {
245    let n = s.len();
246    // Work string with a 0 sentinel appended; alphabet size is 257 (0..=256).
247    let mut work: Vec<usize> = Vec::with_capacity(n + 1);
248    for &b in s {
249        work.push(b as usize + 1);
250    }
251    work.push(0); // sentinel, strictly smallest
252
253    let sa_full = sais_core(&work, 257);
254    // Drop the sentinel suffix (always first) → suffix array of the original.
255    sa_full.into_iter().skip(1).collect()
256}
257
258/// Suffix type: `true` for S-type, `false` for L-type.
259type SType = bool;
260
261/// The generic SA-IS core over an integer alphabet `0..alphabet`, where the last
262/// element of `s` is the unique smallest sentinel.
263fn sais_core(s: &[usize], alphabet: usize) -> Vec<usize> {
264    let n = s.len();
265    let mut sa = vec![usize::MAX; n];
266    if n == 1 {
267        sa[0] = 0;
268        return sa;
269    }
270    if n == 2 {
271        // Two elements; the sentinel s[1] is smallest.
272        sa[0] = 1;
273        sa[1] = 0;
274        return sa;
275    }
276
277    // 1. Classify S/L types from the right. The sentinel is S-type.
278    let t = classify_types(s);
279
280    // Bucket boundaries.
281    let bucket_sizes = bucket_sizes(s, alphabet);
282
283    // 2. Place LMS suffixes into their bucket ends (rough order), then induce.
284    let lms_positions: Vec<usize> = (1..n).filter(|&i| is_lms(&t, i)).collect();
285
286    place_lms_at_bucket_ends(&mut sa, s, &bucket_sizes, &lms_positions);
287    induce_l(&mut sa, s, &t, &bucket_sizes);
288    induce_s(&mut sa, s, &t, &bucket_sizes);
289
290    // 3. Name the sorted LMS substrings.
291    let (reduced, names_count, lms_order) = name_lms_substrings(&sa, s, &t);
292
293    // 4. Recurse if names are not all distinct, else order directly.
294    let lms_sorted: Vec<usize> = if names_count == lms_order.len() {
295        // Already distinct: reduced[k] is the rank; invert to get order.
296        let mut sorted = vec![0usize; lms_order.len()];
297        for (k, &pos) in lms_order.iter().enumerate() {
298            sorted[reduced[k]] = pos;
299        }
300        sorted
301    } else {
302        let sub_sa = sais_core(&reduced, names_count + 1);
303        // Map reduced suffix order back to original LMS positions.
304        sub_sa.into_iter().map(|r| lms_order[r]).collect()
305    };
306
307    // 5. Final induced sort using the correct LMS order.
308    for slot in sa.iter_mut() {
309        *slot = usize::MAX;
310    }
311    place_lms_sorted_at_bucket_ends(&mut sa, s, &bucket_sizes, &lms_sorted);
312    induce_l(&mut sa, s, &t, &bucket_sizes);
313    induce_s(&mut sa, s, &t, &bucket_sizes);
314
315    sa
316}
317
318/// Classify each position as S-type (`true`) or L-type (`false`).
319fn classify_types(s: &[usize]) -> Vec<SType> {
320    let n = s.len();
321    let mut t = vec![false; n];
322    t[n - 1] = true; // sentinel is S-type
323    for i in (0..n - 1).rev() {
324        t[i] = match s[i].cmp(&s[i + 1]) {
325            std::cmp::Ordering::Less => true,
326            std::cmp::Ordering::Greater => false,
327            std::cmp::Ordering::Equal => t[i + 1],
328        };
329    }
330    t
331}
332
333/// `true` if position `i` is an LMS position (S-type with an L-type predecessor).
334fn is_lms(t: &[SType], i: usize) -> bool {
335    i > 0 && t[i] && !t[i - 1]
336}
337
338/// Count occurrences of each symbol → bucket sizes.
339fn bucket_sizes(s: &[usize], alphabet: usize) -> Vec<usize> {
340    let mut sizes = vec![0usize; alphabet];
341    for &c in s {
342        sizes[c] += 1;
343    }
344    sizes
345}
346
347/// Exclusive prefix-sum giving each bucket's start index.
348fn bucket_heads(sizes: &[usize]) -> Vec<usize> {
349    let mut heads = vec![0usize; sizes.len()];
350    let mut sum = 0usize;
351    for (i, &sz) in sizes.iter().enumerate() {
352        heads[i] = sum;
353        sum += sz;
354    }
355    heads
356}
357
358/// Inclusive prefix-sum minus one giving each bucket's last index.
359fn bucket_tails(sizes: &[usize]) -> Vec<usize> {
360    let mut tails = vec![0usize; sizes.len()];
361    let mut sum = 0usize;
362    for (i, &sz) in sizes.iter().enumerate() {
363        sum += sz;
364        tails[i] = sum.wrapping_sub(1);
365    }
366    tails
367}
368
369/// Place LMS suffixes (in text order) at the *ends* of their character buckets.
370fn place_lms_at_bucket_ends(
371    sa: &mut [usize],
372    s: &[usize],
373    sizes: &[usize],
374    lms_positions: &[usize],
375) {
376    let mut tails = bucket_tails(sizes);
377    for &p in lms_positions {
378        let c = s[p];
379        sa[tails[c]] = p;
380        tails[c] = tails[c].wrapping_sub(1);
381    }
382}
383
384/// Place already-sorted LMS suffixes at the ends of their buckets (right→left so
385/// the sorted order is preserved within each bucket).
386fn place_lms_sorted_at_bucket_ends(
387    sa: &mut [usize],
388    s: &[usize],
389    sizes: &[usize],
390    lms_sorted: &[usize],
391) {
392    let mut tails = bucket_tails(sizes);
393    for &p in lms_sorted.iter().rev() {
394        let c = s[p];
395        sa[tails[c]] = p;
396        tails[c] = tails[c].wrapping_sub(1);
397    }
398}
399
400/// Induce L-type suffixes from the current (partial) SA by a left-to-right pass.
401fn induce_l(sa: &mut [usize], s: &[usize], t: &[SType], sizes: &[usize]) {
402    let n = s.len();
403    let mut heads = bucket_heads(sizes);
404    for i in 0..n {
405        let p = sa[i];
406        if p == usize::MAX || p == 0 {
407            continue;
408        }
409        let j = p - 1;
410        if !t[j] {
411            // L-type → place at the head of its bucket.
412            let c = s[j];
413            sa[heads[c]] = j;
414            heads[c] += 1;
415        }
416    }
417}
418
419/// Induce S-type suffixes from the current SA by a right-to-left pass.
420fn induce_s(sa: &mut [usize], s: &[usize], t: &[SType], sizes: &[usize]) {
421    let n = s.len();
422    let mut tails = bucket_tails(sizes);
423    for i in (0..n).rev() {
424        let p = sa[i];
425        if p == usize::MAX || p == 0 {
426            continue;
427        }
428        let j = p - 1;
429        if t[j] {
430            // S-type → place at the tail of its bucket.
431            let c = s[j];
432            sa[tails[c]] = j;
433            tails[c] = tails[c].wrapping_sub(1);
434        }
435    }
436}
437
438/// Name the sorted LMS substrings.
439///
440/// Returns `(reduced, names_count, lms_order)` where `lms_order` lists the LMS
441/// text positions in their first-appearance order within the array, `reduced` is
442/// the name string over those positions in *text* order, and `names_count` is
443/// the number of distinct names assigned.
444fn name_lms_substrings(sa: &[usize], s: &[usize], t: &[SType]) -> (Vec<usize>, usize, Vec<usize>) {
445    let n = s.len();
446    // Collect the LMS positions in the order the induced sort produced them.
447    let mut lms_in_sa: Vec<usize> = Vec::new();
448    for &p in sa {
449        if p != usize::MAX && is_lms(t, p) {
450            lms_in_sa.push(p);
451        }
452    }
453
454    // Assign names by comparing consecutive LMS substrings.
455    let mut names = vec![usize::MAX; n];
456    let mut current_name = 0usize;
457    names[lms_in_sa[0]] = current_name;
458    let mut prev = lms_in_sa[0];
459    for &cur in lms_in_sa.iter().skip(1) {
460        if !lms_substrings_equal(s, t, prev, cur) {
461            current_name += 1;
462        }
463        names[cur] = current_name;
464        prev = cur;
465    }
466    let names_count = current_name + 1;
467
468    // Build the reduced string in text order of LMS positions, and the position
469    // list for inverse mapping.
470    let mut lms_order: Vec<usize> = Vec::new();
471    let mut reduced: Vec<usize> = Vec::new();
472    for i in 0..n {
473        if is_lms(t, i) {
474            lms_order.push(i);
475            reduced.push(names[i]);
476        }
477    }
478    (reduced, names_count, lms_order)
479}
480
481/// Compare two LMS substrings starting at `a` and `b` for equality, including
482/// the type sequence (so that boundaries are respected).
483fn lms_substrings_equal(s: &[usize], t: &[SType], a: usize, b: usize) -> bool {
484    let n = s.len();
485    if a == b {
486        return true;
487    }
488    let mut i = 0usize;
489    loop {
490        let ai = a + i;
491        let bi = b + i;
492        // Past the sentinel for either → only equal if both ended together.
493        if ai >= n || bi >= n {
494            return false;
495        }
496        let a_is_lms = is_lms(t, ai);
497        let b_is_lms = is_lms(t, bi);
498        if i > 0 && a_is_lms && b_is_lms {
499            // Both reached the next LMS boundary simultaneously → equal so far.
500            return true;
501        }
502        if a_is_lms != b_is_lms {
503            return false;
504        }
505        if s[ai] != s[bi] || t[ai] != t[bi] {
506            return false;
507        }
508        i += 1;
509    }
510}
511
512#[cfg(test)]
513mod tests {
514    use super::*;
515
516    /// Brute-force suffix array: sort all suffixes lexicographically.
517    fn brute_force_sa(s: &[u8]) -> Vec<usize> {
518        let n = s.len();
519        let mut idx: Vec<usize> = (0..n).collect();
520        idx.sort_by(|&a, &b| s[a..].cmp(&s[b..]));
521        idx
522    }
523
524    /// Brute-force LCP from the (correct) suffix array.
525    fn brute_force_lcp(s: &[u8], sa: &[usize]) -> Vec<usize> {
526        let n = sa.len();
527        let mut lcp = vec![0usize; n];
528        for i in 1..n {
529            let (a, b) = (sa[i - 1], sa[i]);
530            let mut k = 0;
531            while a + k < s.len() && b + k < s.len() && s[a + k] == s[b + k] {
532                k += 1;
533            }
534            lcp[i] = k;
535        }
536        lcp
537    }
538
539    /// Brute-force distinct substring count over all substrings.
540    fn brute_force_distinct(s: &[u8]) -> usize {
541        let n = s.len();
542        let mut set = std::collections::BTreeSet::new();
543        for i in 0..n {
544            for j in i + 1..=n {
545                set.insert(s[i..j].to_vec());
546            }
547        }
548        set.len()
549    }
550
551    fn naive_search(p: &[u8], t: &[u8]) -> Vec<usize> {
552        let (m, n) = (p.len(), t.len());
553        if m == 0 || m > n {
554            return Vec::new();
555        }
556        (0..=(n - m)).filter(|&i| &t[i..i + m] == p).collect()
557    }
558
559    fn random_bytes(rng: &mut crate::handle::LcgRng, alphabet: &[u8], len: usize) -> Vec<u8> {
560        (0..len)
561            .map(|_| alphabet[rng.next_usize(alphabet.len())])
562            .collect()
563    }
564
565    /// (a) SA equals brute force on random strings and on the canonical cases.
566    #[test]
567    fn sa_matches_brute_force() {
568        // Canonical strings.
569        for s in [b"banana".as_slice(), b"mississippi", b"abracadabra"] {
570            let sa = SuffixArray::new(s).expect("non-empty");
571            assert_eq!(sa.sa(), brute_force_sa(s).as_slice(), "SA for {s:?}");
572        }
573        // Random.
574        let mut rng = crate::handle::LcgRng::new(11);
575        for &alphabet in &[b"a".as_slice(), b"ab", b"abc", b"abcd"] {
576            for _ in 0..400 {
577                let len = 1 + rng.next_usize(40);
578                let s = random_bytes(&mut rng, alphabet, len);
579                let got = SuffixArray::new(&s).expect("non-empty");
580                assert_eq!(got.sa(), brute_force_sa(&s).as_slice(), "SA for {s:?}");
581            }
582        }
583    }
584
585    /// (b) The SA is a permutation of 0..n.
586    #[test]
587    fn sa_is_permutation() {
588        let mut rng = crate::handle::LcgRng::new(22);
589        for _ in 0..300 {
590            let len = 1 + rng.next_usize(40);
591            let s = random_bytes(&mut rng, b"abc", len);
592            let sa = SuffixArray::new(&s).expect("non-empty");
593            let n = s.len();
594            let mut seen = vec![false; n];
595            assert_eq!(sa.sa().len(), n);
596            for &p in sa.sa() {
597                assert!(p < n, "index out of range");
598                assert!(!seen[p], "duplicate index {p}");
599                seen[p] = true;
600            }
601            assert!(seen.iter().all(|&b| b), "not all indices present");
602        }
603    }
604
605    /// (c) The LCP array matches the adjacent-suffix LCPs (brute force).
606    #[test]
607    fn lcp_matches_brute_force() {
608        for s in [b"banana".as_slice(), b"mississippi", b"aaaa"] {
609            let sa = SuffixArray::new(s).expect("non-empty");
610            let got = sa.lcp();
611            let want = brute_force_lcp(s, sa.sa());
612            assert_eq!(got, want, "LCP for {s:?}");
613        }
614        let mut rng = crate::handle::LcgRng::new(33);
615        for &alphabet in &[b"ab".as_slice(), b"abc"] {
616            for _ in 0..400 {
617                let len = 1 + rng.next_usize(40);
618                let s = random_bytes(&mut rng, alphabet, len);
619                let sa = SuffixArray::new(&s).expect("non-empty");
620                assert_eq!(sa.lcp(), brute_force_lcp(&s, sa.sa()), "LCP {s:?}");
621            }
622        }
623    }
624
625    /// (d) Repeated characters handled.
626    #[test]
627    fn repeated_characters() {
628        let sa = SuffixArray::new(b"aaaa").expect("non-empty");
629        // Sorted suffixes: "a" < "aa" < "aaa" < "aaaa" → starts 3,2,1,0.
630        assert_eq!(sa.sa(), &[3, 2, 1, 0]);
631        // LCP between consecutive: 1, 2, 3.
632        assert_eq!(sa.lcp(), vec![0, 1, 2, 3]);
633
634        let sa = SuffixArray::new(b"aaaaaaaa").expect("non-empty");
635        assert_eq!(sa.sa(), brute_force_sa(b"aaaaaaaa").as_slice());
636    }
637
638    /// (e) Pattern search via SA binary search finds occurrences.
639    #[test]
640    fn search_matches_naive() {
641        // Canonical.
642        let sa = SuffixArray::new(b"banana").expect("non-empty");
643        assert_eq!(sa.search(b"ana"), vec![1, 3]);
644        assert_eq!(sa.search(b"a"), vec![1, 3, 5]);
645        assert_eq!(sa.search(b"banana"), vec![0]);
646        assert!(sa.search(b"xyz").is_empty());
647        assert!(sa.search(b"").is_empty());
648
649        // Random cross-check.
650        let mut rng = crate::handle::LcgRng::new(44);
651        for &alphabet in &[b"ab".as_slice(), b"abc"] {
652            for _ in 0..400 {
653                let tlen = 1 + rng.next_usize(40);
654                let plen = 1 + rng.next_usize(5);
655                let t = random_bytes(&mut rng, alphabet, tlen);
656                let p = random_bytes(&mut rng, alphabet, plen);
657                let sa = SuffixArray::new(&t).expect("non-empty");
658                let mut want = naive_search(&p, &t);
659                want.sort_unstable();
660                assert_eq!(sa.search(&p), want, "search p={p:?} t={t:?}");
661            }
662        }
663    }
664
665    /// (f) #distinct substrings = n(n+1)/2 − ΣLCP matches brute force.
666    #[test]
667    fn distinct_substring_count_matches_brute_force() {
668        for s in [b"banana".as_slice(), b"mississippi", b"aaaa", b"abcabc"] {
669            let sa = SuffixArray::new(s).expect("non-empty");
670            assert_eq!(
671                sa.distinct_substring_count(),
672                brute_force_distinct(s),
673                "distinct for {s:?}"
674            );
675        }
676        let mut rng = crate::handle::LcgRng::new(55);
677        for &alphabet in &[b"ab".as_slice(), b"abc"] {
678            for _ in 0..200 {
679                let len = 1 + rng.next_usize(24);
680                let s = random_bytes(&mut rng, alphabet, len);
681                let sa = SuffixArray::new(&s).expect("non-empty");
682                assert_eq!(
683                    sa.distinct_substring_count(),
684                    brute_force_distinct(&s),
685                    "distinct for {s:?}"
686                );
687            }
688        }
689    }
690
691    /// Empty input is rejected.
692    #[test]
693    fn empty_input_errors() {
694        assert!(matches!(SuffixArray::new(b""), Err(SeqError::EmptyInput)));
695    }
696
697    /// Single character.
698    #[test]
699    fn single_char() {
700        let sa = SuffixArray::new(b"x").expect("non-empty");
701        assert_eq!(sa.sa(), &[0]);
702        assert_eq!(sa.lcp(), vec![0]);
703        assert_eq!(sa.search(b"x"), vec![0]);
704        assert!(sa.search(b"y").is_empty());
705        assert_eq!(sa.distinct_substring_count(), 1);
706    }
707
708    /// The rank array is the exact inverse of the SA.
709    #[test]
710    fn rank_is_inverse() {
711        let mut rng = crate::handle::LcgRng::new(66);
712        for _ in 0..200 {
713            let len = 1 + rng.next_usize(30);
714            let s = random_bytes(&mut rng, b"abc", len);
715            let sa = SuffixArray::new(&s).expect("non-empty");
716            let rank = sa.rank();
717            for (i, &p) in sa.sa().iter().enumerate() {
718                assert_eq!(rank[p], i);
719            }
720        }
721    }
722}
oxicuda_seq/string/suffix_array.rs

oxicuda_seq/string/
suffix_array.rs