Skip to main content

cyanea_seq/
suffix.rs

1//! Suffix array construction via the SA-IS algorithm.
2//!
3//! Implements the induced-sorting algorithm of Nong, Zhang & Chan (2009)
4//! for O(n) suffix array construction. Provides binary search for
5//! pattern matching in O(m log n).
6
7/// A suffix array built from a text.
8///
9/// The suffix array stores the starting positions of all suffixes of the
10/// input text, sorted in lexicographic order. This enables efficient
11/// substring search via binary search.
12#[derive(Debug, Clone)]
13pub struct SuffixArray {
14    sa: Vec<usize>,
15}
16
17// ---------------------------------------------------------------------------
18// SA-IS implementation
19// ---------------------------------------------------------------------------
20
21/// Suffix type: S-type or L-type.
22#[derive(Debug, Clone, Copy, PartialEq, Eq)]
23enum SuffixType {
24    S,
25    L,
26}
27
28/// Classify each position as S-type or L-type.
29///
30/// A suffix at position i is S-type if text[i] < text[i+1], or
31/// text[i] == text[i+1] and suffix[i+1] is S-type. The last
32/// position (sentinel) is always S-type.
33fn classify_suffixes(text: &[usize]) -> Vec<SuffixType> {
34    let n = text.len();
35    if n == 0 {
36        return vec![];
37    }
38    let mut types = vec![SuffixType::S; n];
39    // Last character (sentinel) is S-type
40    types[n - 1] = SuffixType::S;
41
42    if n > 1 {
43        // Scan right to left
44        for i in (0..n - 1).rev() {
45            if text[i] > text[i + 1] {
46                types[i] = SuffixType::L;
47            } else if text[i] < text[i + 1] {
48                types[i] = SuffixType::S;
49            } else {
50                types[i] = types[i + 1];
51            }
52        }
53    }
54
55    types
56}
57
58/// Check if position i is an LMS (leftmost S-type) suffix.
59#[inline]
60fn is_lms(types: &[SuffixType], i: usize) -> bool {
61    i > 0 && types[i] == SuffixType::S && types[i - 1] == SuffixType::L
62}
63
64/// Get the bucket sizes for each character in the alphabet.
65fn get_bucket_sizes(text: &[usize], alphabet_size: usize) -> Vec<usize> {
66    let mut buckets = vec![0usize; alphabet_size];
67    for &c in text {
68        buckets[c] += 1;
69    }
70    buckets
71}
72
73/// Get the start positions of each bucket.
74fn get_bucket_starts(bucket_sizes: &[usize]) -> Vec<usize> {
75    let mut starts = vec![0usize; bucket_sizes.len()];
76    let mut sum = 0;
77    for (i, &size) in bucket_sizes.iter().enumerate() {
78        starts[i] = sum;
79        sum += size;
80    }
81    starts
82}
83
84/// Get the end positions (exclusive) of each bucket.
85fn get_bucket_ends(bucket_sizes: &[usize]) -> Vec<usize> {
86    let mut ends = vec![0usize; bucket_sizes.len()];
87    let mut sum = 0;
88    for (i, &size) in bucket_sizes.iter().enumerate() {
89        sum += size;
90        ends[i] = sum;
91    }
92    ends
93}
94
95/// Place LMS suffixes into their bucket tails.
96fn place_lms(
97    sa: &mut [usize],
98    text: &[usize],
99    _types: &[SuffixType],
100    bucket_sizes: &[usize],
101    lms_positions: &[usize],
102) {
103    let n = sa.len();
104    let sentinel = n; // use n as "empty" marker
105    for slot in sa.iter_mut() {
106        *slot = sentinel;
107    }
108
109    let mut tails = get_bucket_ends(bucket_sizes);
110
111    // Place LMS suffixes at the end of their buckets, in reverse order
112    for &pos in lms_positions.iter().rev() {
113        let c = text[pos];
114        tails[c] -= 1;
115        sa[tails[c]] = pos;
116    }
117}
118
119/// Induce L-type suffixes from left to right.
120fn induce_l(
121    sa: &mut [usize],
122    text: &[usize],
123    types: &[SuffixType],
124    bucket_sizes: &[usize],
125) {
126    let n = sa.len();
127    let sentinel = n;
128    let mut heads = get_bucket_starts(bucket_sizes);
129
130    for i in 0..n {
131        if sa[i] == sentinel || sa[i] == 0 {
132            continue;
133        }
134        let j = sa[i] - 1;
135        if types[j] == SuffixType::L {
136            let c = text[j];
137            sa[heads[c]] = j;
138            heads[c] += 1;
139        }
140    }
141}
142
143/// Induce S-type suffixes from right to left.
144fn induce_s(
145    sa: &mut [usize],
146    text: &[usize],
147    types: &[SuffixType],
148    bucket_sizes: &[usize],
149) {
150    let n = sa.len();
151    let sentinel = n;
152    let mut tails = get_bucket_ends(bucket_sizes);
153
154    for i in (0..n).rev() {
155        if sa[i] == sentinel || sa[i] == 0 {
156            continue;
157        }
158        let j = sa[i] - 1;
159        if types[j] == SuffixType::S {
160            let c = text[j];
161            tails[c] -= 1;
162            sa[tails[c]] = j;
163        }
164    }
165}
166
167/// Check if two LMS substrings are equal.
168fn lms_substrings_equal(
169    text: &[usize],
170    types: &[SuffixType],
171    pos1: usize,
172    pos2: usize,
173) -> bool {
174    let n = text.len();
175
176    // Both must be valid positions
177    if pos1 >= n || pos2 >= n {
178        return false;
179    }
180
181    let mut i = 0;
182    loop {
183        let end1 = i > 0 && is_lms(types, pos1 + i);
184        let end2 = i > 0 && is_lms(types, pos2 + i);
185
186        if end1 && end2 {
187            return true; // Both ended at same relative position
188        }
189        if end1 != end2 {
190            return false; // One ended before the other
191        }
192        if text[pos1 + i] != text[pos2 + i] {
193            return false;
194        }
195        if types[pos1 + i] != types[pos2 + i] {
196            return false;
197        }
198
199        i += 1;
200
201        if pos1 + i >= n || pos2 + i >= n {
202            return false;
203        }
204    }
205}
206
207/// Core SA-IS algorithm on integer alphabet.
208fn sais(text: &[usize], alphabet_size: usize) -> Vec<usize> {
209    let n = text.len();
210
211    if n == 0 {
212        return vec![];
213    }
214    if n == 1 {
215        return vec![0];
216    }
217    if n == 2 {
218        if text[0] <= text[1] {
219            return vec![0, 1];
220        } else {
221            return vec![1, 0];
222        }
223    }
224
225    // Step 1: Classify suffixes
226    let types = classify_suffixes(text);
227
228    // Step 2: Find LMS positions
229    let lms_positions: Vec<usize> = (0..n).filter(|&i| is_lms(&types, i)).collect();
230
231    // Step 3: Get bucket sizes
232    let bucket_sizes = get_bucket_sizes(text, alphabet_size);
233
234    // Step 4: Initial placement and induced sorting to sort LMS substrings
235    let mut sa = vec![0usize; n];
236    place_lms(&mut sa, text, &types, &bucket_sizes, &lms_positions);
237    induce_l(&mut sa, text, &types, &bucket_sizes);
238    induce_s(&mut sa, text, &types, &bucket_sizes);
239
240    // Step 5: Compact sorted LMS suffixes and assign names
241    let sentinel = n;
242    // Collect the sorted LMS suffixes from sa
243    let sorted_lms: Vec<usize> = sa.iter().copied().filter(|&x| x != sentinel && is_lms(&types, x)).collect();
244
245    // Assign names (ranks) to LMS substrings
246    let mut names = vec![sentinel; n];
247    let mut current_name = 0;
248
249    if !sorted_lms.is_empty() {
250        names[sorted_lms[0]] = current_name;
251        for i in 1..sorted_lms.len() {
252            if !lms_substrings_equal(text, &types, sorted_lms[i - 1], sorted_lms[i]) {
253                current_name += 1;
254            }
255            names[sorted_lms[i]] = current_name;
256        }
257    }
258
259    let num_names = current_name + 1;
260
261    // Step 6: If names are not unique, recurse
262    let lms_count = lms_positions.len();
263    let reduced: Vec<usize> = lms_positions.iter().map(|&pos| names[pos]).collect();
264
265    let sorted_lms_indices = if num_names < lms_count {
266        // Recurse
267        let sub_sa = sais(&reduced, num_names);
268        sub_sa
269    } else {
270        // Names are unique, directly compute order
271        let mut order = vec![0usize; lms_count];
272        for (i, &name) in reduced.iter().enumerate() {
273            order[name] = i;
274        }
275        order
276    };
277
278    // Step 7: Final induced sort using correctly ordered LMS suffixes
279    let sorted_lms_positions: Vec<usize> = sorted_lms_indices
280        .iter()
281        .map(|&i| lms_positions[i])
282        .collect();
283
284    place_lms(&mut sa, text, &types, &bucket_sizes, &sorted_lms_positions);
285    induce_l(&mut sa, text, &types, &bucket_sizes);
286    induce_s(&mut sa, text, &types, &bucket_sizes);
287
288    sa
289}
290
291impl SuffixArray {
292    /// Build a suffix array from the given text using the SA-IS algorithm.
293    ///
294    /// Appends a sentinel character (value 0, lexicographically smallest)
295    /// to ensure correct suffix ordering.
296    ///
297    /// Time complexity: O(n) where n is the length of the text.
298    ///
299    /// # Example
300    ///
301    /// ```
302    /// use cyanea_seq::suffix::SuffixArray;
303    ///
304    /// let sa = SuffixArray::build(b"banana");
305    /// let positions = sa.search(b"banana", b"ana");
306    /// assert_eq!(positions.len(), 2);
307    /// ```
308    pub fn build(text: &[u8]) -> Self {
309        if text.is_empty() {
310            return Self { sa: vec![0] }; // Just the sentinel
311        }
312
313        // Map bytes to integer alphabet with sentinel = 0
314        // We need to map the byte values to a compact integer alphabet.
315        // Sentinel gets value 0, all other characters get values 1..=alphabet_size
316        let mut chars_present = [false; 256];
317        for &b in text {
318            chars_present[b as usize] = true;
319        }
320
321        // Create mapping: byte value -> compact integer (1-indexed, 0 is sentinel)
322        let mut mapping = [0usize; 256];
323        let mut next_id = 1usize;
324        for i in 0..256 {
325            if chars_present[i] {
326                mapping[i] = next_id;
327                next_id += 1;
328            }
329        }
330        let alphabet_size = next_id; // includes sentinel
331
332        // Build integer text with sentinel appended
333        let mut int_text: Vec<usize> = Vec::with_capacity(text.len() + 1);
334        for &b in text {
335            int_text.push(mapping[b as usize]);
336        }
337        int_text.push(0); // sentinel
338
339        let sa = sais(&int_text, alphabet_size);
340
341        Self { sa }
342    }
343
344    /// Search for all occurrences of `pattern` in `text`.
345    ///
346    /// The `text` must be the same text used to build the suffix array
347    /// (without the sentinel). Returns positions in arbitrary order.
348    ///
349    /// Time complexity: O(m log n) where m is the pattern length and n is the text length.
350    pub fn search(&self, text: &[u8], pattern: &[u8]) -> Vec<usize> {
351        if pattern.is_empty() || text.is_empty() {
352            return vec![];
353        }
354
355        let n = text.len();
356
357        // The SA includes the sentinel position, so sa.len() == text.len() + 1
358        // We need to compare suffixes against the pattern.
359
360        // Binary search for the leftmost occurrence
361        let left = self.lower_bound(text, pattern);
362        let right = self.upper_bound(text, pattern);
363
364        if left >= right {
365            return vec![];
366        }
367
368        let mut positions: Vec<usize> = (left..right)
369            .map(|i| self.sa[i])
370            .filter(|&pos| pos < n) // exclude sentinel position
371            .collect();
372        positions.sort_unstable();
373        positions
374    }
375
376    /// Find the leftmost SA index where the suffix >= pattern.
377    fn lower_bound(&self, text: &[u8], pattern: &[u8]) -> usize {
378        let mut lo = 0;
379        let mut hi = self.sa.len();
380
381        while lo < hi {
382            let mid = lo + (hi - lo) / 2;
383            let pos = self.sa[mid];
384            let suffix = if pos < text.len() {
385                &text[pos..]
386            } else {
387                &[] // sentinel
388            };
389
390            if compare_prefix(suffix, pattern) == std::cmp::Ordering::Less {
391                lo = mid + 1;
392            } else {
393                hi = mid;
394            }
395        }
396
397        lo
398    }
399
400    /// Find the first SA index where the suffix > pattern (as a prefix).
401    fn upper_bound(&self, text: &[u8], pattern: &[u8]) -> usize {
402        let mut lo = 0;
403        let mut hi = self.sa.len();
404
405        while lo < hi {
406            let mid = lo + (hi - lo) / 2;
407            let pos = self.sa[mid];
408            let suffix = if pos < text.len() {
409                &text[pos..]
410            } else {
411                &[] // sentinel
412            };
413
414            if compare_prefix(suffix, pattern) == std::cmp::Ordering::Greater {
415                hi = mid;
416            } else {
417                lo = mid + 1;
418            }
419        }
420
421        lo
422    }
423
424    /// Number of entries in the suffix array (text length + 1 for sentinel).
425    pub fn len(&self) -> usize {
426        self.sa.len()
427    }
428}
429
430/// Compare a suffix against a pattern, treating the pattern as a prefix.
431///
432/// Returns Ordering::Equal if the suffix starts with the pattern.
433fn compare_prefix(suffix: &[u8], pattern: &[u8]) -> std::cmp::Ordering {
434    let len = suffix.len().min(pattern.len());
435    match suffix[..len].cmp(&pattern[..len]) {
436        std::cmp::Ordering::Equal => {
437            if suffix.len() >= pattern.len() {
438                std::cmp::Ordering::Equal
439            } else {
440                std::cmp::Ordering::Less
441            }
442        }
443        other => other,
444    }
445}
446
447#[cfg(test)]
448mod tests {
449    use super::*;
450
451    #[test]
452    fn build_banana() {
453        let text = b"banana";
454        let sa = SuffixArray::build(text);
455        // SA should have 7 entries (6 chars + sentinel)
456        assert_eq!(sa.len(), 7);
457
458        // Verify suffixes are sorted
459        let suffixes: Vec<&[u8]> = sa
460            .sa
461            .iter()
462            .map(|&pos| {
463                if pos < text.len() {
464                    &text[pos..]
465                } else {
466                    b"" as &[u8] // sentinel
467                }
468            })
469            .collect();
470
471        for i in 1..suffixes.len() {
472            assert!(
473                suffixes[i - 1] <= suffixes[i],
474                "suffixes not sorted at {}: {:?} > {:?}",
475                i,
476                std::str::from_utf8(suffixes[i - 1]),
477                std::str::from_utf8(suffixes[i])
478            );
479        }
480    }
481
482    #[test]
483    fn search_banana_ana() {
484        let text = b"banana";
485        let sa = SuffixArray::build(text);
486        let mut positions = sa.search(text, b"ana");
487        positions.sort();
488        assert_eq!(positions, vec![1, 3]);
489    }
490
491    #[test]
492    fn search_banana_ban() {
493        let text = b"banana";
494        let sa = SuffixArray::build(text);
495        let positions = sa.search(text, b"ban");
496        assert_eq!(positions, vec![0]);
497    }
498
499    #[test]
500    fn search_banana_a() {
501        let text = b"banana";
502        let sa = SuffixArray::build(text);
503        let mut positions = sa.search(text, b"a");
504        positions.sort();
505        assert_eq!(positions, vec![1, 3, 5]);
506    }
507
508    #[test]
509    fn search_no_match() {
510        let text = b"banana";
511        let sa = SuffixArray::build(text);
512        let positions = sa.search(text, b"xyz");
513        assert!(positions.is_empty());
514    }
515
516    #[test]
517    fn search_full_text() {
518        let text = b"banana";
519        let sa = SuffixArray::build(text);
520        let positions = sa.search(text, b"banana");
521        assert_eq!(positions, vec![0]);
522    }
523
524    #[test]
525    fn search_empty_pattern() {
526        let text = b"banana";
527        let sa = SuffixArray::build(text);
528        let positions = sa.search(text, b"");
529        assert!(positions.is_empty());
530    }
531
532    #[test]
533    fn build_single_char() {
534        let text = b"a";
535        let sa = SuffixArray::build(text);
536        assert_eq!(sa.len(), 2); // 'a' + sentinel
537        let positions = sa.search(text, b"a");
538        assert_eq!(positions, vec![0]);
539    }
540
541    #[test]
542    fn build_repeated_chars() {
543        let text = b"aaaa";
544        let sa = SuffixArray::build(text);
545        assert_eq!(sa.len(), 5);
546
547        // Verify sorted
548        let suffixes: Vec<&[u8]> = sa
549            .sa
550            .iter()
551            .map(|&pos| {
552                if pos < text.len() {
553                    &text[pos..]
554                } else {
555                    b"" as &[u8]
556                }
557            })
558            .collect();
559
560        for i in 1..suffixes.len() {
561            assert!(suffixes[i - 1] <= suffixes[i]);
562        }
563
564        let mut positions = sa.search(text, b"aa");
565        positions.sort();
566        assert_eq!(positions, vec![0, 1, 2]);
567    }
568
569    #[test]
570    fn build_dna_sequence() {
571        let text = b"ACGTACGT";
572        let sa = SuffixArray::build(text);
573
574        let mut positions = sa.search(text, b"ACG");
575        positions.sort();
576        assert_eq!(positions, vec![0, 4]);
577
578        let positions = sa.search(text, b"TACG");
579        assert_eq!(positions, vec![3]);
580    }
581
582    #[test]
583    fn build_empty_text() {
584        let text = b"";
585        let sa = SuffixArray::build(text);
586        assert_eq!(sa.len(), 1); // just sentinel
587        let positions = sa.search(text, b"a");
588        assert!(positions.is_empty());
589    }
590
591    #[test]
592    fn search_longer_than_text() {
593        let text = b"abc";
594        let sa = SuffixArray::build(text);
595        let positions = sa.search(text, b"abcdefg");
596        assert!(positions.is_empty());
597    }
598
599    #[test]
600    fn build_mississippi() {
601        let text = b"mississippi";
602        let sa = SuffixArray::build(text);
603
604        // Verify sorted
605        let suffixes: Vec<&[u8]> = sa
606            .sa
607            .iter()
608            .map(|&pos| {
609                if pos < text.len() {
610                    &text[pos..]
611                } else {
612                    b"" as &[u8]
613                }
614            })
615            .collect();
616        for i in 1..suffixes.len() {
617            assert!(
618                suffixes[i - 1] <= suffixes[i],
619                "not sorted at {}: {:?} vs {:?}",
620                i,
621                std::str::from_utf8(suffixes[i - 1]),
622                std::str::from_utf8(suffixes[i])
623            );
624        }
625
626        let mut positions = sa.search(text, b"issi");
627        positions.sort();
628        assert_eq!(positions, vec![1, 4]);
629
630        let mut positions = sa.search(text, b"ss");
631        positions.sort();
632        assert_eq!(positions, vec![2, 5]);
633    }
634}