Skip to main content

oxicuda_seq/string/
suffix_automaton.rs

1//! Suffix automaton (directed acyclic word graph, DAWG).
2//!
3//! References:
4//! * Anselm Blumer, Janet Blumer, David Haussler, Andrzej Ehrenfeucht, M. T.
5//!   Chen & Joel Seiferas, *"The smallest automaton recognizing the subwords of
6//!   a text"*, Theoretical Computer Science 40, 1985, pp. 31–55.
7//! * Maxime Crochemore, *"Transducers and repetitions"*, TCS 45, 1986 — the
8//!   linear online construction with state cloning on split, used here.
9//!
10//! # What a suffix automaton is
11//!
12//! The suffix automaton of a string `s` is the smallest deterministic finite
13//! automaton (with at most `2|s| − 1` states for `|s| ≥ 2`) whose recognised
14//! language is *exactly* the set of substrings of `s`. Every state `v` carries:
15//!
16//! * `len[v]` — the length of the **longest** string in `v`'s
17//!   *endpos-equivalence class* (two substrings are equivalent when they end at
18//!   the same set of positions in `s`), and
19//! * `link[v]` — the **suffix link** to the state of the longest proper suffix
20//!   of `v`'s strings that lies in a *different* endpos class.
21//!
22//! The strings recognised at state `v` are precisely the suffixes of its
23//! longest string whose lengths lie in `(len[link[v]], len[v]]`.
24//!
25//! # Online construction
26//!
27//! Characters are appended one at a time. Extending by `c` creates a new state
28//! `cur` with `len[cur] = len[last] + 1` and walks the suffix-link chain from
29//! `last` adding `c`-transitions to `cur`. The interesting case is **cloning**:
30//! when the chain reaches a state `p` that already has a `c`-transition to some
31//! `q` with `len[q] != len[p] + 1`, a copy `clone` of `q` is inserted with
32//! `len[clone] = len[p] + 1`, redirecting the affected transitions and suffix
33//! links. This split is what keeps the automaton minimal; it is exercised
34//! directly by the tests (e.g. `"abcbc"`).
35//!
36//! # Provided queries
37//!
38//! * [`SuffixAutomaton::contains`] — substring membership.
39//! * [`SuffixAutomaton::distinct_substring_count`] — number of distinct
40//!   non-empty substrings, `Σ_v (len[v] − len[link[v]])`.
41//! * [`SuffixAutomaton::occurrences`] — number of (possibly overlapping)
42//!   occurrences of a pattern, via the size of each state's endpos set.
43//! * [`longest_common_substring`] — the longest common substring of two strings
44//!   by running the automaton of one over the other.
45//!
46//! Input is treated as raw bytes (`&[u8]`); for ASCII this matches the usual
47//! character-level notion of substring.
48
49use crate::error::{SeqError, SeqResult};
50use std::collections::BTreeMap;
51
52/// Sentinel suffix link for the initial state (it has no parent).
53const NO_LINK: isize = -1;
54
55/// A single state of the suffix automaton.
56#[derive(Debug, Clone)]
57struct State {
58    /// Length of the longest string in this state's endpos class.
59    len: usize,
60    /// Suffix link, or [`NO_LINK`] for the initial state.
61    link: isize,
62    /// Outgoing transitions keyed by byte. A `BTreeMap` keeps the structure
63    /// compact for sparse alphabets while remaining deterministic to iterate.
64    next: BTreeMap<u8, usize>,
65    /// `endpos`-set size contribution used for occurrence counting. Original
66    /// (non-clone) states are created with `cnt = 1`; clones with `cnt = 0`.
67    /// After construction these are summed along suffix links in
68    /// reverse-`len` order so that `cnt[v]` becomes `|endpos(v)|`.
69    cnt: usize,
70    /// `true` if this state was produced by a clone-on-split. Recorded so the
71    /// clone path is observable and testable.
72    is_clone: bool,
73}
74
75impl State {
76    fn new(len: usize, link: isize) -> Self {
77        Self {
78            len,
79            link,
80            next: BTreeMap::new(),
81            cnt: 0,
82            is_clone: false,
83        }
84    }
85}
86
87/// A suffix automaton (DAWG) built online from a byte string.
88///
89/// Construct it with [`SuffixAutomaton::new`]; the build is `O(n log |Σ|)` for
90/// the `BTreeMap` transition tables (`O(n |Σ|)` with array tables). The
91/// automaton stores the length of the source string so that occurrence counts
92/// and the distinct-substring total can be reported directly.
93///
94/// # Examples
95///
96/// ```
97/// use oxicuda_seq::string::SuffixAutomaton;
98///
99/// let sam = SuffixAutomaton::new(b"abcbc");
100/// assert!(sam.contains(b"bcb"));
101/// assert!(!sam.contains(b"acb"));
102/// // "abcbc" has 12 distinct non-empty substrings.
103/// assert_eq!(sam.distinct_substring_count(), 12);
104/// ```
105#[derive(Debug, Clone)]
106pub struct SuffixAutomaton {
107    states: Vec<State>,
108    /// Index of the initial state (always 0).
109    init: usize,
110    /// Index of the state reached after consuming the whole source string.
111    last: usize,
112    /// Length of the source string in bytes.
113    source_len: usize,
114    /// Lazily-computed `cnt` finalisation flag for occurrence counting.
115    counts_finalised: bool,
116}
117
118impl SuffixAutomaton {
119    /// Build the suffix automaton of `s`.
120    ///
121    /// An empty `s` yields an automaton with only the initial state; all
122    /// queries then behave consistently (no non-empty substring exists, the
123    /// distinct count is zero).
124    pub fn new(s: &[u8]) -> Self {
125        // Pre-reserve the worst-case 2n state budget.
126        let mut states: Vec<State> = Vec::with_capacity(2 * s.len().max(1));
127        states.push(State::new(0, NO_LINK)); // initial state
128
129        let mut sam = Self {
130            states,
131            init: 0,
132            last: 0,
133            source_len: 0,
134            counts_finalised: false,
135        };
136
137        for &c in s {
138            sam.extend(c);
139        }
140        sam.source_len = s.len();
141        sam
142    }
143
144    /// Append a single byte `c`, growing the automaton (Crochemore online step).
145    fn extend(&mut self, c: u8) {
146        let cur = self.states.len();
147        {
148            let cur_len = self.states[self.last].len + 1;
149            let mut st = State::new(cur_len, NO_LINK);
150            st.cnt = 1; // a genuine new prefix state ends at one position
151            self.states.push(st);
152        }
153
154        // Walk the suffix-link chain from `last`, adding c-transitions to `cur`
155        // until we reach the initial state or a state that already has one.
156        let mut p: isize = self.last as isize;
157        while p != NO_LINK && !self.states[p as usize].next.contains_key(&c) {
158            self.states[p as usize].next.insert(c, cur);
159            p = self.states[p as usize].link;
160        }
161
162        if p == NO_LINK {
163            // Reached the initial state without an existing c-edge.
164            self.states[cur].link = self.init as isize;
165        } else {
166            let q = self.states[p as usize].next[&c];
167            if self.states[p as usize].len + 1 == self.states[q].len {
168                // `q` is the right state already.
169                self.states[cur].link = q as isize;
170            } else {
171                // --- Clone-on-split. ---
172                let clone = self.states.len();
173                {
174                    let q_state = &self.states[q];
175                    let mut cl = State::new(self.states[p as usize].len + 1, q_state.link);
176                    cl.next = q_state.next.clone();
177                    cl.is_clone = true;
178                    cl.cnt = 0; // a clone contributes no fresh endpos by itself
179                    self.states.push(cl);
180                }
181
182                // Redirect the c-edges that pointed at `q` (and lay on the chain
183                // continuing from `p`) to the clone.
184                while p != NO_LINK {
185                    match self.states[p as usize].next.get(&c) {
186                        Some(&target) if target == q => {
187                            self.states[p as usize].next.insert(c, clone);
188                            p = self.states[p as usize].link;
189                        }
190                        _ => break,
191                    }
192                }
193
194                self.states[q].link = clone as isize;
195                self.states[cur].link = clone as isize;
196            }
197        }
198
199        self.last = cur;
200    }
201
202    /// Finalise the endpos-set sizes (`cnt`) used for occurrence counting.
203    ///
204    /// Sums `cnt` from longer to shorter states along suffix links. Idempotent:
205    /// after the first call the flag short-circuits further work. Implemented as
206    /// an internal mutation guarded by `&mut self`; callers reach it through
207    /// [`occurrences`](Self::occurrences) which borrows mutably.
208    fn finalise_counts(&mut self) {
209        if self.counts_finalised {
210            return;
211        }
212        // Counting sort of state indices by `len` (max len = source_len).
213        let max_len = self.source_len;
214        let mut bucket = vec![0usize; max_len + 2];
215        for st in &self.states {
216            bucket[st.len] += 1;
217        }
218        for i in 1..bucket.len() {
219            bucket[i] += bucket[i - 1];
220        }
221        let mut order = vec![0usize; self.states.len()];
222        // Stable placement; iterate states in index order, fill from the back.
223        for (idx, st) in self.states.iter().enumerate() {
224            bucket[st.len] -= 1;
225            order[bucket[st.len]] = idx;
226        }
227
228        // Process in decreasing `len`, pushing counts up the suffix link.
229        for &v in order.iter().rev() {
230            let link = self.states[v].link;
231            if link != NO_LINK {
232                let add = self.states[v].cnt;
233                self.states[link as usize].cnt += add;
234            }
235        }
236        self.counts_finalised = true;
237    }
238
239    /// Walk the automaton on `pattern`, returning the index of the state reached
240    /// after consuming all of it, or `None` if `pattern` is not a substring.
241    fn walk(&self, pattern: &[u8]) -> Option<usize> {
242        let mut state = self.init;
243        for &c in pattern {
244            match self.states[state].next.get(&c) {
245                Some(&next) => state = next,
246                None => return None,
247            }
248        }
249        Some(state)
250    }
251
252    /// Return `true` iff `pattern` is a substring of the source string.
253    ///
254    /// The empty pattern is a substring of every string (including the empty
255    /// string), so this returns `true` for an empty `pattern`.
256    pub fn contains(&self, pattern: &[u8]) -> bool {
257        self.walk(pattern).is_some()
258    }
259
260    /// Number of **distinct** non-empty substrings of the source string.
261    ///
262    /// Equals `Σ_v (len[v] − len[link[v]])` over all non-initial states `v`,
263    /// because each state recognises exactly `len[v] − len[link[v]]` distinct
264    /// substrings (the suffixes of its longest string down to, but excluding,
265    /// its suffix-link length).
266    pub fn distinct_substring_count(&self) -> usize {
267        let mut total = 0usize;
268        for (idx, st) in self.states.iter().enumerate() {
269            if idx == self.init {
270                continue;
271            }
272            let link_len = if st.link == NO_LINK {
273                0
274            } else {
275                self.states[st.link as usize].len
276            };
277            total += st.len - link_len;
278        }
279        total
280    }
281
282    /// Number of (possibly overlapping) occurrences of `pattern` in the source.
283    ///
284    /// Returns `0` when `pattern` is not a substring. The empty pattern is
285    /// reported as occurring `source_len + 1` times (once at each gap), matching
286    /// the usual convention.
287    ///
288    /// This finalises the endpos counters on first use (and caches them), hence
289    /// the `&mut self` receiver.
290    pub fn occurrences(&mut self, pattern: &[u8]) -> usize {
291        if pattern.is_empty() {
292            return self.source_len + 1;
293        }
294        self.finalise_counts();
295        match self.walk(pattern) {
296            Some(state) => self.states[state].cnt,
297            None => 0,
298        }
299    }
300
301    /// Number of states in the automaton, including the initial state.
302    pub fn state_count(&self) -> usize {
303        self.states.len()
304    }
305
306    /// Number of clone states produced during construction.
307    ///
308    /// Exposed so tests can confirm that an input which forces a split actually
309    /// triggered the clone path.
310    pub fn clone_count(&self) -> usize {
311        self.states.iter().filter(|s| s.is_clone).count()
312    }
313
314    /// Length of the source string the automaton was built from.
315    pub fn source_len(&self) -> usize {
316        self.source_len
317    }
318}
319
320/// Longest common substring of `a` and `b`.
321///
322/// Builds the suffix automaton of `a` and streams `b` through it, tracking the
323/// length of the current match and resetting via suffix links on a mismatch.
324/// Runs in `O(|a| + |b|)` after the `O(|a|)` construction. Returns the
325/// `(start_in_b, length)` of a longest common substring; on ties the match
326/// ending earliest in `b` is returned. A length of `0` means the two strings
327/// share no character.
328///
329/// # Errors
330///
331/// Returns [`SeqError::EmptyInput`] if **either** input is empty, since the
332/// longest common substring is then trivially empty and `start` is undefined.
333/// Callers that prefer to treat that as length `0` can check emptiness first.
334pub fn longest_common_substring(a: &[u8], b: &[u8]) -> SeqResult<(usize, usize)> {
335    if a.is_empty() || b.is_empty() {
336        return Err(SeqError::EmptyInput);
337    }
338
339    let sam = SuffixAutomaton::new(a);
340
341    let mut state = sam.init;
342    let mut length = 0usize; // current matched suffix length
343    let mut best_len = 0usize;
344    let mut best_end = 0usize; // index in `b` one past the best match
345
346    for (i, &c) in b.iter().enumerate() {
347        // Try to extend the current match by `c`.
348        loop {
349            if let Some(&next) = sam.states[state].next.get(&c) {
350                state = next;
351                length += 1;
352                break;
353            }
354            // Mismatch: follow suffix links until a c-edge exists or we hit the
355            // root, shrinking the matched length to that state's `len`.
356            if sam.states[state].link == NO_LINK {
357                // Back at the initial state; cannot match `c` here.
358                length = 0;
359                break;
360            }
361            state = sam.states[state].link as usize;
362            length = sam.states[state].len;
363        }
364
365        if length > best_len {
366            best_len = length;
367            best_end = i + 1;
368        }
369    }
370
371    // Recover the start position in `b`.
372    let start = best_end - best_len;
373    Ok((start, best_len))
374}
375
376#[cfg(test)]
377mod tests {
378    use super::*;
379    use crate::handle::LcgRng;
380    use std::collections::HashSet;
381
382    /// Brute-force set of all distinct non-empty substrings of `s`.
383    fn brute_distinct(s: &[u8]) -> HashSet<Vec<u8>> {
384        let n = s.len();
385        let mut set = HashSet::new();
386        for i in 0..n {
387            for j in (i + 1)..=n {
388                set.insert(s[i..j].to_vec());
389            }
390        }
391        set
392    }
393
394    /// Brute-force count of (overlapping) occurrences of `pat` in `s`.
395    fn brute_occurrences(s: &[u8], pat: &[u8]) -> usize {
396        if pat.is_empty() {
397            return s.len() + 1;
398        }
399        if pat.len() > s.len() {
400            return 0;
401        }
402        (0..=(s.len() - pat.len()))
403            .filter(|&start| &s[start..start + pat.len()] == pat)
404            .count()
405    }
406
407    /// Brute-force longest common substring length (DP).
408    fn brute_lcs_len(a: &[u8], b: &[u8]) -> usize {
409        let (m, n) = (a.len(), b.len());
410        if m == 0 || n == 0 {
411            return 0;
412        }
413        let mut prev = vec![0usize; n + 1];
414        let mut curr = vec![0usize; n + 1];
415        let mut best = 0usize;
416        for i in 1..=m {
417            for j in 1..=n {
418                curr[j] = if a[i - 1] == b[j - 1] {
419                    prev[j - 1] + 1
420                } else {
421                    0
422                };
423                best = best.max(curr[j]);
424            }
425            std::mem::swap(&mut prev, &mut curr);
426            for v in curr.iter_mut() {
427                *v = 0;
428            }
429        }
430        best
431    }
432
433    fn random_bytes(rng: &mut LcgRng, alphabet: &[u8], len: usize) -> Vec<u8> {
434        (0..len)
435            .map(|_| alphabet[rng.next_usize(alphabet.len())])
436            .collect()
437    }
438
439    /// (a) Distinct-substring count equals brute force, including the
440    /// split-forcing "abcbc", "aaaa", and "abracadabra".
441    #[test]
442    fn distinct_count_matches_brute_force() {
443        for s in [
444            b"abcbc".as_slice(),
445            b"aaaa",
446            b"abracadabra",
447            b"banana",
448            b"mississippi",
449            b"a",
450            b"ab",
451        ] {
452            let sam = SuffixAutomaton::new(s);
453            let got = sam.distinct_substring_count();
454            let expect = brute_distinct(s).len();
455            assert_eq!(got, expect, "distinct count for {s:?}");
456        }
457
458        // Randomised.
459        let mut rng = LcgRng::new(99);
460        for &alphabet in &[b"ab".as_slice(), b"abc"] {
461            for _ in 0..300 {
462                let len = rng.next_usize(18);
463                let s = random_bytes(&mut rng, alphabet, len);
464                let sam = SuffixAutomaton::new(&s);
465                assert_eq!(
466                    sam.distinct_substring_count(),
467                    brute_distinct(&s).len(),
468                    "random distinct count for {s:?}"
469                );
470            }
471        }
472    }
473
474    /// (b) Membership: positives AND negatives versus brute force.
475    #[test]
476    fn membership_positive_and_negative() {
477        let s = b"abracadabra";
478        let sam = SuffixAutomaton::new(s);
479        let all = brute_distinct(s);
480
481        // Every real substring is contained.
482        for sub in &all {
483            assert!(sam.contains(sub), "should contain {sub:?}");
484        }
485
486        // Some non-substrings are rejected.
487        for bad in [
488            b"xyz".as_slice(),
489            b"aa",
490            b"brc",
491            b"cadabrra",
492            b"z",
493            b"abrad",
494        ] {
495            let present = all.contains(bad);
496            assert_eq!(sam.contains(bad), present, "membership of {bad:?}");
497        }
498
499        // Empty pattern is always contained.
500        assert!(sam.contains(b""));
501
502        // Randomised positive/negative probing.
503        let mut rng = LcgRng::new(7);
504        for _ in 0..200 {
505            let len = 1 + rng.next_usize(12);
506            let text = random_bytes(&mut rng, b"abc", len);
507            let sam = SuffixAutomaton::new(&text);
508            let set = brute_distinct(&text);
509            // Probe a handful of random short strings.
510            for _ in 0..8 {
511                let qlen = 1 + rng.next_usize(5);
512                let q = random_bytes(&mut rng, b"abcd", qlen); // 'd' forces misses
513                assert_eq!(
514                    sam.contains(&q),
515                    set.contains(&q),
516                    "probe {q:?} against {text:?}"
517                );
518            }
519        }
520    }
521
522    /// (c) Longest common substring matches the DP oracle.
523    #[test]
524    fn lcs_matches_dp() {
525        let cases: &[(&[u8], &[u8])] = &[
526            (b"abcde", b"cdefg"),
527            (b"abcbc", b"bcbcd"),
528            (b"banana", b"ananas"),
529            (b"xxxx", b"yyyy"),
530            (b"hello", b"yellow"),
531            (b"a", b"a"),
532            (b"a", b"b"),
533        ];
534        for &(a, b) in cases {
535            let (start, len) = longest_common_substring(a, b).expect("non-empty");
536            let expect = brute_lcs_len(a, b);
537            assert_eq!(len, expect, "lcs length for {a:?},{b:?}");
538            // The reported slice of `b` is genuinely a substring of `a`.
539            let sub = &b[start..start + len];
540            let sam = SuffixAutomaton::new(a);
541            assert!(sam.contains(sub), "lcs slice {sub:?} must occur in {a:?}");
542        }
543
544        // Randomised cross-check.
545        let mut rng = LcgRng::new(2024);
546        for _ in 0..300 {
547            let la = 1 + rng.next_usize(14);
548            let lb = 1 + rng.next_usize(14);
549            let a = random_bytes(&mut rng, b"abc", la);
550            let b = random_bytes(&mut rng, b"abc", lb);
551            let (start, len) = longest_common_substring(&a, &b).expect("non-empty");
552            assert_eq!(len, brute_lcs_len(&a, &b), "random lcs {a:?},{b:?}");
553            let sub = &b[start..start + len];
554            let sam = SuffixAutomaton::new(&a);
555            assert!(sam.contains(sub), "random lcs slice {sub:?} in {a:?}");
556        }
557    }
558
559    /// (d) Clone-state correctness: "abcbc" forces a split, and all queries
560    /// still agree with brute force.
561    #[test]
562    fn clone_split_is_exercised() {
563        let sam = SuffixAutomaton::new(b"abcbc");
564        // The split path must have produced at least one clone.
565        assert!(sam.clone_count() >= 1, "abcbc must force a clone");
566        // State count stays within the 2n-1 bound.
567        assert!(sam.state_count() <= 2 * 5);
568        // Distinct count is the canonical 12 for "abcbc".
569        assert_eq!(sam.distinct_substring_count(), 12);
570        assert_eq!(
571            sam.distinct_substring_count(),
572            brute_distinct(b"abcbc").len()
573        );
574
575        // Another classic splitter.
576        let sam2 = SuffixAutomaton::new(b"abcbcba");
577        assert!(sam2.clone_count() >= 1);
578        assert_eq!(
579            sam2.distinct_substring_count(),
580            brute_distinct(b"abcbcba").len()
581        );
582    }
583
584    /// (e) Occurrence counting matches brute force (overlapping occurrences).
585    #[test]
586    fn occurrence_counting_matches_brute_force() {
587        let s = b"abracadabra";
588        let mut sam = SuffixAutomaton::new(s);
589        for pat in [
590            b"a".as_slice(),
591            b"ab",
592            b"abra",
593            b"bra",
594            b"ra",
595            b"cad",
596            b"z",
597            b"abrad",
598        ] {
599            assert_eq!(
600                sam.occurrences(pat),
601                brute_occurrences(s, pat),
602                "occurrences of {pat:?}"
603            );
604        }
605        // Overlapping case: "aa" in "aaaa" occurs 3 times.
606        let mut sam_a = SuffixAutomaton::new(b"aaaa");
607        assert_eq!(sam_a.occurrences(b"aa"), 3);
608        assert_eq!(sam_a.occurrences(b"aa"), brute_occurrences(b"aaaa", b"aa"));
609        assert_eq!(sam_a.occurrences(b"aaaa"), 1);
610
611        // Empty pattern occurs source_len + 1 times.
612        assert_eq!(sam_a.occurrences(b""), 5);
613
614        // Randomised occurrence cross-check.
615        let mut rng = LcgRng::new(555);
616        for _ in 0..150 {
617            let len = 1 + rng.next_usize(14);
618            let text = random_bytes(&mut rng, b"ab", len);
619            let mut sam = SuffixAutomaton::new(&text);
620            for _ in 0..6 {
621                let qlen = 1 + rng.next_usize(4);
622                let q = random_bytes(&mut rng, b"abc", qlen);
623                assert_eq!(
624                    sam.occurrences(&q),
625                    brute_occurrences(&text, &q),
626                    "occ {q:?} in {text:?}"
627                );
628            }
629        }
630    }
631
632    /// (f) Empty-string and single-character edge cases.
633    #[test]
634    fn edge_cases_empty_and_single() {
635        // Empty source: only the initial state, no substrings.
636        let sam_empty = SuffixAutomaton::new(b"");
637        assert_eq!(sam_empty.state_count(), 1);
638        assert_eq!(sam_empty.distinct_substring_count(), 0);
639        assert!(sam_empty.contains(b"")); // empty pattern
640        assert!(!sam_empty.contains(b"a"));
641        assert_eq!(sam_empty.source_len(), 0);
642
643        // Single character.
644        let mut sam_one = SuffixAutomaton::new(b"a");
645        assert_eq!(sam_one.distinct_substring_count(), 1);
646        assert!(sam_one.contains(b"a"));
647        assert!(!sam_one.contains(b"b"));
648        assert_eq!(sam_one.occurrences(b"a"), 1);
649        assert_eq!(sam_one.occurrences(b"b"), 0);
650
651        // Empty inputs to LCS are an error.
652        assert!(matches!(
653            longest_common_substring(b"", b"abc"),
654            Err(SeqError::EmptyInput)
655        ));
656        assert!(matches!(
657            longest_common_substring(b"abc", b""),
658            Err(SeqError::EmptyInput)
659        ));
660    }
661}