oxicuda_seq/string/suffix_automaton.rs
1//! Suffix automaton (directed acyclic word graph, DAWG).
2//!
3//! References:
4//! * Anselm Blumer, Janet Blumer, David Haussler, Andrzej Ehrenfeucht, M. T.
5//! Chen & Joel Seiferas, *"The smallest automaton recognizing the subwords of
6//! a text"*, Theoretical Computer Science 40, 1985, pp. 31–55.
7//! * Maxime Crochemore, *"Transducers and repetitions"*, TCS 45, 1986 — the
8//! linear online construction with state cloning on split, used here.
9//!
10//! # What a suffix automaton is
11//!
12//! The suffix automaton of a string `s` is the smallest deterministic finite
13//! automaton (with at most `2|s| − 1` states for `|s| ≥ 2`) whose recognised
14//! language is *exactly* the set of substrings of `s`. Every state `v` carries:
15//!
16//! * `len[v]` — the length of the **longest** string in `v`'s
17//! *endpos-equivalence class* (two substrings are equivalent when they end at
18//! the same set of positions in `s`), and
19//! * `link[v]` — the **suffix link** to the state of the longest proper suffix
20//! of `v`'s strings that lies in a *different* endpos class.
21//!
22//! The strings recognised at state `v` are precisely the suffixes of its
23//! longest string whose lengths lie in `(len[link[v]], len[v]]`.
24//!
25//! # Online construction
26//!
27//! Characters are appended one at a time. Extending by `c` creates a new state
28//! `cur` with `len[cur] = len[last] + 1` and walks the suffix-link chain from
29//! `last` adding `c`-transitions to `cur`. The interesting case is **cloning**:
30//! when the chain reaches a state `p` that already has a `c`-transition to some
31//! `q` with `len[q] != len[p] + 1`, a copy `clone` of `q` is inserted with
32//! `len[clone] = len[p] + 1`, redirecting the affected transitions and suffix
33//! links. This split is what keeps the automaton minimal; it is exercised
34//! directly by the tests (e.g. `"abcbc"`).
35//!
36//! # Provided queries
37//!
38//! * [`SuffixAutomaton::contains`] — substring membership.
39//! * [`SuffixAutomaton::distinct_substring_count`] — number of distinct
40//! non-empty substrings, `Σ_v (len[v] − len[link[v]])`.
41//! * [`SuffixAutomaton::occurrences`] — number of (possibly overlapping)
42//! occurrences of a pattern, via the size of each state's endpos set.
43//! * [`longest_common_substring`] — the longest common substring of two strings
44//! by running the automaton of one over the other.
45//!
46//! Input is treated as raw bytes (`&[u8]`); for ASCII this matches the usual
47//! character-level notion of substring.
48
49use crate::error::{SeqError, SeqResult};
50use std::collections::BTreeMap;
51
52/// Sentinel suffix link for the initial state (it has no parent).
53const NO_LINK: isize = -1;
54
55/// A single state of the suffix automaton.
56#[derive(Debug, Clone)]
57struct State {
58 /// Length of the longest string in this state's endpos class.
59 len: usize,
60 /// Suffix link, or [`NO_LINK`] for the initial state.
61 link: isize,
62 /// Outgoing transitions keyed by byte. A `BTreeMap` keeps the structure
63 /// compact for sparse alphabets while remaining deterministic to iterate.
64 next: BTreeMap<u8, usize>,
65 /// `endpos`-set size contribution used for occurrence counting. Original
66 /// (non-clone) states are created with `cnt = 1`; clones with `cnt = 0`.
67 /// After construction these are summed along suffix links in
68 /// reverse-`len` order so that `cnt[v]` becomes `|endpos(v)|`.
69 cnt: usize,
70 /// `true` if this state was produced by a clone-on-split. Recorded so the
71 /// clone path is observable and testable.
72 is_clone: bool,
73}
74
75impl State {
76 fn new(len: usize, link: isize) -> Self {
77 Self {
78 len,
79 link,
80 next: BTreeMap::new(),
81 cnt: 0,
82 is_clone: false,
83 }
84 }
85}
86
87/// A suffix automaton (DAWG) built online from a byte string.
88///
89/// Construct it with [`SuffixAutomaton::new`]; the build is `O(n log |Σ|)` for
90/// the `BTreeMap` transition tables (`O(n |Σ|)` with array tables). The
91/// automaton stores the length of the source string so that occurrence counts
92/// and the distinct-substring total can be reported directly.
93///
94/// # Examples
95///
96/// ```
97/// use oxicuda_seq::string::SuffixAutomaton;
98///
99/// let sam = SuffixAutomaton::new(b"abcbc");
100/// assert!(sam.contains(b"bcb"));
101/// assert!(!sam.contains(b"acb"));
102/// // "abcbc" has 12 distinct non-empty substrings.
103/// assert_eq!(sam.distinct_substring_count(), 12);
104/// ```
105#[derive(Debug, Clone)]
106pub struct SuffixAutomaton {
107 states: Vec<State>,
108 /// Index of the initial state (always 0).
109 init: usize,
110 /// Index of the state reached after consuming the whole source string.
111 last: usize,
112 /// Length of the source string in bytes.
113 source_len: usize,
114 /// Lazily-computed `cnt` finalisation flag for occurrence counting.
115 counts_finalised: bool,
116}
117
118impl SuffixAutomaton {
119 /// Build the suffix automaton of `s`.
120 ///
121 /// An empty `s` yields an automaton with only the initial state; all
122 /// queries then behave consistently (no non-empty substring exists, the
123 /// distinct count is zero).
124 pub fn new(s: &[u8]) -> Self {
125 // Pre-reserve the worst-case 2n state budget.
126 let mut states: Vec<State> = Vec::with_capacity(2 * s.len().max(1));
127 states.push(State::new(0, NO_LINK)); // initial state
128
129 let mut sam = Self {
130 states,
131 init: 0,
132 last: 0,
133 source_len: 0,
134 counts_finalised: false,
135 };
136
137 for &c in s {
138 sam.extend(c);
139 }
140 sam.source_len = s.len();
141 sam
142 }
143
144 /// Append a single byte `c`, growing the automaton (Crochemore online step).
145 fn extend(&mut self, c: u8) {
146 let cur = self.states.len();
147 {
148 let cur_len = self.states[self.last].len + 1;
149 let mut st = State::new(cur_len, NO_LINK);
150 st.cnt = 1; // a genuine new prefix state ends at one position
151 self.states.push(st);
152 }
153
154 // Walk the suffix-link chain from `last`, adding c-transitions to `cur`
155 // until we reach the initial state or a state that already has one.
156 let mut p: isize = self.last as isize;
157 while p != NO_LINK && !self.states[p as usize].next.contains_key(&c) {
158 self.states[p as usize].next.insert(c, cur);
159 p = self.states[p as usize].link;
160 }
161
162 if p == NO_LINK {
163 // Reached the initial state without an existing c-edge.
164 self.states[cur].link = self.init as isize;
165 } else {
166 let q = self.states[p as usize].next[&c];
167 if self.states[p as usize].len + 1 == self.states[q].len {
168 // `q` is the right state already.
169 self.states[cur].link = q as isize;
170 } else {
171 // --- Clone-on-split. ---
172 let clone = self.states.len();
173 {
174 let q_state = &self.states[q];
175 let mut cl = State::new(self.states[p as usize].len + 1, q_state.link);
176 cl.next = q_state.next.clone();
177 cl.is_clone = true;
178 cl.cnt = 0; // a clone contributes no fresh endpos by itself
179 self.states.push(cl);
180 }
181
182 // Redirect the c-edges that pointed at `q` (and lay on the chain
183 // continuing from `p`) to the clone.
184 while p != NO_LINK {
185 match self.states[p as usize].next.get(&c) {
186 Some(&target) if target == q => {
187 self.states[p as usize].next.insert(c, clone);
188 p = self.states[p as usize].link;
189 }
190 _ => break,
191 }
192 }
193
194 self.states[q].link = clone as isize;
195 self.states[cur].link = clone as isize;
196 }
197 }
198
199 self.last = cur;
200 }
201
202 /// Finalise the endpos-set sizes (`cnt`) used for occurrence counting.
203 ///
204 /// Sums `cnt` from longer to shorter states along suffix links. Idempotent:
205 /// after the first call the flag short-circuits further work. Implemented as
206 /// an internal mutation guarded by `&mut self`; callers reach it through
207 /// [`occurrences`](Self::occurrences) which borrows mutably.
208 fn finalise_counts(&mut self) {
209 if self.counts_finalised {
210 return;
211 }
212 // Counting sort of state indices by `len` (max len = source_len).
213 let max_len = self.source_len;
214 let mut bucket = vec![0usize; max_len + 2];
215 for st in &self.states {
216 bucket[st.len] += 1;
217 }
218 for i in 1..bucket.len() {
219 bucket[i] += bucket[i - 1];
220 }
221 let mut order = vec![0usize; self.states.len()];
222 // Stable placement; iterate states in index order, fill from the back.
223 for (idx, st) in self.states.iter().enumerate() {
224 bucket[st.len] -= 1;
225 order[bucket[st.len]] = idx;
226 }
227
228 // Process in decreasing `len`, pushing counts up the suffix link.
229 for &v in order.iter().rev() {
230 let link = self.states[v].link;
231 if link != NO_LINK {
232 let add = self.states[v].cnt;
233 self.states[link as usize].cnt += add;
234 }
235 }
236 self.counts_finalised = true;
237 }
238
239 /// Walk the automaton on `pattern`, returning the index of the state reached
240 /// after consuming all of it, or `None` if `pattern` is not a substring.
241 fn walk(&self, pattern: &[u8]) -> Option<usize> {
242 let mut state = self.init;
243 for &c in pattern {
244 match self.states[state].next.get(&c) {
245 Some(&next) => state = next,
246 None => return None,
247 }
248 }
249 Some(state)
250 }
251
252 /// Return `true` iff `pattern` is a substring of the source string.
253 ///
254 /// The empty pattern is a substring of every string (including the empty
255 /// string), so this returns `true` for an empty `pattern`.
256 pub fn contains(&self, pattern: &[u8]) -> bool {
257 self.walk(pattern).is_some()
258 }
259
260 /// Number of **distinct** non-empty substrings of the source string.
261 ///
262 /// Equals `Σ_v (len[v] − len[link[v]])` over all non-initial states `v`,
263 /// because each state recognises exactly `len[v] − len[link[v]]` distinct
264 /// substrings (the suffixes of its longest string down to, but excluding,
265 /// its suffix-link length).
266 pub fn distinct_substring_count(&self) -> usize {
267 let mut total = 0usize;
268 for (idx, st) in self.states.iter().enumerate() {
269 if idx == self.init {
270 continue;
271 }
272 let link_len = if st.link == NO_LINK {
273 0
274 } else {
275 self.states[st.link as usize].len
276 };
277 total += st.len - link_len;
278 }
279 total
280 }
281
282 /// Number of (possibly overlapping) occurrences of `pattern` in the source.
283 ///
284 /// Returns `0` when `pattern` is not a substring. The empty pattern is
285 /// reported as occurring `source_len + 1` times (once at each gap), matching
286 /// the usual convention.
287 ///
288 /// This finalises the endpos counters on first use (and caches them), hence
289 /// the `&mut self` receiver.
290 pub fn occurrences(&mut self, pattern: &[u8]) -> usize {
291 if pattern.is_empty() {
292 return self.source_len + 1;
293 }
294 self.finalise_counts();
295 match self.walk(pattern) {
296 Some(state) => self.states[state].cnt,
297 None => 0,
298 }
299 }
300
301 /// Number of states in the automaton, including the initial state.
302 pub fn state_count(&self) -> usize {
303 self.states.len()
304 }
305
306 /// Number of clone states produced during construction.
307 ///
308 /// Exposed so tests can confirm that an input which forces a split actually
309 /// triggered the clone path.
310 pub fn clone_count(&self) -> usize {
311 self.states.iter().filter(|s| s.is_clone).count()
312 }
313
314 /// Length of the source string the automaton was built from.
315 pub fn source_len(&self) -> usize {
316 self.source_len
317 }
318}
319
320/// Longest common substring of `a` and `b`.
321///
322/// Builds the suffix automaton of `a` and streams `b` through it, tracking the
323/// length of the current match and resetting via suffix links on a mismatch.
324/// Runs in `O(|a| + |b|)` after the `O(|a|)` construction. Returns the
325/// `(start_in_b, length)` of a longest common substring; on ties the match
326/// ending earliest in `b` is returned. A length of `0` means the two strings
327/// share no character.
328///
329/// # Errors
330///
331/// Returns [`SeqError::EmptyInput`] if **either** input is empty, since the
332/// longest common substring is then trivially empty and `start` is undefined.
333/// Callers that prefer to treat that as length `0` can check emptiness first.
334pub fn longest_common_substring(a: &[u8], b: &[u8]) -> SeqResult<(usize, usize)> {
335 if a.is_empty() || b.is_empty() {
336 return Err(SeqError::EmptyInput);
337 }
338
339 let sam = SuffixAutomaton::new(a);
340
341 let mut state = sam.init;
342 let mut length = 0usize; // current matched suffix length
343 let mut best_len = 0usize;
344 let mut best_end = 0usize; // index in `b` one past the best match
345
346 for (i, &c) in b.iter().enumerate() {
347 // Try to extend the current match by `c`.
348 loop {
349 if let Some(&next) = sam.states[state].next.get(&c) {
350 state = next;
351 length += 1;
352 break;
353 }
354 // Mismatch: follow suffix links until a c-edge exists or we hit the
355 // root, shrinking the matched length to that state's `len`.
356 if sam.states[state].link == NO_LINK {
357 // Back at the initial state; cannot match `c` here.
358 length = 0;
359 break;
360 }
361 state = sam.states[state].link as usize;
362 length = sam.states[state].len;
363 }
364
365 if length > best_len {
366 best_len = length;
367 best_end = i + 1;
368 }
369 }
370
371 // Recover the start position in `b`.
372 let start = best_end - best_len;
373 Ok((start, best_len))
374}
375
376#[cfg(test)]
377mod tests {
378 use super::*;
379 use crate::handle::LcgRng;
380 use std::collections::HashSet;
381
382 /// Brute-force set of all distinct non-empty substrings of `s`.
383 fn brute_distinct(s: &[u8]) -> HashSet<Vec<u8>> {
384 let n = s.len();
385 let mut set = HashSet::new();
386 for i in 0..n {
387 for j in (i + 1)..=n {
388 set.insert(s[i..j].to_vec());
389 }
390 }
391 set
392 }
393
394 /// Brute-force count of (overlapping) occurrences of `pat` in `s`.
395 fn brute_occurrences(s: &[u8], pat: &[u8]) -> usize {
396 if pat.is_empty() {
397 return s.len() + 1;
398 }
399 if pat.len() > s.len() {
400 return 0;
401 }
402 (0..=(s.len() - pat.len()))
403 .filter(|&start| &s[start..start + pat.len()] == pat)
404 .count()
405 }
406
407 /// Brute-force longest common substring length (DP).
408 fn brute_lcs_len(a: &[u8], b: &[u8]) -> usize {
409 let (m, n) = (a.len(), b.len());
410 if m == 0 || n == 0 {
411 return 0;
412 }
413 let mut prev = vec![0usize; n + 1];
414 let mut curr = vec![0usize; n + 1];
415 let mut best = 0usize;
416 for i in 1..=m {
417 for j in 1..=n {
418 curr[j] = if a[i - 1] == b[j - 1] {
419 prev[j - 1] + 1
420 } else {
421 0
422 };
423 best = best.max(curr[j]);
424 }
425 std::mem::swap(&mut prev, &mut curr);
426 for v in curr.iter_mut() {
427 *v = 0;
428 }
429 }
430 best
431 }
432
433 fn random_bytes(rng: &mut LcgRng, alphabet: &[u8], len: usize) -> Vec<u8> {
434 (0..len)
435 .map(|_| alphabet[rng.next_usize(alphabet.len())])
436 .collect()
437 }
438
439 /// (a) Distinct-substring count equals brute force, including the
440 /// split-forcing "abcbc", "aaaa", and "abracadabra".
441 #[test]
442 fn distinct_count_matches_brute_force() {
443 for s in [
444 b"abcbc".as_slice(),
445 b"aaaa",
446 b"abracadabra",
447 b"banana",
448 b"mississippi",
449 b"a",
450 b"ab",
451 ] {
452 let sam = SuffixAutomaton::new(s);
453 let got = sam.distinct_substring_count();
454 let expect = brute_distinct(s).len();
455 assert_eq!(got, expect, "distinct count for {s:?}");
456 }
457
458 // Randomised.
459 let mut rng = LcgRng::new(99);
460 for &alphabet in &[b"ab".as_slice(), b"abc"] {
461 for _ in 0..300 {
462 let len = rng.next_usize(18);
463 let s = random_bytes(&mut rng, alphabet, len);
464 let sam = SuffixAutomaton::new(&s);
465 assert_eq!(
466 sam.distinct_substring_count(),
467 brute_distinct(&s).len(),
468 "random distinct count for {s:?}"
469 );
470 }
471 }
472 }
473
474 /// (b) Membership: positives AND negatives versus brute force.
475 #[test]
476 fn membership_positive_and_negative() {
477 let s = b"abracadabra";
478 let sam = SuffixAutomaton::new(s);
479 let all = brute_distinct(s);
480
481 // Every real substring is contained.
482 for sub in &all {
483 assert!(sam.contains(sub), "should contain {sub:?}");
484 }
485
486 // Some non-substrings are rejected.
487 for bad in [
488 b"xyz".as_slice(),
489 b"aa",
490 b"brc",
491 b"cadabrra",
492 b"z",
493 b"abrad",
494 ] {
495 let present = all.contains(bad);
496 assert_eq!(sam.contains(bad), present, "membership of {bad:?}");
497 }
498
499 // Empty pattern is always contained.
500 assert!(sam.contains(b""));
501
502 // Randomised positive/negative probing.
503 let mut rng = LcgRng::new(7);
504 for _ in 0..200 {
505 let len = 1 + rng.next_usize(12);
506 let text = random_bytes(&mut rng, b"abc", len);
507 let sam = SuffixAutomaton::new(&text);
508 let set = brute_distinct(&text);
509 // Probe a handful of random short strings.
510 for _ in 0..8 {
511 let qlen = 1 + rng.next_usize(5);
512 let q = random_bytes(&mut rng, b"abcd", qlen); // 'd' forces misses
513 assert_eq!(
514 sam.contains(&q),
515 set.contains(&q),
516 "probe {q:?} against {text:?}"
517 );
518 }
519 }
520 }
521
522 /// (c) Longest common substring matches the DP oracle.
523 #[test]
524 fn lcs_matches_dp() {
525 let cases: &[(&[u8], &[u8])] = &[
526 (b"abcde", b"cdefg"),
527 (b"abcbc", b"bcbcd"),
528 (b"banana", b"ananas"),
529 (b"xxxx", b"yyyy"),
530 (b"hello", b"yellow"),
531 (b"a", b"a"),
532 (b"a", b"b"),
533 ];
534 for &(a, b) in cases {
535 let (start, len) = longest_common_substring(a, b).expect("non-empty");
536 let expect = brute_lcs_len(a, b);
537 assert_eq!(len, expect, "lcs length for {a:?},{b:?}");
538 // The reported slice of `b` is genuinely a substring of `a`.
539 let sub = &b[start..start + len];
540 let sam = SuffixAutomaton::new(a);
541 assert!(sam.contains(sub), "lcs slice {sub:?} must occur in {a:?}");
542 }
543
544 // Randomised cross-check.
545 let mut rng = LcgRng::new(2024);
546 for _ in 0..300 {
547 let la = 1 + rng.next_usize(14);
548 let lb = 1 + rng.next_usize(14);
549 let a = random_bytes(&mut rng, b"abc", la);
550 let b = random_bytes(&mut rng, b"abc", lb);
551 let (start, len) = longest_common_substring(&a, &b).expect("non-empty");
552 assert_eq!(len, brute_lcs_len(&a, &b), "random lcs {a:?},{b:?}");
553 let sub = &b[start..start + len];
554 let sam = SuffixAutomaton::new(&a);
555 assert!(sam.contains(sub), "random lcs slice {sub:?} in {a:?}");
556 }
557 }
558
559 /// (d) Clone-state correctness: "abcbc" forces a split, and all queries
560 /// still agree with brute force.
561 #[test]
562 fn clone_split_is_exercised() {
563 let sam = SuffixAutomaton::new(b"abcbc");
564 // The split path must have produced at least one clone.
565 assert!(sam.clone_count() >= 1, "abcbc must force a clone");
566 // State count stays within the 2n-1 bound.
567 assert!(sam.state_count() <= 2 * 5);
568 // Distinct count is the canonical 12 for "abcbc".
569 assert_eq!(sam.distinct_substring_count(), 12);
570 assert_eq!(
571 sam.distinct_substring_count(),
572 brute_distinct(b"abcbc").len()
573 );
574
575 // Another classic splitter.
576 let sam2 = SuffixAutomaton::new(b"abcbcba");
577 assert!(sam2.clone_count() >= 1);
578 assert_eq!(
579 sam2.distinct_substring_count(),
580 brute_distinct(b"abcbcba").len()
581 );
582 }
583
584 /// (e) Occurrence counting matches brute force (overlapping occurrences).
585 #[test]
586 fn occurrence_counting_matches_brute_force() {
587 let s = b"abracadabra";
588 let mut sam = SuffixAutomaton::new(s);
589 for pat in [
590 b"a".as_slice(),
591 b"ab",
592 b"abra",
593 b"bra",
594 b"ra",
595 b"cad",
596 b"z",
597 b"abrad",
598 ] {
599 assert_eq!(
600 sam.occurrences(pat),
601 brute_occurrences(s, pat),
602 "occurrences of {pat:?}"
603 );
604 }
605 // Overlapping case: "aa" in "aaaa" occurs 3 times.
606 let mut sam_a = SuffixAutomaton::new(b"aaaa");
607 assert_eq!(sam_a.occurrences(b"aa"), 3);
608 assert_eq!(sam_a.occurrences(b"aa"), brute_occurrences(b"aaaa", b"aa"));
609 assert_eq!(sam_a.occurrences(b"aaaa"), 1);
610
611 // Empty pattern occurs source_len + 1 times.
612 assert_eq!(sam_a.occurrences(b""), 5);
613
614 // Randomised occurrence cross-check.
615 let mut rng = LcgRng::new(555);
616 for _ in 0..150 {
617 let len = 1 + rng.next_usize(14);
618 let text = random_bytes(&mut rng, b"ab", len);
619 let mut sam = SuffixAutomaton::new(&text);
620 for _ in 0..6 {
621 let qlen = 1 + rng.next_usize(4);
622 let q = random_bytes(&mut rng, b"abc", qlen);
623 assert_eq!(
624 sam.occurrences(&q),
625 brute_occurrences(&text, &q),
626 "occ {q:?} in {text:?}"
627 );
628 }
629 }
630 }
631
632 /// (f) Empty-string and single-character edge cases.
633 #[test]
634 fn edge_cases_empty_and_single() {
635 // Empty source: only the initial state, no substrings.
636 let sam_empty = SuffixAutomaton::new(b"");
637 assert_eq!(sam_empty.state_count(), 1);
638 assert_eq!(sam_empty.distinct_substring_count(), 0);
639 assert!(sam_empty.contains(b"")); // empty pattern
640 assert!(!sam_empty.contains(b"a"));
641 assert_eq!(sam_empty.source_len(), 0);
642
643 // Single character.
644 let mut sam_one = SuffixAutomaton::new(b"a");
645 assert_eq!(sam_one.distinct_substring_count(), 1);
646 assert!(sam_one.contains(b"a"));
647 assert!(!sam_one.contains(b"b"));
648 assert_eq!(sam_one.occurrences(b"a"), 1);
649 assert_eq!(sam_one.occurrences(b"b"), 0);
650
651 // Empty inputs to LCS are an error.
652 assert!(matches!(
653 longest_common_substring(b"", b"abc"),
654 Err(SeqError::EmptyInput)
655 ));
656 assert!(matches!(
657 longest_common_substring(b"abc", b""),
658 Err(SeqError::EmptyInput)
659 ));
660 }
661}