Skip to main content

mollify_core/
suffix.rs

1//! Linear-time suffix array (SA-IS) + LCP (Kasai), over an integer alphabet.
2//!
3//! This is the engine behind exact, maximal token-clone detection: concatenate
4//! every file's normalized token stream (separated by unique sentinels so no
5//! match can cross a file boundary), build the suffix array in O(n) via SA-IS
6//! (Nong, Zhang & Chan 2009, *Linear Suffix Array Construction by Almost Pure
7//! Induced-Sorting*), then derive the LCP array in O(n) via Kasai et al. Runs
8//! of LCP ≥ threshold are exact maximal repeats — the clone classes.
9//!
10//! The input must be a sequence of symbols in `0..alphabet_size` whose **last
11//! element is `0` and `0` appears nowhere else** (the unique smallest
12//! sentinel). `dupes` guarantees this.
13//!
14// Index-based loops are intrinsic to induced sorting and Kasai's LCP; the
15// iterator rewrites clippy suggests would obscure the algorithm.
16#![allow(clippy::needless_range_loop)]
17
18/// Build the suffix array of `s` using SA-IS.
19///
20/// `alphabet_size` is the exclusive upper bound on symbol values (i.e. symbols
21/// are in `0..alphabet_size`). `s` must be non-empty, end in `0`, and contain
22/// `0` exactly once (at the end).
23pub fn suffix_array(s: &[u32], alphabet_size: usize) -> Vec<u32> {
24    let n = s.len();
25    let mut sa = vec![0u32; n];
26    if n <= 1 {
27        return sa;
28    }
29    let s_usize: Vec<usize> = s.iter().map(|&x| x as usize).collect();
30    let mut sa_usize = vec![0usize; n];
31    sais(&s_usize, &mut sa_usize, alphabet_size);
32    for (dst, &v) in sa.iter_mut().zip(sa_usize.iter()) {
33        *dst = v as u32;
34    }
35    sa
36}
37
38#[inline]
39fn is_lms(t: &[bool], i: usize) -> bool {
40    i > 0 && t[i] && !t[i - 1]
41}
42
43/// Whether the two LMS substrings starting at `a` and `b` are identical
44/// (same characters and same L/S types, same length). The sentinel-only LMS
45/// at `n-1` is unique.
46fn lms_substrings_equal(s: &[usize], t: &[bool], a: usize, b: usize) -> bool {
47    let n = s.len();
48    if a == n - 1 || b == n - 1 {
49        return a == b;
50    }
51    let mut i = 0;
52    loop {
53        let ai = a + i;
54        let bi = b + i;
55        if ai >= n || bi >= n {
56            return false;
57        }
58        let a_lms = is_lms(t, ai);
59        let b_lms = is_lms(t, bi);
60        if i > 0 && a_lms && b_lms {
61            return true; // both reached their next LMS at the same offset
62        }
63        if a_lms != b_lms {
64            return false; // one ended before the other
65        }
66        if s[ai] != s[bi] || t[ai] != t[bi] {
67            return false;
68        }
69        i += 1;
70    }
71}
72
73/// Bucket head/tail offsets. `tail=true` → one-past-the-end of each bucket.
74fn buckets(sizes: &[usize], tail: bool) -> Vec<usize> {
75    let mut b = vec![0usize; sizes.len()];
76    let mut sum = 0usize;
77    for (i, &sz) in sizes.iter().enumerate() {
78        sum += sz;
79        b[i] = if tail { sum } else { sum - sz };
80    }
81    b
82}
83
84fn bucket_sizes(s: &[usize], k: usize) -> Vec<usize> {
85    let mut sizes = vec![0usize; k];
86    for &c in s {
87        sizes[c] += 1;
88    }
89    sizes
90}
91
92/// Induce L-type then S-type suffixes from already-placed LMS suffixes.
93fn induce(s: &[usize], sa: &mut [usize], t: &[bool], sizes: &[usize], k: usize) {
94    let n = s.len();
95    let none = usize::MAX;
96    // L-type: left→right, bucket heads.
97    let mut head = buckets(sizes, false);
98    for i in 0..n {
99        let j = sa[i];
100        if j != none && j > 0 && !t[j - 1] {
101            let c = s[j - 1];
102            sa[head[c]] = j - 1;
103            head[c] += 1;
104        }
105    }
106    let _ = k;
107    // S-type: right→left, bucket tails.
108    let mut tail = buckets(sizes, true);
109    for i in (0..n).rev() {
110        let j = sa[i];
111        if j != none && j > 0 && t[j - 1] {
112            let c = s[j - 1];
113            tail[c] -= 1;
114            sa[tail[c]] = j - 1;
115        }
116    }
117}
118
119fn sais(s: &[usize], sa: &mut [usize], k: usize) {
120    let n = s.len();
121    let none = usize::MAX;
122    if n == 1 {
123        sa[0] = 0;
124        return;
125    }
126    // Type map: true = S-type, false = L-type.
127    let mut t = vec![false; n];
128    t[n - 1] = true;
129    for i in (0..n - 1).rev() {
130        t[i] = s[i] < s[i + 1] || (s[i] == s[i + 1] && t[i + 1]);
131    }
132
133    let sizes = bucket_sizes(s, k);
134
135    // --- Step 1: place LMS suffixes at bucket tails, then induce. ---
136    for v in sa.iter_mut() {
137        *v = none;
138    }
139    let mut tail = buckets(&sizes, true);
140    for i in 1..n {
141        if is_lms(&t, i) {
142            let c = s[i];
143            tail[c] -= 1;
144            sa[tail[c]] = i;
145        }
146    }
147    induce(s, sa, &t, &sizes, k);
148
149    // --- Step 2: collect sorted LMS positions, name them. ---
150    let mut lms_sorted: Vec<usize> = Vec::new();
151    for i in 0..n {
152        let j = sa[i];
153        if j != none && is_lms(&t, j) {
154            lms_sorted.push(j);
155        }
156    }
157    let mut names = vec![none; n];
158    let mut name = 0usize;
159    names[lms_sorted[0]] = 0;
160    let mut prev = lms_sorted[0];
161    for &cur in lms_sorted.iter().skip(1) {
162        if !lms_substrings_equal(s, &t, prev, cur) {
163            name += 1;
164        }
165        names[cur] = name;
166        prev = cur;
167    }
168    let num_names = name + 1;
169
170    // Reduced string in text order of LMS positions.
171    let mut lms_positions: Vec<usize> = (1..n).filter(|&i| is_lms(&t, i)).collect();
172    lms_positions.sort_unstable();
173    let reduced: Vec<usize> = lms_positions.iter().map(|&p| names[p]).collect();
174
175    // --- Step 3: recurse (or base case) to sort the reduced string. ---
176    let mut reduced_sa = vec![0usize; reduced.len()];
177    if num_names == reduced.len() {
178        // All names unique → SA is the inverse permutation.
179        for (i, &nm) in reduced.iter().enumerate() {
180            reduced_sa[nm] = i;
181        }
182    } else {
183        sais(&reduced, &mut reduced_sa, num_names);
184    }
185
186    // --- Step 4: place LMS suffixes in true sorted order, induce final SA. ---
187    for v in sa.iter_mut() {
188        *v = none;
189    }
190    let mut tail = buckets(&sizes, true);
191    // Walk reduced_sa to get LMS positions in sorted order, place at bucket tails.
192    for idx in (0..reduced_sa.len()).rev() {
193        let p = lms_positions[reduced_sa[idx]];
194        let c = s[p];
195        tail[c] -= 1;
196        sa[tail[c]] = p;
197    }
198    induce(s, sa, &t, &sizes, k);
199}
200
201/// LCP array via Kasai et al.: `lcp[i]` is the longest common prefix length of
202/// the suffixes at `sa[i-1]` and `sa[i]`; `lcp[0] == 0`.
203pub fn lcp_kasai(s: &[u32], sa: &[u32]) -> Vec<u32> {
204    let n = s.len();
205    let mut lcp = vec![0u32; n];
206    if n == 0 {
207        return lcp;
208    }
209    let mut rank = vec![0usize; n];
210    for (i, &p) in sa.iter().enumerate() {
211        rank[p as usize] = i;
212    }
213    let mut h = 0usize;
214    for i in 0..n {
215        if rank[i] > 0 {
216            let j = sa[rank[i] - 1] as usize;
217            while i + h < n && j + h < n && s[i + h] == s[j + h] {
218                h += 1;
219            }
220            lcp[rank[i]] = h as u32;
221            h = h.saturating_sub(1);
222        } else {
223            h = 0;
224        }
225    }
226    lcp
227}
228
229#[cfg(test)]
230mod tests {
231    use super::*;
232
233    /// Naive O(n² log n) suffix array for cross-checking.
234    fn naive_sa(s: &[u32]) -> Vec<u32> {
235        let n = s.len();
236        let mut idx: Vec<u32> = (0..n as u32).collect();
237        idx.sort_by(|&a, &b| s[a as usize..].cmp(&s[b as usize..]));
238        idx
239    }
240
241    fn naive_lcp(s: &[u32], sa: &[u32]) -> Vec<u32> {
242        let mut lcp = vec![0u32; sa.len()];
243        for i in 1..sa.len() {
244            let a = &s[sa[i - 1] as usize..];
245            let b = &s[sa[i] as usize..];
246            let mut k = 0;
247            while k < a.len() && k < b.len() && a[k] == b[k] {
248                k += 1;
249            }
250            lcp[i] = k as u32;
251        }
252        lcp
253    }
254
255    /// Build a valid SA-IS input: symbols in 1..=max, terminated by a unique 0.
256    fn with_sentinel(body: &[u32]) -> (Vec<u32>, usize) {
257        let mut s: Vec<u32> = body.iter().map(|&x| x + 1).collect();
258        s.push(0);
259        let k = s.iter().copied().max().unwrap() as usize + 1;
260        (s, k)
261    }
262
263    #[test]
264    fn matches_naive_on_small_strings() {
265        let cases: &[&[u32]] = &[
266            &[],
267            &[0],
268            &[1, 1, 1, 1],
269            &[3, 1, 2, 1, 2, 3],
270            &[1, 2, 1, 2, 1, 2, 1],
271            &[5, 4, 3, 2, 1],
272            &[1, 2, 3, 4, 5],
273        ];
274        for c in cases {
275            let (s, k) = with_sentinel(c);
276            let sa = suffix_array(&s, k);
277            assert_eq!(sa, naive_sa(&s), "SA mismatch on {c:?}");
278            assert_eq!(
279                lcp_kasai(&s, &sa),
280                naive_lcp(&s, &sa),
281                "LCP mismatch on {c:?}"
282            );
283        }
284    }
285
286    #[test]
287    fn matches_naive_on_random_strings() {
288        // Deterministic LCG so the test is reproducible.
289        let mut state: u64 = 0x9E3779B97F4A7C15;
290        let mut next = || {
291            state = state
292                .wrapping_mul(6364136223846793005)
293                .wrapping_add(1442695040888963407);
294            (state >> 33) as u32
295        };
296        for _ in 0..400 {
297            let len = (next() % 60) as usize + 1;
298            let alpha = (next() % 4) + 1; // small alphabet → many repeats
299            let body: Vec<u32> = (0..len).map(|_| next() % alpha).collect();
300            let (s, k) = with_sentinel(&body);
301            let sa = suffix_array(&s, k);
302            assert_eq!(sa, naive_sa(&s), "SA mismatch on {body:?}");
303            assert_eq!(
304                lcp_kasai(&s, &sa),
305                naive_lcp(&s, &sa),
306                "LCP mismatch on {body:?}"
307            );
308        }
309    }
310
311    #[test]
312    fn finds_repeat_via_lcp() {
313        // "abcabc" → the repeat "abc" (len 3) shows up as an LCP ≥ 3.
314        let (s, k) = with_sentinel(&[1, 2, 3, 1, 2, 3]);
315        let sa = suffix_array(&s, k);
316        let lcp = lcp_kasai(&s, &sa);
317        assert!(
318            lcp.iter().any(|&l| l >= 3),
319            "expected an LCP ≥ 3, got {lcp:?}"
320        );
321    }
322}