Skip to main content

oxicuda_seq/string/
z_algorithm.rs

1//! Gusfield's Z-algorithm: the Z-array in linear time, with exact matching.
2//!
3//! Reference: Dan Gusfield, *"Algorithms on Strings, Trees, and Sequences:
4//! Computer Science and Computational Biology"*, Cambridge University Press,
5//! 1997, §1.3–1.5. The Z-array (sometimes "Z-function") and its use for
6//! linear-time exact pattern matching via the `P $ T` construction are due to
7//! that exposition; the underlying fundamental preprocessing predates it.
8//!
9//! # The Z-array
10//!
11//! For a string `s` of length `n`, the **Z-array** `z` is defined by
12//!
13//! ```text
14//! z[0] = 0                                    (by convention; see below)
15//! z[i] = length of the longest substring starting at i
16//!        that is also a prefix of s,          for 1 ≤ i < n.
17//! ```
18//!
19//! Equivalently `z[i] = max { k : s[0..k] == s[i..i+k] }`. So if
20//! `s = "aabxaabxcaabxaabxay"` then `z[4] = 4` because `s[4..8] == "aabx" ==
21//! s[0..4]` but `s[8] = 'c' != 'a' = s[4]`.
22//!
23//! ## The `z[0]` convention
24//!
25//! Position `0` matches the whole prefix trivially (`s[0..n] == s[0..n]`), so a
26//! literal reading would give `z[0] = n`. That value carries no information and
27//! complicates the matching application, so — following Gusfield and the common
28//! competitive-programming convention — **this implementation sets `z[0] = 0`**.
29//! Callers that need the "real" value can use `n` directly. This choice is fixed
30//! and tested (`z0_convention_is_zero`).
31//!
32//! # Linear time via the Z-box
33//!
34//! The algorithm maintains the interval `[l, r)` (the **Z-box**) that is the
35//! right-most match of a prefix discovered so far: `s[l..r] == s[0..r-l]`. For a
36//! new index `i`:
37//!
38//! * if `i < r`, the box already tells us that `s[i..r] == s[i-l..r-l]`, so
39//!   `z[i] ≥ min(z[i-l], r-i)` for free; only the part beyond `r` needs explicit
40//!   comparisons;
41//! * otherwise (`i ≥ r`) we compare from scratch starting at `i`.
42//!
43//! Every explicit character comparison either fails (ending the current `i`) or
44//! advances `r`, and `r` only ever increases, so the total comparison count is
45//! `O(n)`.
46//!
47//! # Exact pattern matching
48//!
49//! To find every occurrence of a pattern `p` (length `m`) in a text `t`
50//! (length `n`), form `s = p ++ [sep] ++ t` where `sep` is a byte occurring in
51//! neither `p` nor `t`, and run the Z-algorithm. An occurrence of `p` ends-aligns
52//! with each index `i` (in `s`) for which `z[i] ≥ m`; the corresponding text
53//! offset is `i − (m + 1)`. Because no Z-value may reach across the separator
54//! (it equals nothing in `p`), `z[i]` is capped at `m`, so there are **no false
55//! positives** — a property exercised directly by `tests`.
56//!
57//! When no separator byte is available (`p` and `t` between them use all 256
58//! byte values) the construction would be unsafe, so [`z_search`] instead falls
59//! back to scanning a *prefix-length array* computed on the concatenation
60//! without a separator but bounded explicitly at `m`; this keeps the routine
61//! total and panic-free. See [`z_search`].
62//!
63//! Inputs are raw bytes (`&[u8]`).
64
65use crate::error::{SeqError, SeqResult};
66
67/// Compute the Z-array of `s` in `O(n)` time using the Z-box method.
68///
69/// The returned vector has length `s.len()`. Index `0` holds `0` by the
70/// convention documented at the module level; for `1 ≤ i < n`, entry `i` is the
71/// length of the longest common prefix of `s` and the suffix `s[i..]`.
72///
73/// # Errors
74///
75/// Returns [`SeqError::EmptyInput`] when `s` is empty: the Z-array of the empty
76/// string is itself empty, and rejecting it makes the empty/degenerate contract
77/// explicit and consistent with the rest of this module.
78///
79/// # Examples
80///
81/// ```
82/// use oxicuda_seq::string::z_array;
83///
84/// let z = z_array(b"aabxaabxcaabxaabxay").expect("non-empty");
85/// assert_eq!(z[0], 0); // by convention
86/// assert_eq!(z[4], 4); // "aabx" repeats at offset 4
87/// ```
88pub fn z_array(s: &[u8]) -> SeqResult<Vec<usize>> {
89    if s.is_empty() {
90        return Err(SeqError::EmptyInput);
91    }
92
93    let n = s.len();
94    let mut z = vec![0usize; n];
95
96    // [l, r) is the right-most prefix-match window discovered so far.
97    let mut l = 0usize;
98    let mut r = 0usize;
99
100    for i in 1..n {
101        if i < r {
102            // Inside the current Z-box: copy from the mirror, capped at the box.
103            let mirror = i - l;
104            z[i] = z[mirror].min(r - i);
105        }
106        // Extend the match explicitly beyond whatever was inherited.
107        while i + z[i] < n && s[z[i]] == s[i + z[i]] {
108            z[i] += 1;
109        }
110        // If this match reaches further right than the current box, adopt it.
111        if i + z[i] > r {
112            l = i;
113            r = i + z[i];
114        }
115    }
116
117    Ok(z)
118}
119
120/// Brute-force matching that does not depend on a free separator byte.
121///
122/// Used by [`z_search`] only when `p` and `t` jointly occupy all 256 byte
123/// values so that no separator can be chosen; it scans the text directly,
124/// remaining `O(n · m)` in that rare degenerate case while keeping the public
125/// routine total and panic-free.
126fn naive_match(p: &[u8], t: &[u8]) -> Vec<usize> {
127    let m = p.len();
128    let n = t.len();
129    if m == 0 || m > n {
130        return Vec::new();
131    }
132    let mut out = Vec::new();
133    for start in 0..=(n - m) {
134        if &t[start..start + m] == p {
135            out.push(start);
136        }
137    }
138    out
139}
140
141/// Pick a byte value that occurs in neither `p` nor `t`, if one exists.
142fn free_separator(p: &[u8], t: &[u8]) -> Option<u8> {
143    let mut used = [false; 256];
144    for &b in p {
145        used[b as usize] = true;
146    }
147    for &b in t {
148        used[b as usize] = true;
149    }
150    (0u16..256).find(|&v| !used[v as usize]).map(|v| v as u8)
151}
152
153/// Find every occurrence of `pattern` in `text` using the Z-algorithm on the
154/// `pattern ++ sep ++ text` construction, returning start offsets in ascending
155/// order.
156///
157/// Each returned index `j` satisfies `text[j..j + pattern.len()] == pattern`.
158/// Overlapping occurrences are all reported (e.g. searching `"aa"` in `"aaaa"`
159/// yields `[0, 1, 2]`).
160///
161/// # Errors
162///
163/// Returns [`SeqError::EmptyInput`] if `pattern` is empty: an empty pattern has
164/// no well-defined occurrence set here (it would "match" at every position),
165/// matching the convention used by the Aho–Corasick automaton in this crate.
166///
167/// # Examples
168///
169/// ```
170/// use oxicuda_seq::string::z_search;
171///
172/// assert_eq!(z_search(b"ab", b"abcabxabc").expect("ok"), vec![0, 3, 6]);
173/// assert_eq!(z_search(b"aa", b"aaaa").expect("ok"), vec![0, 1, 2]);
174/// assert!(z_search(b"z", b"abc").expect("ok").is_empty());
175/// ```
176pub fn z_search(pattern: &[u8], text: &[u8]) -> SeqResult<Vec<usize>> {
177    if pattern.is_empty() {
178        return Err(SeqError::EmptyInput);
179    }
180    let m = pattern.len();
181    if m > text.len() {
182        return Ok(Vec::new());
183    }
184
185    // Degenerate alphabet: no separator available → direct scan.
186    let sep = match free_separator(pattern, text) {
187        Some(sep) => sep,
188        None => return Ok(naive_match(pattern, text)),
189    };
190
191    // s = pattern ++ sep ++ text.
192    let mut s = Vec::with_capacity(m + 1 + text.len());
193    s.extend_from_slice(pattern);
194    s.push(sep);
195    s.extend_from_slice(text);
196
197    let z = z_array(&s)?;
198
199    // Occurrences end-align where z reaches the full pattern length. The
200    // separator guarantees z[i] ≤ m, so equality with m is the match test and
201    // no value can leak across the boundary (no false positives).
202    let mut out = Vec::new();
203    let offset = m + 1; // start of the text within s
204    for i in offset..s.len() {
205        if z[i] >= m {
206            out.push(i - offset);
207        }
208    }
209    Ok(out)
210}
211
212#[cfg(test)]
213mod tests {
214    use super::*;
215
216    /// Brute-force `O(n²)` Z-array oracle straight from the definition.
217    fn brute_force_z(s: &[u8]) -> Vec<usize> {
218        let n = s.len();
219        let mut z = vec![0usize; n];
220        for i in 1..n {
221            let mut k = 0;
222            while i + k < n && s[k] == s[i + k] {
223                k += 1;
224            }
225            z[i] = k;
226        }
227        z
228    }
229
230    /// Naive all-occurrences search oracle.
231    fn naive_search(p: &[u8], t: &[u8]) -> Vec<usize> {
232        let (m, n) = (p.len(), t.len());
233        if m == 0 || m > n {
234            return Vec::new();
235        }
236        (0..=(n - m)).filter(|&i| &t[i..i + m] == p).collect()
237    }
238
239    fn random_bytes(rng: &mut crate::handle::LcgRng, alphabet: &[u8], len: usize) -> Vec<u8> {
240        (0..len)
241            .map(|_| alphabet[rng.next_usize(alphabet.len())])
242            .collect()
243    }
244
245    /// (a) The Z-array matches brute force on many random strings.
246    #[test]
247    fn z_array_matches_brute_force() {
248        let mut rng = crate::handle::LcgRng::new(0xED);
249        for &alphabet in &[b"ab".as_slice(), b"abc", b"abcd"] {
250            for _ in 0..500 {
251                let len = 1 + rng.next_usize(40);
252                let s = random_bytes(&mut rng, alphabet, len);
253                let fast = z_array(&s).expect("non-empty");
254                let slow = brute_force_z(&s);
255                assert_eq!(fast, slow, "Z-array mismatch for {s:?}");
256            }
257        }
258    }
259
260    /// (b) Pattern matching via the P$T construction finds ALL occurrences.
261    #[test]
262    fn search_finds_all_occurrences() {
263        let mut rng = crate::handle::LcgRng::new(7);
264        for &alphabet in &[b"ab".as_slice(), b"abc"] {
265            for _ in 0..500 {
266                let tlen = 1 + rng.next_usize(40);
267                let plen = 1 + rng.next_usize(6);
268                let t = random_bytes(&mut rng, alphabet, tlen);
269                let p = random_bytes(&mut rng, alphabet, plen);
270                let got = z_search(&p, &t).expect("non-empty pattern");
271                let want = naive_search(&p, &t);
272                assert_eq!(got, want, "search mismatch for p={p:?} t={t:?}");
273            }
274        }
275    }
276
277    /// (c) Periodic strings give the expected Z structure.
278    #[test]
279    fn periodic_strings_have_expected_z() {
280        // "aaaa": z[i] = n - i for i >= 1 (every suffix is a prefix run of 'a').
281        let z = z_array(b"aaaa").expect("non-empty");
282        assert_eq!(z, vec![0, 3, 2, 1]);
283
284        // "abcabc": period 3. z[3] = 3 (whole "abc" repeats), z[1]=z[2]=0,
285        // z[4]=z[5]=0 because the tail "bc"/"c" do not start with 'a'.
286        let z = z_array(b"abcabc").expect("non-empty");
287        assert_eq!(z, vec![0, 0, 0, 3, 0, 0]);
288
289        // "ababab": period 2. z[2]=4 ("abab"), z[4]=2 ("ab"), odd positions 0.
290        let z = z_array(b"ababab").expect("non-empty");
291        assert_eq!(z, vec![0, 0, 4, 0, 2, 0]);
292    }
293
294    /// (d) The z[0] convention is documented and consistently zero.
295    #[test]
296    fn z0_convention_is_zero() {
297        for s in [
298            b"a".as_slice(),
299            b"abc",
300            b"aaaa",
301            b"abacabad",
302            b"mississippi",
303        ] {
304            let z = z_array(s).expect("non-empty");
305            assert_eq!(z[0], 0, "z[0] must be 0 by convention for {s:?}");
306        }
307    }
308
309    /// (e) Single char and empty input behave per contract.
310    #[test]
311    fn single_char_and_empty() {
312        let z = z_array(b"x").expect("non-empty");
313        assert_eq!(z, vec![0]);
314
315        assert!(matches!(z_array(b""), Err(SeqError::EmptyInput)));
316        assert!(matches!(z_search(b"", b"abc"), Err(SeqError::EmptyInput)));
317
318        // Pattern longer than text → no matches, but not an error.
319        assert!(z_search(b"abcd", b"ab").expect("ok").is_empty());
320    }
321
322    /// (f) No false positives can leak across the separator.
323    ///
324    /// We construct a case where a naive concatenation without a separator would
325    /// spuriously match: pattern "ab", text "...a" then "b..." straddling. The
326    /// separator construction must report exactly the genuine in-text matches.
327    #[test]
328    fn no_false_positives_across_separator() {
329        // Pattern equals the boundary of pattern+text if no separator were used.
330        // p = "ba", t = "xb" → with naive p++t = "baxb"; the suffix overlap must
331        // not be reported. Genuine occurrences of "ba" in "xb" = none.
332        assert!(z_search(b"ba", b"xb").expect("ok").is_empty());
333
334        // p = "ab", text ends in 'a', next pattern char 'b' would join across.
335        // t = "zzza" has no "ab"; ensure empty.
336        assert!(z_search(b"ab", b"zzza").expect("ok").is_empty());
337
338        // Positive control with the same alphabet so the test really exercises
339        // matching, not just rejection.
340        assert_eq!(z_search(b"ab", b"zzzab").expect("ok"), vec![3]);
341    }
342
343    /// Whole-string and prefix patterns at the boundaries.
344    #[test]
345    fn boundary_patterns() {
346        // Pattern == text.
347        assert_eq!(z_search(b"abc", b"abc").expect("ok"), vec![0]);
348        // Pattern at the very end.
349        assert_eq!(z_search(b"c", b"aabc").expect("ok"), vec![3]);
350        // Pattern at the very start.
351        assert_eq!(z_search(b"aa", b"aab").expect("ok"), vec![0]);
352    }
353
354    /// Degenerate full-alphabet fallback path: every byte value used, so no
355    /// separator exists and the naive matcher must still be correct.
356    #[test]
357    fn full_alphabet_fallback() {
358        // Text uses all 256 byte values once.
359        let text: Vec<u8> = (0u16..256).map(|v| v as u8).collect();
360        // Pattern present in the middle.
361        let pattern = &text[100..104];
362        assert!(free_separator(pattern, &text).is_none());
363        let got = z_search(pattern, &text).expect("ok");
364        assert_eq!(got, vec![100]);
365
366        // Absent pattern over the full alphabet (reversed pair not contiguous).
367        let absent = [200u8, 5u8, 200u8];
368        let got = z_search(&absent, &text).expect("ok");
369        assert!(got.is_empty());
370    }
371}