oxicuda_seq/string/z_algorithm.rs
1//! Gusfield's Z-algorithm: the Z-array in linear time, with exact matching.
2//!
3//! Reference: Dan Gusfield, *"Algorithms on Strings, Trees, and Sequences:
4//! Computer Science and Computational Biology"*, Cambridge University Press,
5//! 1997, §1.3–1.5. The Z-array (sometimes "Z-function") and its use for
6//! linear-time exact pattern matching via the `P $ T` construction are due to
7//! that exposition; the underlying fundamental preprocessing predates it.
8//!
9//! # The Z-array
10//!
11//! For a string `s` of length `n`, the **Z-array** `z` is defined by
12//!
13//! ```text
14//! z[0] = 0 (by convention; see below)
15//! z[i] = length of the longest substring starting at i
16//! that is also a prefix of s, for 1 ≤ i < n.
17//! ```
18//!
19//! Equivalently `z[i] = max { k : s[0..k] == s[i..i+k] }`. So if
20//! `s = "aabxaabxcaabxaabxay"` then `z[4] = 4` because `s[4..8] == "aabx" ==
21//! s[0..4]` but `s[8] = 'c' != 'a' = s[4]`.
22//!
23//! ## The `z[0]` convention
24//!
25//! Position `0` matches the whole prefix trivially (`s[0..n] == s[0..n]`), so a
26//! literal reading would give `z[0] = n`. That value carries no information and
27//! complicates the matching application, so — following Gusfield and the common
28//! competitive-programming convention — **this implementation sets `z[0] = 0`**.
29//! Callers that need the "real" value can use `n` directly. This choice is fixed
30//! and tested (`z0_convention_is_zero`).
31//!
32//! # Linear time via the Z-box
33//!
34//! The algorithm maintains the interval `[l, r)` (the **Z-box**) that is the
35//! right-most match of a prefix discovered so far: `s[l..r] == s[0..r-l]`. For a
36//! new index `i`:
37//!
38//! * if `i < r`, the box already tells us that `s[i..r] == s[i-l..r-l]`, so
39//! `z[i] ≥ min(z[i-l], r-i)` for free; only the part beyond `r` needs explicit
40//! comparisons;
41//! * otherwise (`i ≥ r`) we compare from scratch starting at `i`.
42//!
43//! Every explicit character comparison either fails (ending the current `i`) or
44//! advances `r`, and `r` only ever increases, so the total comparison count is
45//! `O(n)`.
46//!
47//! # Exact pattern matching
48//!
49//! To find every occurrence of a pattern `p` (length `m`) in a text `t`
50//! (length `n`), form `s = p ++ [sep] ++ t` where `sep` is a byte occurring in
51//! neither `p` nor `t`, and run the Z-algorithm. An occurrence of `p` ends-aligns
52//! with each index `i` (in `s`) for which `z[i] ≥ m`; the corresponding text
53//! offset is `i − (m + 1)`. Because no Z-value may reach across the separator
54//! (it equals nothing in `p`), `z[i]` is capped at `m`, so there are **no false
55//! positives** — a property exercised directly by `tests`.
56//!
57//! When no separator byte is available (`p` and `t` between them use all 256
58//! byte values) the construction would be unsafe, so [`z_search`] instead falls
59//! back to scanning a *prefix-length array* computed on the concatenation
60//! without a separator but bounded explicitly at `m`; this keeps the routine
61//! total and panic-free. See [`z_search`].
62//!
63//! Inputs are raw bytes (`&[u8]`).
64
65use crate::error::{SeqError, SeqResult};
66
67/// Compute the Z-array of `s` in `O(n)` time using the Z-box method.
68///
69/// The returned vector has length `s.len()`. Index `0` holds `0` by the
70/// convention documented at the module level; for `1 ≤ i < n`, entry `i` is the
71/// length of the longest common prefix of `s` and the suffix `s[i..]`.
72///
73/// # Errors
74///
75/// Returns [`SeqError::EmptyInput`] when `s` is empty: the Z-array of the empty
76/// string is itself empty, and rejecting it makes the empty/degenerate contract
77/// explicit and consistent with the rest of this module.
78///
79/// # Examples
80///
81/// ```
82/// use oxicuda_seq::string::z_array;
83///
84/// let z = z_array(b"aabxaabxcaabxaabxay").expect("non-empty");
85/// assert_eq!(z[0], 0); // by convention
86/// assert_eq!(z[4], 4); // "aabx" repeats at offset 4
87/// ```
88pub fn z_array(s: &[u8]) -> SeqResult<Vec<usize>> {
89 if s.is_empty() {
90 return Err(SeqError::EmptyInput);
91 }
92
93 let n = s.len();
94 let mut z = vec![0usize; n];
95
96 // [l, r) is the right-most prefix-match window discovered so far.
97 let mut l = 0usize;
98 let mut r = 0usize;
99
100 for i in 1..n {
101 if i < r {
102 // Inside the current Z-box: copy from the mirror, capped at the box.
103 let mirror = i - l;
104 z[i] = z[mirror].min(r - i);
105 }
106 // Extend the match explicitly beyond whatever was inherited.
107 while i + z[i] < n && s[z[i]] == s[i + z[i]] {
108 z[i] += 1;
109 }
110 // If this match reaches further right than the current box, adopt it.
111 if i + z[i] > r {
112 l = i;
113 r = i + z[i];
114 }
115 }
116
117 Ok(z)
118}
119
120/// Brute-force matching that does not depend on a free separator byte.
121///
122/// Used by [`z_search`] only when `p` and `t` jointly occupy all 256 byte
123/// values so that no separator can be chosen; it scans the text directly,
124/// remaining `O(n · m)` in that rare degenerate case while keeping the public
125/// routine total and panic-free.
126fn naive_match(p: &[u8], t: &[u8]) -> Vec<usize> {
127 let m = p.len();
128 let n = t.len();
129 if m == 0 || m > n {
130 return Vec::new();
131 }
132 let mut out = Vec::new();
133 for start in 0..=(n - m) {
134 if &t[start..start + m] == p {
135 out.push(start);
136 }
137 }
138 out
139}
140
141/// Pick a byte value that occurs in neither `p` nor `t`, if one exists.
142fn free_separator(p: &[u8], t: &[u8]) -> Option<u8> {
143 let mut used = [false; 256];
144 for &b in p {
145 used[b as usize] = true;
146 }
147 for &b in t {
148 used[b as usize] = true;
149 }
150 (0u16..256).find(|&v| !used[v as usize]).map(|v| v as u8)
151}
152
153/// Find every occurrence of `pattern` in `text` using the Z-algorithm on the
154/// `pattern ++ sep ++ text` construction, returning start offsets in ascending
155/// order.
156///
157/// Each returned index `j` satisfies `text[j..j + pattern.len()] == pattern`.
158/// Overlapping occurrences are all reported (e.g. searching `"aa"` in `"aaaa"`
159/// yields `[0, 1, 2]`).
160///
161/// # Errors
162///
163/// Returns [`SeqError::EmptyInput`] if `pattern` is empty: an empty pattern has
164/// no well-defined occurrence set here (it would "match" at every position),
165/// matching the convention used by the Aho–Corasick automaton in this crate.
166///
167/// # Examples
168///
169/// ```
170/// use oxicuda_seq::string::z_search;
171///
172/// assert_eq!(z_search(b"ab", b"abcabxabc").expect("ok"), vec![0, 3, 6]);
173/// assert_eq!(z_search(b"aa", b"aaaa").expect("ok"), vec![0, 1, 2]);
174/// assert!(z_search(b"z", b"abc").expect("ok").is_empty());
175/// ```
176pub fn z_search(pattern: &[u8], text: &[u8]) -> SeqResult<Vec<usize>> {
177 if pattern.is_empty() {
178 return Err(SeqError::EmptyInput);
179 }
180 let m = pattern.len();
181 if m > text.len() {
182 return Ok(Vec::new());
183 }
184
185 // Degenerate alphabet: no separator available → direct scan.
186 let sep = match free_separator(pattern, text) {
187 Some(sep) => sep,
188 None => return Ok(naive_match(pattern, text)),
189 };
190
191 // s = pattern ++ sep ++ text.
192 let mut s = Vec::with_capacity(m + 1 + text.len());
193 s.extend_from_slice(pattern);
194 s.push(sep);
195 s.extend_from_slice(text);
196
197 let z = z_array(&s)?;
198
199 // Occurrences end-align where z reaches the full pattern length. The
200 // separator guarantees z[i] ≤ m, so equality with m is the match test and
201 // no value can leak across the boundary (no false positives).
202 let mut out = Vec::new();
203 let offset = m + 1; // start of the text within s
204 for i in offset..s.len() {
205 if z[i] >= m {
206 out.push(i - offset);
207 }
208 }
209 Ok(out)
210}
211
212#[cfg(test)]
213mod tests {
214 use super::*;
215
216 /// Brute-force `O(n²)` Z-array oracle straight from the definition.
217 fn brute_force_z(s: &[u8]) -> Vec<usize> {
218 let n = s.len();
219 let mut z = vec![0usize; n];
220 for i in 1..n {
221 let mut k = 0;
222 while i + k < n && s[k] == s[i + k] {
223 k += 1;
224 }
225 z[i] = k;
226 }
227 z
228 }
229
230 /// Naive all-occurrences search oracle.
231 fn naive_search(p: &[u8], t: &[u8]) -> Vec<usize> {
232 let (m, n) = (p.len(), t.len());
233 if m == 0 || m > n {
234 return Vec::new();
235 }
236 (0..=(n - m)).filter(|&i| &t[i..i + m] == p).collect()
237 }
238
239 fn random_bytes(rng: &mut crate::handle::LcgRng, alphabet: &[u8], len: usize) -> Vec<u8> {
240 (0..len)
241 .map(|_| alphabet[rng.next_usize(alphabet.len())])
242 .collect()
243 }
244
245 /// (a) The Z-array matches brute force on many random strings.
246 #[test]
247 fn z_array_matches_brute_force() {
248 let mut rng = crate::handle::LcgRng::new(0xED);
249 for &alphabet in &[b"ab".as_slice(), b"abc", b"abcd"] {
250 for _ in 0..500 {
251 let len = 1 + rng.next_usize(40);
252 let s = random_bytes(&mut rng, alphabet, len);
253 let fast = z_array(&s).expect("non-empty");
254 let slow = brute_force_z(&s);
255 assert_eq!(fast, slow, "Z-array mismatch for {s:?}");
256 }
257 }
258 }
259
260 /// (b) Pattern matching via the P$T construction finds ALL occurrences.
261 #[test]
262 fn search_finds_all_occurrences() {
263 let mut rng = crate::handle::LcgRng::new(7);
264 for &alphabet in &[b"ab".as_slice(), b"abc"] {
265 for _ in 0..500 {
266 let tlen = 1 + rng.next_usize(40);
267 let plen = 1 + rng.next_usize(6);
268 let t = random_bytes(&mut rng, alphabet, tlen);
269 let p = random_bytes(&mut rng, alphabet, plen);
270 let got = z_search(&p, &t).expect("non-empty pattern");
271 let want = naive_search(&p, &t);
272 assert_eq!(got, want, "search mismatch for p={p:?} t={t:?}");
273 }
274 }
275 }
276
277 /// (c) Periodic strings give the expected Z structure.
278 #[test]
279 fn periodic_strings_have_expected_z() {
280 // "aaaa": z[i] = n - i for i >= 1 (every suffix is a prefix run of 'a').
281 let z = z_array(b"aaaa").expect("non-empty");
282 assert_eq!(z, vec![0, 3, 2, 1]);
283
284 // "abcabc": period 3. z[3] = 3 (whole "abc" repeats), z[1]=z[2]=0,
285 // z[4]=z[5]=0 because the tail "bc"/"c" do not start with 'a'.
286 let z = z_array(b"abcabc").expect("non-empty");
287 assert_eq!(z, vec![0, 0, 0, 3, 0, 0]);
288
289 // "ababab": period 2. z[2]=4 ("abab"), z[4]=2 ("ab"), odd positions 0.
290 let z = z_array(b"ababab").expect("non-empty");
291 assert_eq!(z, vec![0, 0, 4, 0, 2, 0]);
292 }
293
294 /// (d) The z[0] convention is documented and consistently zero.
295 #[test]
296 fn z0_convention_is_zero() {
297 for s in [
298 b"a".as_slice(),
299 b"abc",
300 b"aaaa",
301 b"abacabad",
302 b"mississippi",
303 ] {
304 let z = z_array(s).expect("non-empty");
305 assert_eq!(z[0], 0, "z[0] must be 0 by convention for {s:?}");
306 }
307 }
308
309 /// (e) Single char and empty input behave per contract.
310 #[test]
311 fn single_char_and_empty() {
312 let z = z_array(b"x").expect("non-empty");
313 assert_eq!(z, vec![0]);
314
315 assert!(matches!(z_array(b""), Err(SeqError::EmptyInput)));
316 assert!(matches!(z_search(b"", b"abc"), Err(SeqError::EmptyInput)));
317
318 // Pattern longer than text → no matches, but not an error.
319 assert!(z_search(b"abcd", b"ab").expect("ok").is_empty());
320 }
321
322 /// (f) No false positives can leak across the separator.
323 ///
324 /// We construct a case where a naive concatenation without a separator would
325 /// spuriously match: pattern "ab", text "...a" then "b..." straddling. The
326 /// separator construction must report exactly the genuine in-text matches.
327 #[test]
328 fn no_false_positives_across_separator() {
329 // Pattern equals the boundary of pattern+text if no separator were used.
330 // p = "ba", t = "xb" → with naive p++t = "baxb"; the suffix overlap must
331 // not be reported. Genuine occurrences of "ba" in "xb" = none.
332 assert!(z_search(b"ba", b"xb").expect("ok").is_empty());
333
334 // p = "ab", text ends in 'a', next pattern char 'b' would join across.
335 // t = "zzza" has no "ab"; ensure empty.
336 assert!(z_search(b"ab", b"zzza").expect("ok").is_empty());
337
338 // Positive control with the same alphabet so the test really exercises
339 // matching, not just rejection.
340 assert_eq!(z_search(b"ab", b"zzzab").expect("ok"), vec![3]);
341 }
342
343 /// Whole-string and prefix patterns at the boundaries.
344 #[test]
345 fn boundary_patterns() {
346 // Pattern == text.
347 assert_eq!(z_search(b"abc", b"abc").expect("ok"), vec![0]);
348 // Pattern at the very end.
349 assert_eq!(z_search(b"c", b"aabc").expect("ok"), vec![3]);
350 // Pattern at the very start.
351 assert_eq!(z_search(b"aa", b"aab").expect("ok"), vec![0]);
352 }
353
354 /// Degenerate full-alphabet fallback path: every byte value used, so no
355 /// separator exists and the naive matcher must still be correct.
356 #[test]
357 fn full_alphabet_fallback() {
358 // Text uses all 256 byte values once.
359 let text: Vec<u8> = (0u16..256).map(|v| v as u8).collect();
360 // Pattern present in the middle.
361 let pattern = &text[100..104];
362 assert!(free_separator(pattern, &text).is_none());
363 let got = z_search(pattern, &text).expect("ok");
364 assert_eq!(got, vec![100]);
365
366 // Absent pattern over the full alphabet (reversed pair not contiguous).
367 let absent = [200u8, 5u8, 200u8];
368 let got = z_search(&absent, &text).expect("ok");
369 assert!(got.is_empty());
370 }
371}