oxicuda_seq/string/fm_index.rs
1//! Burrows–Wheeler transform and FM-index with backward search.
2//!
3//! References:
4//! * Michael Burrows & David J. Wheeler, *"A Block-sorting Lossless Data
5//! Compression Algorithm"*, Digital SRC Research Report 124, 1994 — the BWT
6//! and its LF-mapping inverse.
7//! * Paolo Ferragina & Giovanni Manzini, *"Opportunistic Data Structures with
8//! Applications"*, FOCS 2000, pp. 390–398 — the FM-index (`C` array + `Occ`
9//! rank) and backward search for `count`/`locate`.
10//!
11//! # Construction
12//!
13//! A unique sentinel `$` — modelled here as a rank smaller than every real byte
14//! — is appended to the text. The BWT is then read off the suffix array (built
15//! by [`crate::string::SuffixArray`], reused verbatim): for each suffix position
16//! `sa[i]`, the BWT character is the byte *preceding* that suffix cyclically,
17//! `T[sa[i] − 1]`, with the position `0` mapping to the sentinel.
18//!
19//! ```text
20//! T = "banana", T$ = "banana$"
21//! sorted rotations → BWT = "annb$aa" (last column)
22//! ```
23//!
24//! # FM-index
25//!
26//! Two precomputed tables turn the BWT into a searchable index:
27//!
28//! * `C[c]` — the number of characters in `T$` that are **strictly smaller**
29//! than `c`; equivalently the index of the first row beginning with `c`.
30//! * `Occ(c, i)` — the number of occurrences of `c` in `BWT[0..i]` (a rank
31//! query). Stored here as a full `|Σ| × (n+1)` prefix-sum table for `O(1)`
32//! rank, which is `O(n |Σ|)` space — appropriate for the byte alphabet and
33//! exact, deterministic queries.
34//!
35//! The **LF-mapping** `LF(i) = C[BWT[i]] + Occ(BWT[i], i)` sends row `i` to the
36//! row obtained by rotating its first column to the last; iterating it from the
37//! sentinel row reconstructs `T` right-to-left, which is exactly the inverse
38//! BWT.
39//!
40//! # Backward search
41//!
42//! Matching a pattern `p` proceeds from its last character to its first,
43//! maintaining the half-open SA range `[lo, hi)` of rows whose suffix is
44//! prefixed by the processed pattern tail:
45//!
46//! ```text
47//! lo ← C[c] + Occ(c, lo)
48//! hi ← C[c] + Occ(c, hi)
49//! ```
50//!
51//! When the range becomes empty the pattern is absent. [`FmIndex::count`]
52//! returns `hi − lo`; [`FmIndex::locate`] maps each row in the final range back
53//! to a text position through the stored suffix array.
54//!
55//! Inputs are raw bytes (`&[u8]`).
56
57use crate::error::{SeqError, SeqResult};
58use crate::string::SuffixArray;
59
60/// Alphabet cardinality including the sentinel: 256 byte values plus `$`.
61const SIGMA: usize = 257;
62
63/// Symbol code for the sentinel `$` (strictly smallest).
64const SENTINEL: usize = 0;
65
66/// Map a data byte to its FM-index symbol code (`1..=256`, leaving `0` for `$`).
67fn code_of(byte: u8) -> usize {
68 byte as usize + 1
69}
70
71/// An FM-index over a byte string: BWT, `C` array, and `Occ` rank table, with
72/// the suffix array retained for `locate`.
73///
74/// Build with [`FmIndex::new`]. The index supports exact-occurrence
75/// [`FmIndex::count`], position [`FmIndex::locate`], and full text recovery via
76/// [`FmIndex::inverse_bwt`] (the BWT round-trips).
77///
78/// # Examples
79///
80/// ```
81/// use oxicuda_seq::string::FmIndex;
82///
83/// let fm = FmIndex::new(b"banana").expect("non-empty");
84/// assert_eq!(fm.count(b"ana"), 2);
85/// assert_eq!(fm.locate(b"ana"), vec![1, 3]);
86/// assert_eq!(fm.count(b"xyz"), 0);
87/// assert_eq!(fm.inverse_bwt(), b"banana");
88/// ```
89#[derive(Debug, Clone)]
90pub struct FmIndex {
91 /// BWT as symbol codes over `T$` (length `n + 1`).
92 bwt: Vec<usize>,
93 /// Suffix array of `T$` (length `n + 1`); `sa[i]` is a text position into
94 /// `T$`, where position `n` denotes the sentinel.
95 sa: Vec<usize>,
96 /// `C[c]` = number of symbols in `T$` strictly less than `c`.
97 c: Vec<usize>,
98 /// `occ[c][i]` = occurrences of symbol `c` in `bwt[0..i]`, for `i ≤ n+1`.
99 occ: Vec<Vec<usize>>,
100 /// Length of the original text (without the sentinel).
101 text_len: usize,
102}
103
104impl FmIndex {
105 /// Build the FM-index of `s`.
106 ///
107 /// Internally appends a unique sentinel, derives the BWT from the reused
108 /// SA-IS suffix array, and precomputes the `C` and `Occ` tables.
109 ///
110 /// # Errors
111 ///
112 /// Returns [`SeqError::EmptyInput`] for an empty `s`, consistent with the
113 /// sibling string modules.
114 pub fn new(s: &[u8]) -> SeqResult<Self> {
115 if s.is_empty() {
116 return Err(SeqError::EmptyInput);
117 }
118 let n = s.len();
119
120 // Suffix array of T (without sentinel), reusing module 2.
121 let sa_no_sentinel = SuffixArray::new(s)?;
122 let sa_t = sa_no_sentinel.sa();
123
124 // Build the suffix array of T$ by hand: the sentinel suffix (position n)
125 // is lexicographically smallest, so it leads, followed by the suffixes
126 // of T in the same relative order (the sentinel only ever appears at the
127 // very end, so it cannot change the order among the real suffixes).
128 let mut sa: Vec<usize> = Vec::with_capacity(n + 1);
129 sa.push(n); // the sentinel suffix "$"
130 sa.extend_from_slice(sa_t);
131
132 // BWT over T$: bwt[i] = symbol preceding suffix sa[i] cyclically.
133 let mut bwt = vec![0usize; n + 1];
134 for (i, &p) in sa.iter().enumerate() {
135 bwt[i] = if p == 0 {
136 SENTINEL // wraps to the sentinel
137 } else {
138 code_of(s[p - 1])
139 };
140 }
141
142 // C array: counts of each symbol in T$, then exclusive prefix sum.
143 let mut counts = vec![0usize; SIGMA];
144 counts[SENTINEL] += 1; // the single sentinel
145 for &b in s {
146 counts[code_of(b)] += 1;
147 }
148 let mut c = vec![0usize; SIGMA];
149 let mut acc = 0usize;
150 for sym in 0..SIGMA {
151 c[sym] = acc;
152 acc += counts[sym];
153 }
154
155 // Occ table: occ[sym][i] = occurrences of sym in bwt[0..i].
156 let mut occ = vec![vec![0usize; n + 2]; SIGMA];
157 for i in 0..=n {
158 let sym = bwt[i];
159 for s_idx in 0..SIGMA {
160 occ[s_idx][i + 1] = occ[s_idx][i];
161 }
162 occ[sym][i + 1] += 1;
163 }
164
165 Ok(Self {
166 bwt,
167 sa,
168 c,
169 occ,
170 text_len: n,
171 })
172 }
173
174 /// Length of the original text (excluding the sentinel).
175 pub fn text_len(&self) -> usize {
176 self.text_len
177 }
178
179 /// Borrow the BWT as raw bytes, mapping the sentinel to `sentinel_byte`.
180 ///
181 /// The sentinel has no byte value of its own, so the caller supplies the
182 /// placeholder used to render it. The placeholder is *not* required to be
183 /// absent from the text; it is purely cosmetic for inspection/printing.
184 pub fn bwt_bytes(&self, sentinel_byte: u8) -> Vec<u8> {
185 self.bwt
186 .iter()
187 .map(|&sym| {
188 if sym == SENTINEL {
189 sentinel_byte
190 } else {
191 (sym - 1) as u8
192 }
193 })
194 .collect()
195 }
196
197 /// `Occ(sym, i)`: occurrences of symbol `sym` in `BWT[0..i]`.
198 fn occ(&self, sym: usize, i: usize) -> usize {
199 self.occ[sym][i]
200 }
201
202 /// The LF-mapping `LF(i) = C[BWT[i]] + Occ(BWT[i], i)`.
203 fn lf(&self, i: usize) -> usize {
204 let sym = self.bwt[i];
205 self.c[sym] + self.occ(sym, i)
206 }
207
208 /// Recover the original text by inverting the BWT through the LF-mapping.
209 ///
210 /// Starts at the sentinel row (row `0`, since the sentinel sorts first) and
211 /// walks LF backwards, emitting characters right-to-left.
212 pub fn inverse_bwt(&self) -> Vec<u8> {
213 let n = self.text_len;
214 let mut out = vec![0u8; n];
215 // Row 0 is the sentinel row (T$ sorted ⇒ "$..." is first). The character
216 // BWT[0] is the last real character of T; walking LF reveals the rest.
217 let mut row = 0usize;
218 for k in (0..n).rev() {
219 let sym = self.bwt[row];
220 // sym is never the sentinel here for the first n steps because the
221 // sentinel appears exactly once and is reached only after all n real
222 // characters have been emitted.
223 out[k] = (sym - 1) as u8;
224 row = self.lf(row);
225 }
226 out
227 }
228
229 /// Backward-search the half-open SA range `[lo, hi)` of rows whose suffix is
230 /// prefixed by `pattern`. Returns `None` for an empty pattern (no defined
231 /// range) and an empty range `lo == hi` when absent.
232 fn backward_search(&self, pattern: &[u8]) -> Option<(usize, usize)> {
233 if pattern.is_empty() {
234 return None;
235 }
236 let mut lo = 0usize;
237 let mut hi = self.sa.len(); // n + 1
238 for &b in pattern.iter().rev() {
239 let sym = code_of(b);
240 lo = self.c[sym] + self.occ(sym, lo);
241 hi = self.c[sym] + self.occ(sym, hi);
242 if lo >= hi {
243 return Some((lo, lo)); // empty range; pattern absent
244 }
245 }
246 Some((lo, hi))
247 }
248
249 /// Number of occurrences of `pattern` in the text (backward search).
250 ///
251 /// Returns `0` for an empty pattern or when the pattern does not occur.
252 pub fn count(&self, pattern: &[u8]) -> usize {
253 match self.backward_search(pattern) {
254 Some((lo, hi)) => hi - lo,
255 None => 0,
256 }
257 }
258
259 /// Sorted text positions of every occurrence of `pattern`.
260 ///
261 /// Each row in the final backward-search range maps to a text position
262 /// through the stored suffix array. Returns an empty vector for an empty or
263 /// absent pattern.
264 pub fn locate(&self, pattern: &[u8]) -> Vec<usize> {
265 match self.backward_search(pattern) {
266 Some((lo, hi)) if lo < hi => {
267 let mut positions: Vec<usize> = self.sa[lo..hi].to_vec();
268 positions.sort_unstable();
269 positions
270 }
271 _ => Vec::new(),
272 }
273 }
274}
275
276#[cfg(test)]
277mod tests {
278 use super::*;
279
280 fn naive_search(p: &[u8], t: &[u8]) -> Vec<usize> {
281 let (m, n) = (p.len(), t.len());
282 if m == 0 || m > n {
283 return Vec::new();
284 }
285 (0..=(n - m)).filter(|&i| &t[i..i + m] == p).collect()
286 }
287
288 fn random_bytes(rng: &mut crate::handle::LcgRng, alphabet: &[u8], len: usize) -> Vec<u8> {
289 (0..len)
290 .map(|_| alphabet[rng.next_usize(alphabet.len())])
291 .collect()
292 }
293
294 /// (a) The BWT is invertible: inverse-BWT recovers the original exactly.
295 #[test]
296 fn bwt_round_trips() {
297 for s in [
298 b"banana".as_slice(),
299 b"mississippi",
300 b"abracadabra",
301 b"aaaa",
302 b"a",
303 b"the quick brown fox",
304 ] {
305 let fm = FmIndex::new(s).expect("non-empty");
306 assert_eq!(fm.inverse_bwt(), s, "round-trip for {s:?}");
307 }
308 let mut rng = crate::handle::LcgRng::new(101);
309 for &alphabet in &[b"a".as_slice(), b"ab", b"abc", b"abcd"] {
310 for _ in 0..400 {
311 let len = 1 + rng.next_usize(40);
312 let s = random_bytes(&mut rng, alphabet, len);
313 let fm = FmIndex::new(&s).expect("non-empty");
314 assert_eq!(fm.inverse_bwt(), s, "round-trip for {s:?}");
315 }
316 }
317 }
318
319 /// (b) Backward-search count equals the true number of occurrences,
320 /// including absent patterns (→ 0).
321 #[test]
322 fn count_matches_naive() {
323 let fm = FmIndex::new(b"mississippi").expect("non-empty");
324 assert_eq!(fm.count(b"issi"), 2);
325 assert_eq!(fm.count(b"ss"), 2);
326 assert_eq!(fm.count(b"i"), 4);
327 assert_eq!(fm.count(b"mississippi"), 1);
328 assert_eq!(fm.count(b"xyz"), 0); // absent
329 assert_eq!(fm.count(b"ppp"), 0); // absent
330 assert_eq!(fm.count(b""), 0); // empty pattern
331
332 let mut rng = crate::handle::LcgRng::new(202);
333 for &alphabet in &[b"ab".as_slice(), b"abc"] {
334 for _ in 0..400 {
335 let tlen = 1 + rng.next_usize(40);
336 let plen = 1 + rng.next_usize(5);
337 let t = random_bytes(&mut rng, alphabet, tlen);
338 let p = random_bytes(&mut rng, alphabet, plen);
339 let fm = FmIndex::new(&t).expect("non-empty");
340 assert_eq!(fm.count(&p), naive_search(&p, &t).len(), "p={p:?} t={t:?}");
341 }
342 }
343 }
344
345 /// (c) Locate returns the correct sorted positions.
346 #[test]
347 fn locate_matches_naive() {
348 let fm = FmIndex::new(b"banana").expect("non-empty");
349 assert_eq!(fm.locate(b"ana"), vec![1, 3]);
350 assert_eq!(fm.locate(b"a"), vec![1, 3, 5]);
351 assert_eq!(fm.locate(b"na"), vec![2, 4]);
352 assert!(fm.locate(b"xyz").is_empty());
353 assert!(fm.locate(b"").is_empty());
354
355 let mut rng = crate::handle::LcgRng::new(303);
356 for &alphabet in &[b"ab".as_slice(), b"abc"] {
357 for _ in 0..400 {
358 let tlen = 1 + rng.next_usize(40);
359 let plen = 1 + rng.next_usize(5);
360 let t = random_bytes(&mut rng, alphabet, tlen);
361 let p = random_bytes(&mut rng, alphabet, plen);
362 let fm = FmIndex::new(&t).expect("non-empty");
363 let mut want = naive_search(&p, &t);
364 want.sort_unstable();
365 assert_eq!(fm.locate(&p), want, "p={p:?} t={t:?}");
366 }
367 }
368 }
369
370 /// (d) The LF-mapping is a permutation of the rows.
371 #[test]
372 fn lf_is_permutation() {
373 let mut rng = crate::handle::LcgRng::new(404);
374 for _ in 0..300 {
375 let len = 1 + rng.next_usize(30);
376 let s = random_bytes(&mut rng, b"abc", len);
377 let fm = FmIndex::new(&s).expect("non-empty");
378 let rows = fm.sa.len(); // n + 1
379 let mut seen = vec![false; rows];
380 for i in 0..rows {
381 let target = fm.lf(i);
382 assert!(target < rows, "LF out of range");
383 assert!(!seen[target], "LF not injective at {i}");
384 seen[target] = true;
385 }
386 assert!(seen.iter().all(|&b| b), "LF not surjective");
387 }
388 }
389
390 /// (e) C/Occ consistency: Occ monotone non-decreasing in i, C cumulative.
391 #[test]
392 fn c_and_occ_consistent() {
393 let mut rng = crate::handle::LcgRng::new(505);
394 for _ in 0..100 {
395 let len = 1 + rng.next_usize(30);
396 let s = random_bytes(&mut rng, b"abc", len);
397 let fm = FmIndex::new(&s).expect("non-empty");
398 let rows = fm.sa.len();
399
400 // C is cumulative (non-decreasing) and starts at 0.
401 assert_eq!(fm.c[SENTINEL], 0);
402 for sym in 1..SIGMA {
403 assert!(fm.c[sym] >= fm.c[sym - 1], "C not cumulative");
404 }
405 // C[max] + (count of max sym) == rows; equivalently the last bucket
406 // boundary plus its size equals the total number of rows.
407 assert!(*fm.c.last().expect("non-empty C") <= rows);
408
409 // Occ monotone in i for every symbol, and Occ(sym, rows) totals to
410 // the count of sym, whose sum over symbols equals rows.
411 let mut total = 0usize;
412 for sym in 0..SIGMA {
413 for i in 0..rows {
414 assert!(fm.occ(sym, i + 1) >= fm.occ(sym, i), "Occ not monotone");
415 }
416 total += fm.occ(sym, rows);
417 }
418 assert_eq!(total, rows, "Occ totals must sum to row count");
419
420 // Cross-check C against Occ at the end: C[sym] equals the number of
421 // BWT symbols strictly less than sym, i.e. Σ_{k<sym} Occ(k, rows).
422 let mut acc = 0usize;
423 for sym in 0..SIGMA {
424 assert_eq!(fm.c[sym], acc, "C vs Occ mismatch at {sym}");
425 acc += fm.occ(sym, rows);
426 }
427 }
428 }
429
430 /// (f) A sentinel-terminated string round-trips (text containing the byte we
431 /// later render the sentinel as — the internal sentinel is distinct, so this
432 /// still works).
433 #[test]
434 fn sentinel_rendering_does_not_collide() {
435 // Use a text that contains byte 0 to prove the internal sentinel is a
436 // separate symbol from any data byte (the BWT is over symbol codes).
437 let s = &[0u8, 1u8, 0u8, 2u8, 0u8];
438 let fm = FmIndex::new(s).expect("non-empty");
439 assert_eq!(fm.inverse_bwt(), s, "round-trip with embedded zero bytes");
440 assert_eq!(fm.count(&[0u8]), 3);
441 assert_eq!(fm.locate(&[0u8]), vec![0, 2, 4]);
442 assert_eq!(fm.count(&[0u8, 0u8]), 0); // no adjacent zeros
443
444 // The rendered BWT with '$' as a placeholder has the sentinel exactly
445 // once even though byte 0 occurs three times in the source.
446 let rendered = fm.bwt_bytes(b'$');
447 assert_eq!(rendered.iter().filter(|&&b| b == b'$').count(), 1);
448 }
449
450 /// Empty input is rejected.
451 #[test]
452 fn empty_input_errors() {
453 assert!(matches!(FmIndex::new(b""), Err(SeqError::EmptyInput)));
454 }
455
456 /// The BWT of "banana" is the textbook "annb$aa".
457 #[test]
458 fn banana_bwt_textbook() {
459 let fm = FmIndex::new(b"banana").expect("non-empty");
460 let rendered = fm.bwt_bytes(b'$');
461 assert_eq!(rendered.as_slice(), b"annb$aa");
462 }
463}