Skip to main content

oxicuda_seq/string/
fm_index.rs

1//! Burrows–Wheeler transform and FM-index with backward search.
2//!
3//! References:
4//! * Michael Burrows & David J. Wheeler, *"A Block-sorting Lossless Data
5//!   Compression Algorithm"*, Digital SRC Research Report 124, 1994 — the BWT
6//!   and its LF-mapping inverse.
7//! * Paolo Ferragina & Giovanni Manzini, *"Opportunistic Data Structures with
8//!   Applications"*, FOCS 2000, pp. 390–398 — the FM-index (`C` array + `Occ`
9//!   rank) and backward search for `count`/`locate`.
10//!
11//! # Construction
12//!
13//! A unique sentinel `$` — modelled here as a rank smaller than every real byte
14//! — is appended to the text. The BWT is then read off the suffix array (built
15//! by [`crate::string::SuffixArray`], reused verbatim): for each suffix position
16//! `sa[i]`, the BWT character is the byte *preceding* that suffix cyclically,
17//! `T[sa[i] − 1]`, with the position `0` mapping to the sentinel.
18//!
19//! ```text
20//! T = "banana", T$ = "banana$"
21//! sorted rotations →  BWT = "annb$aa"   (last column)
22//! ```
23//!
24//! # FM-index
25//!
26//! Two precomputed tables turn the BWT into a searchable index:
27//!
28//! * `C[c]` — the number of characters in `T$` that are **strictly smaller**
29//!   than `c`; equivalently the index of the first row beginning with `c`.
30//! * `Occ(c, i)` — the number of occurrences of `c` in `BWT[0..i]` (a rank
31//!   query). Stored here as a full `|Σ| × (n+1)` prefix-sum table for `O(1)`
32//!   rank, which is `O(n |Σ|)` space — appropriate for the byte alphabet and
33//!   exact, deterministic queries.
34//!
35//! The **LF-mapping** `LF(i) = C[BWT[i]] + Occ(BWT[i], i)` sends row `i` to the
36//! row obtained by rotating its first column to the last; iterating it from the
37//! sentinel row reconstructs `T` right-to-left, which is exactly the inverse
38//! BWT.
39//!
40//! # Backward search
41//!
42//! Matching a pattern `p` proceeds from its last character to its first,
43//! maintaining the half-open SA range `[lo, hi)` of rows whose suffix is
44//! prefixed by the processed pattern tail:
45//!
46//! ```text
47//! lo ← C[c] + Occ(c, lo)
48//! hi ← C[c] + Occ(c, hi)
49//! ```
50//!
51//! When the range becomes empty the pattern is absent. [`FmIndex::count`]
52//! returns `hi − lo`; [`FmIndex::locate`] maps each row in the final range back
53//! to a text position through the stored suffix array.
54//!
55//! Inputs are raw bytes (`&[u8]`).
56
57use crate::error::{SeqError, SeqResult};
58use crate::string::SuffixArray;
59
60/// Alphabet cardinality including the sentinel: 256 byte values plus `$`.
61const SIGMA: usize = 257;
62
63/// Symbol code for the sentinel `$` (strictly smallest).
64const SENTINEL: usize = 0;
65
66/// Map a data byte to its FM-index symbol code (`1..=256`, leaving `0` for `$`).
67fn code_of(byte: u8) -> usize {
68    byte as usize + 1
69}
70
71/// An FM-index over a byte string: BWT, `C` array, and `Occ` rank table, with
72/// the suffix array retained for `locate`.
73///
74/// Build with [`FmIndex::new`]. The index supports exact-occurrence
75/// [`FmIndex::count`], position [`FmIndex::locate`], and full text recovery via
76/// [`FmIndex::inverse_bwt`] (the BWT round-trips).
77///
78/// # Examples
79///
80/// ```
81/// use oxicuda_seq::string::FmIndex;
82///
83/// let fm = FmIndex::new(b"banana").expect("non-empty");
84/// assert_eq!(fm.count(b"ana"), 2);
85/// assert_eq!(fm.locate(b"ana"), vec![1, 3]);
86/// assert_eq!(fm.count(b"xyz"), 0);
87/// assert_eq!(fm.inverse_bwt(), b"banana");
88/// ```
89#[derive(Debug, Clone)]
90pub struct FmIndex {
91    /// BWT as symbol codes over `T$` (length `n + 1`).
92    bwt: Vec<usize>,
93    /// Suffix array of `T$` (length `n + 1`); `sa[i]` is a text position into
94    /// `T$`, where position `n` denotes the sentinel.
95    sa: Vec<usize>,
96    /// `C[c]` = number of symbols in `T$` strictly less than `c`.
97    c: Vec<usize>,
98    /// `occ[c][i]` = occurrences of symbol `c` in `bwt[0..i]`, for `i ≤ n+1`.
99    occ: Vec<Vec<usize>>,
100    /// Length of the original text (without the sentinel).
101    text_len: usize,
102}
103
104impl FmIndex {
105    /// Build the FM-index of `s`.
106    ///
107    /// Internally appends a unique sentinel, derives the BWT from the reused
108    /// SA-IS suffix array, and precomputes the `C` and `Occ` tables.
109    ///
110    /// # Errors
111    ///
112    /// Returns [`SeqError::EmptyInput`] for an empty `s`, consistent with the
113    /// sibling string modules.
114    pub fn new(s: &[u8]) -> SeqResult<Self> {
115        if s.is_empty() {
116            return Err(SeqError::EmptyInput);
117        }
118        let n = s.len();
119
120        // Suffix array of T (without sentinel), reusing module 2.
121        let sa_no_sentinel = SuffixArray::new(s)?;
122        let sa_t = sa_no_sentinel.sa();
123
124        // Build the suffix array of T$ by hand: the sentinel suffix (position n)
125        // is lexicographically smallest, so it leads, followed by the suffixes
126        // of T in the same relative order (the sentinel only ever appears at the
127        // very end, so it cannot change the order among the real suffixes).
128        let mut sa: Vec<usize> = Vec::with_capacity(n + 1);
129        sa.push(n); // the sentinel suffix "$"
130        sa.extend_from_slice(sa_t);
131
132        // BWT over T$: bwt[i] = symbol preceding suffix sa[i] cyclically.
133        let mut bwt = vec![0usize; n + 1];
134        for (i, &p) in sa.iter().enumerate() {
135            bwt[i] = if p == 0 {
136                SENTINEL // wraps to the sentinel
137            } else {
138                code_of(s[p - 1])
139            };
140        }
141
142        // C array: counts of each symbol in T$, then exclusive prefix sum.
143        let mut counts = vec![0usize; SIGMA];
144        counts[SENTINEL] += 1; // the single sentinel
145        for &b in s {
146            counts[code_of(b)] += 1;
147        }
148        let mut c = vec![0usize; SIGMA];
149        let mut acc = 0usize;
150        for sym in 0..SIGMA {
151            c[sym] = acc;
152            acc += counts[sym];
153        }
154
155        // Occ table: occ[sym][i] = occurrences of sym in bwt[0..i].
156        let mut occ = vec![vec![0usize; n + 2]; SIGMA];
157        for i in 0..=n {
158            let sym = bwt[i];
159            for s_idx in 0..SIGMA {
160                occ[s_idx][i + 1] = occ[s_idx][i];
161            }
162            occ[sym][i + 1] += 1;
163        }
164
165        Ok(Self {
166            bwt,
167            sa,
168            c,
169            occ,
170            text_len: n,
171        })
172    }
173
174    /// Length of the original text (excluding the sentinel).
175    pub fn text_len(&self) -> usize {
176        self.text_len
177    }
178
179    /// Borrow the BWT as raw bytes, mapping the sentinel to `sentinel_byte`.
180    ///
181    /// The sentinel has no byte value of its own, so the caller supplies the
182    /// placeholder used to render it. The placeholder is *not* required to be
183    /// absent from the text; it is purely cosmetic for inspection/printing.
184    pub fn bwt_bytes(&self, sentinel_byte: u8) -> Vec<u8> {
185        self.bwt
186            .iter()
187            .map(|&sym| {
188                if sym == SENTINEL {
189                    sentinel_byte
190                } else {
191                    (sym - 1) as u8
192                }
193            })
194            .collect()
195    }
196
197    /// `Occ(sym, i)`: occurrences of symbol `sym` in `BWT[0..i]`.
198    fn occ(&self, sym: usize, i: usize) -> usize {
199        self.occ[sym][i]
200    }
201
202    /// The LF-mapping `LF(i) = C[BWT[i]] + Occ(BWT[i], i)`.
203    fn lf(&self, i: usize) -> usize {
204        let sym = self.bwt[i];
205        self.c[sym] + self.occ(sym, i)
206    }
207
208    /// Recover the original text by inverting the BWT through the LF-mapping.
209    ///
210    /// Starts at the sentinel row (row `0`, since the sentinel sorts first) and
211    /// walks LF backwards, emitting characters right-to-left.
212    pub fn inverse_bwt(&self) -> Vec<u8> {
213        let n = self.text_len;
214        let mut out = vec![0u8; n];
215        // Row 0 is the sentinel row (T$ sorted ⇒ "$..." is first). The character
216        // BWT[0] is the last real character of T; walking LF reveals the rest.
217        let mut row = 0usize;
218        for k in (0..n).rev() {
219            let sym = self.bwt[row];
220            // sym is never the sentinel here for the first n steps because the
221            // sentinel appears exactly once and is reached only after all n real
222            // characters have been emitted.
223            out[k] = (sym - 1) as u8;
224            row = self.lf(row);
225        }
226        out
227    }
228
229    /// Backward-search the half-open SA range `[lo, hi)` of rows whose suffix is
230    /// prefixed by `pattern`. Returns `None` for an empty pattern (no defined
231    /// range) and an empty range `lo == hi` when absent.
232    fn backward_search(&self, pattern: &[u8]) -> Option<(usize, usize)> {
233        if pattern.is_empty() {
234            return None;
235        }
236        let mut lo = 0usize;
237        let mut hi = self.sa.len(); // n + 1
238        for &b in pattern.iter().rev() {
239            let sym = code_of(b);
240            lo = self.c[sym] + self.occ(sym, lo);
241            hi = self.c[sym] + self.occ(sym, hi);
242            if lo >= hi {
243                return Some((lo, lo)); // empty range; pattern absent
244            }
245        }
246        Some((lo, hi))
247    }
248
249    /// Number of occurrences of `pattern` in the text (backward search).
250    ///
251    /// Returns `0` for an empty pattern or when the pattern does not occur.
252    pub fn count(&self, pattern: &[u8]) -> usize {
253        match self.backward_search(pattern) {
254            Some((lo, hi)) => hi - lo,
255            None => 0,
256        }
257    }
258
259    /// Sorted text positions of every occurrence of `pattern`.
260    ///
261    /// Each row in the final backward-search range maps to a text position
262    /// through the stored suffix array. Returns an empty vector for an empty or
263    /// absent pattern.
264    pub fn locate(&self, pattern: &[u8]) -> Vec<usize> {
265        match self.backward_search(pattern) {
266            Some((lo, hi)) if lo < hi => {
267                let mut positions: Vec<usize> = self.sa[lo..hi].to_vec();
268                positions.sort_unstable();
269                positions
270            }
271            _ => Vec::new(),
272        }
273    }
274}
275
276#[cfg(test)]
277mod tests {
278    use super::*;
279
280    fn naive_search(p: &[u8], t: &[u8]) -> Vec<usize> {
281        let (m, n) = (p.len(), t.len());
282        if m == 0 || m > n {
283            return Vec::new();
284        }
285        (0..=(n - m)).filter(|&i| &t[i..i + m] == p).collect()
286    }
287
288    fn random_bytes(rng: &mut crate::handle::LcgRng, alphabet: &[u8], len: usize) -> Vec<u8> {
289        (0..len)
290            .map(|_| alphabet[rng.next_usize(alphabet.len())])
291            .collect()
292    }
293
294    /// (a) The BWT is invertible: inverse-BWT recovers the original exactly.
295    #[test]
296    fn bwt_round_trips() {
297        for s in [
298            b"banana".as_slice(),
299            b"mississippi",
300            b"abracadabra",
301            b"aaaa",
302            b"a",
303            b"the quick brown fox",
304        ] {
305            let fm = FmIndex::new(s).expect("non-empty");
306            assert_eq!(fm.inverse_bwt(), s, "round-trip for {s:?}");
307        }
308        let mut rng = crate::handle::LcgRng::new(101);
309        for &alphabet in &[b"a".as_slice(), b"ab", b"abc", b"abcd"] {
310            for _ in 0..400 {
311                let len = 1 + rng.next_usize(40);
312                let s = random_bytes(&mut rng, alphabet, len);
313                let fm = FmIndex::new(&s).expect("non-empty");
314                assert_eq!(fm.inverse_bwt(), s, "round-trip for {s:?}");
315            }
316        }
317    }
318
319    /// (b) Backward-search count equals the true number of occurrences,
320    /// including absent patterns (→ 0).
321    #[test]
322    fn count_matches_naive() {
323        let fm = FmIndex::new(b"mississippi").expect("non-empty");
324        assert_eq!(fm.count(b"issi"), 2);
325        assert_eq!(fm.count(b"ss"), 2);
326        assert_eq!(fm.count(b"i"), 4);
327        assert_eq!(fm.count(b"mississippi"), 1);
328        assert_eq!(fm.count(b"xyz"), 0); // absent
329        assert_eq!(fm.count(b"ppp"), 0); // absent
330        assert_eq!(fm.count(b""), 0); // empty pattern
331
332        let mut rng = crate::handle::LcgRng::new(202);
333        for &alphabet in &[b"ab".as_slice(), b"abc"] {
334            for _ in 0..400 {
335                let tlen = 1 + rng.next_usize(40);
336                let plen = 1 + rng.next_usize(5);
337                let t = random_bytes(&mut rng, alphabet, tlen);
338                let p = random_bytes(&mut rng, alphabet, plen);
339                let fm = FmIndex::new(&t).expect("non-empty");
340                assert_eq!(fm.count(&p), naive_search(&p, &t).len(), "p={p:?} t={t:?}");
341            }
342        }
343    }
344
345    /// (c) Locate returns the correct sorted positions.
346    #[test]
347    fn locate_matches_naive() {
348        let fm = FmIndex::new(b"banana").expect("non-empty");
349        assert_eq!(fm.locate(b"ana"), vec![1, 3]);
350        assert_eq!(fm.locate(b"a"), vec![1, 3, 5]);
351        assert_eq!(fm.locate(b"na"), vec![2, 4]);
352        assert!(fm.locate(b"xyz").is_empty());
353        assert!(fm.locate(b"").is_empty());
354
355        let mut rng = crate::handle::LcgRng::new(303);
356        for &alphabet in &[b"ab".as_slice(), b"abc"] {
357            for _ in 0..400 {
358                let tlen = 1 + rng.next_usize(40);
359                let plen = 1 + rng.next_usize(5);
360                let t = random_bytes(&mut rng, alphabet, tlen);
361                let p = random_bytes(&mut rng, alphabet, plen);
362                let fm = FmIndex::new(&t).expect("non-empty");
363                let mut want = naive_search(&p, &t);
364                want.sort_unstable();
365                assert_eq!(fm.locate(&p), want, "p={p:?} t={t:?}");
366            }
367        }
368    }
369
370    /// (d) The LF-mapping is a permutation of the rows.
371    #[test]
372    fn lf_is_permutation() {
373        let mut rng = crate::handle::LcgRng::new(404);
374        for _ in 0..300 {
375            let len = 1 + rng.next_usize(30);
376            let s = random_bytes(&mut rng, b"abc", len);
377            let fm = FmIndex::new(&s).expect("non-empty");
378            let rows = fm.sa.len(); // n + 1
379            let mut seen = vec![false; rows];
380            for i in 0..rows {
381                let target = fm.lf(i);
382                assert!(target < rows, "LF out of range");
383                assert!(!seen[target], "LF not injective at {i}");
384                seen[target] = true;
385            }
386            assert!(seen.iter().all(|&b| b), "LF not surjective");
387        }
388    }
389
390    /// (e) C/Occ consistency: Occ monotone non-decreasing in i, C cumulative.
391    #[test]
392    fn c_and_occ_consistent() {
393        let mut rng = crate::handle::LcgRng::new(505);
394        for _ in 0..100 {
395            let len = 1 + rng.next_usize(30);
396            let s = random_bytes(&mut rng, b"abc", len);
397            let fm = FmIndex::new(&s).expect("non-empty");
398            let rows = fm.sa.len();
399
400            // C is cumulative (non-decreasing) and starts at 0.
401            assert_eq!(fm.c[SENTINEL], 0);
402            for sym in 1..SIGMA {
403                assert!(fm.c[sym] >= fm.c[sym - 1], "C not cumulative");
404            }
405            // C[max] + (count of max sym) == rows; equivalently the last bucket
406            // boundary plus its size equals the total number of rows.
407            assert!(*fm.c.last().expect("non-empty C") <= rows);
408
409            // Occ monotone in i for every symbol, and Occ(sym, rows) totals to
410            // the count of sym, whose sum over symbols equals rows.
411            let mut total = 0usize;
412            for sym in 0..SIGMA {
413                for i in 0..rows {
414                    assert!(fm.occ(sym, i + 1) >= fm.occ(sym, i), "Occ not monotone");
415                }
416                total += fm.occ(sym, rows);
417            }
418            assert_eq!(total, rows, "Occ totals must sum to row count");
419
420            // Cross-check C against Occ at the end: C[sym] equals the number of
421            // BWT symbols strictly less than sym, i.e. Σ_{k<sym} Occ(k, rows).
422            let mut acc = 0usize;
423            for sym in 0..SIGMA {
424                assert_eq!(fm.c[sym], acc, "C vs Occ mismatch at {sym}");
425                acc += fm.occ(sym, rows);
426            }
427        }
428    }
429
430    /// (f) A sentinel-terminated string round-trips (text containing the byte we
431    /// later render the sentinel as — the internal sentinel is distinct, so this
432    /// still works).
433    #[test]
434    fn sentinel_rendering_does_not_collide() {
435        // Use a text that contains byte 0 to prove the internal sentinel is a
436        // separate symbol from any data byte (the BWT is over symbol codes).
437        let s = &[0u8, 1u8, 0u8, 2u8, 0u8];
438        let fm = FmIndex::new(s).expect("non-empty");
439        assert_eq!(fm.inverse_bwt(), s, "round-trip with embedded zero bytes");
440        assert_eq!(fm.count(&[0u8]), 3);
441        assert_eq!(fm.locate(&[0u8]), vec![0, 2, 4]);
442        assert_eq!(fm.count(&[0u8, 0u8]), 0); // no adjacent zeros
443
444        // The rendered BWT with '$' as a placeholder has the sentinel exactly
445        // once even though byte 0 occurs three times in the source.
446        let rendered = fm.bwt_bytes(b'$');
447        assert_eq!(rendered.iter().filter(|&&b| b == b'$').count(), 1);
448    }
449
450    /// Empty input is rejected.
451    #[test]
452    fn empty_input_errors() {
453        assert!(matches!(FmIndex::new(b""), Err(SeqError::EmptyInput)));
454    }
455
456    /// The BWT of "banana" is the textbook "annb$aa".
457    #[test]
458    fn banana_bwt_textbook() {
459        let fm = FmIndex::new(b"banana").expect("non-empty");
460        let rendered = fm.bwt_bytes(b'$');
461        assert_eq!(rendered.as_slice(), b"annb$aa");
462    }
463}