use crate::error::{SeqError, SeqResult};
use crate::string::SuffixArray;
const SIGMA: usize = 257;
const SENTINEL: usize = 0;
fn code_of(byte: u8) -> usize {
byte as usize + 1
}
#[derive(Debug, Clone)]
pub struct FmIndex {
bwt: Vec<usize>,
sa: Vec<usize>,
c: Vec<usize>,
occ: Vec<Vec<usize>>,
text_len: usize,
}
impl FmIndex {
pub fn new(s: &[u8]) -> SeqResult<Self> {
if s.is_empty() {
return Err(SeqError::EmptyInput);
}
let n = s.len();
let sa_no_sentinel = SuffixArray::new(s)?;
let sa_t = sa_no_sentinel.sa();
let mut sa: Vec<usize> = Vec::with_capacity(n + 1);
sa.push(n); sa.extend_from_slice(sa_t);
let mut bwt = vec![0usize; n + 1];
for (i, &p) in sa.iter().enumerate() {
bwt[i] = if p == 0 {
SENTINEL } else {
code_of(s[p - 1])
};
}
let mut counts = vec![0usize; SIGMA];
counts[SENTINEL] += 1; for &b in s {
counts[code_of(b)] += 1;
}
let mut c = vec![0usize; SIGMA];
let mut acc = 0usize;
for sym in 0..SIGMA {
c[sym] = acc;
acc += counts[sym];
}
let mut occ = vec![vec![0usize; n + 2]; SIGMA];
for i in 0..=n {
let sym = bwt[i];
for s_idx in 0..SIGMA {
occ[s_idx][i + 1] = occ[s_idx][i];
}
occ[sym][i + 1] += 1;
}
Ok(Self {
bwt,
sa,
c,
occ,
text_len: n,
})
}
pub fn text_len(&self) -> usize {
self.text_len
}
pub fn bwt_bytes(&self, sentinel_byte: u8) -> Vec<u8> {
self.bwt
.iter()
.map(|&sym| {
if sym == SENTINEL {
sentinel_byte
} else {
(sym - 1) as u8
}
})
.collect()
}
fn occ(&self, sym: usize, i: usize) -> usize {
self.occ[sym][i]
}
fn lf(&self, i: usize) -> usize {
let sym = self.bwt[i];
self.c[sym] + self.occ(sym, i)
}
pub fn inverse_bwt(&self) -> Vec<u8> {
let n = self.text_len;
let mut out = vec![0u8; n];
let mut row = 0usize;
for k in (0..n).rev() {
let sym = self.bwt[row];
out[k] = (sym - 1) as u8;
row = self.lf(row);
}
out
}
fn backward_search(&self, pattern: &[u8]) -> Option<(usize, usize)> {
if pattern.is_empty() {
return None;
}
let mut lo = 0usize;
let mut hi = self.sa.len(); for &b in pattern.iter().rev() {
let sym = code_of(b);
lo = self.c[sym] + self.occ(sym, lo);
hi = self.c[sym] + self.occ(sym, hi);
if lo >= hi {
return Some((lo, lo)); }
}
Some((lo, hi))
}
pub fn count(&self, pattern: &[u8]) -> usize {
match self.backward_search(pattern) {
Some((lo, hi)) => hi - lo,
None => 0,
}
}
pub fn locate(&self, pattern: &[u8]) -> Vec<usize> {
match self.backward_search(pattern) {
Some((lo, hi)) if lo < hi => {
let mut positions: Vec<usize> = self.sa[lo..hi].to_vec();
positions.sort_unstable();
positions
}
_ => Vec::new(),
}
}
}
#[cfg(test)]
mod tests {
use super::*;
fn naive_search(p: &[u8], t: &[u8]) -> Vec<usize> {
let (m, n) = (p.len(), t.len());
if m == 0 || m > n {
return Vec::new();
}
(0..=(n - m)).filter(|&i| &t[i..i + m] == p).collect()
}
fn random_bytes(rng: &mut crate::handle::LcgRng, alphabet: &[u8], len: usize) -> Vec<u8> {
(0..len)
.map(|_| alphabet[rng.next_usize(alphabet.len())])
.collect()
}
#[test]
fn bwt_round_trips() {
for s in [
b"banana".as_slice(),
b"mississippi",
b"abracadabra",
b"aaaa",
b"a",
b"the quick brown fox",
] {
let fm = FmIndex::new(s).expect("non-empty");
assert_eq!(fm.inverse_bwt(), s, "round-trip for {s:?}");
}
let mut rng = crate::handle::LcgRng::new(101);
for &alphabet in &[b"a".as_slice(), b"ab", b"abc", b"abcd"] {
for _ in 0..400 {
let len = 1 + rng.next_usize(40);
let s = random_bytes(&mut rng, alphabet, len);
let fm = FmIndex::new(&s).expect("non-empty");
assert_eq!(fm.inverse_bwt(), s, "round-trip for {s:?}");
}
}
}
#[test]
fn count_matches_naive() {
let fm = FmIndex::new(b"mississippi").expect("non-empty");
assert_eq!(fm.count(b"issi"), 2);
assert_eq!(fm.count(b"ss"), 2);
assert_eq!(fm.count(b"i"), 4);
assert_eq!(fm.count(b"mississippi"), 1);
assert_eq!(fm.count(b"xyz"), 0); assert_eq!(fm.count(b"ppp"), 0); assert_eq!(fm.count(b""), 0);
let mut rng = crate::handle::LcgRng::new(202);
for &alphabet in &[b"ab".as_slice(), b"abc"] {
for _ in 0..400 {
let tlen = 1 + rng.next_usize(40);
let plen = 1 + rng.next_usize(5);
let t = random_bytes(&mut rng, alphabet, tlen);
let p = random_bytes(&mut rng, alphabet, plen);
let fm = FmIndex::new(&t).expect("non-empty");
assert_eq!(fm.count(&p), naive_search(&p, &t).len(), "p={p:?} t={t:?}");
}
}
}
#[test]
fn locate_matches_naive() {
let fm = FmIndex::new(b"banana").expect("non-empty");
assert_eq!(fm.locate(b"ana"), vec![1, 3]);
assert_eq!(fm.locate(b"a"), vec![1, 3, 5]);
assert_eq!(fm.locate(b"na"), vec![2, 4]);
assert!(fm.locate(b"xyz").is_empty());
assert!(fm.locate(b"").is_empty());
let mut rng = crate::handle::LcgRng::new(303);
for &alphabet in &[b"ab".as_slice(), b"abc"] {
for _ in 0..400 {
let tlen = 1 + rng.next_usize(40);
let plen = 1 + rng.next_usize(5);
let t = random_bytes(&mut rng, alphabet, tlen);
let p = random_bytes(&mut rng, alphabet, plen);
let fm = FmIndex::new(&t).expect("non-empty");
let mut want = naive_search(&p, &t);
want.sort_unstable();
assert_eq!(fm.locate(&p), want, "p={p:?} t={t:?}");
}
}
}
#[test]
fn lf_is_permutation() {
let mut rng = crate::handle::LcgRng::new(404);
for _ in 0..300 {
let len = 1 + rng.next_usize(30);
let s = random_bytes(&mut rng, b"abc", len);
let fm = FmIndex::new(&s).expect("non-empty");
let rows = fm.sa.len(); let mut seen = vec![false; rows];
for i in 0..rows {
let target = fm.lf(i);
assert!(target < rows, "LF out of range");
assert!(!seen[target], "LF not injective at {i}");
seen[target] = true;
}
assert!(seen.iter().all(|&b| b), "LF not surjective");
}
}
#[test]
fn c_and_occ_consistent() {
let mut rng = crate::handle::LcgRng::new(505);
for _ in 0..100 {
let len = 1 + rng.next_usize(30);
let s = random_bytes(&mut rng, b"abc", len);
let fm = FmIndex::new(&s).expect("non-empty");
let rows = fm.sa.len();
assert_eq!(fm.c[SENTINEL], 0);
for sym in 1..SIGMA {
assert!(fm.c[sym] >= fm.c[sym - 1], "C not cumulative");
}
assert!(*fm.c.last().expect("non-empty C") <= rows);
let mut total = 0usize;
for sym in 0..SIGMA {
for i in 0..rows {
assert!(fm.occ(sym, i + 1) >= fm.occ(sym, i), "Occ not monotone");
}
total += fm.occ(sym, rows);
}
assert_eq!(total, rows, "Occ totals must sum to row count");
let mut acc = 0usize;
for sym in 0..SIGMA {
assert_eq!(fm.c[sym], acc, "C vs Occ mismatch at {sym}");
acc += fm.occ(sym, rows);
}
}
}
#[test]
fn sentinel_rendering_does_not_collide() {
let s = &[0u8, 1u8, 0u8, 2u8, 0u8];
let fm = FmIndex::new(s).expect("non-empty");
assert_eq!(fm.inverse_bwt(), s, "round-trip with embedded zero bytes");
assert_eq!(fm.count(&[0u8]), 3);
assert_eq!(fm.locate(&[0u8]), vec![0, 2, 4]);
assert_eq!(fm.count(&[0u8, 0u8]), 0);
let rendered = fm.bwt_bytes(b'$');
assert_eq!(rendered.iter().filter(|&&b| b == b'$').count(), 1);
}
#[test]
fn empty_input_errors() {
assert!(matches!(FmIndex::new(b""), Err(SeqError::EmptyInput)));
}
#[test]
fn banana_bwt_textbook() {
let fm = FmIndex::new(b"banana").expect("non-empty");
let rendered = fm.bwt_bytes(b'$');
assert_eq!(rendered.as_slice(), b"annb$aa");
}
}