#[derive(Debug, Clone)]
pub struct SuffixArray {
pub sa: Vec<usize>,
pub isa: Vec<usize>,
pub lcp: Vec<usize>,
text_len: usize,
}
impl SuffixArray {
pub fn build(text: &[u8]) -> Self {
let n = text.len();
if n == 0 {
return SuffixArray {
sa: Vec::new(),
isa: Vec::new(),
lcp: Vec::new(),
text_len: 0,
};
}
let mut sa: Vec<usize> = (0..n).collect();
let mut rank: Vec<i64> = text.iter().map(|&b| b as i64).collect();
sa.sort_by_key(|&i| rank[i]);
let mut gap = 1usize;
while gap < n {
let cur_rank = rank.clone();
let second_rank = |i: usize| -> i64 {
if i + gap < n {
cur_rank[i + gap]
} else {
-1
}
};
sa.sort_by(|&a, &b| {
let ka = (cur_rank[a], second_rank(a));
let kb = (cur_rank[b], second_rank(b));
ka.cmp(&kb)
});
let mut new_rank = vec![0i64; n];
new_rank[sa[0]] = 0;
for i in 1..n {
let prev = sa[i - 1];
let cur = sa[i];
let same = cur_rank[prev] == cur_rank[cur]
&& second_rank(prev) == second_rank(cur);
new_rank[cur] = new_rank[prev] + if same { 0 } else { 1 };
}
rank = new_rank;
if rank[sa[n - 1]] == (n as i64 - 1) {
break;
}
gap <<= 1;
}
let mut isa = vec![0usize; n];
for (i, &s) in sa.iter().enumerate() {
isa[s] = i;
}
let lcp = Self::build_lcp(text, &sa, &isa);
SuffixArray {
sa,
isa,
lcp,
text_len: n,
}
}
pub fn build_str(text: &str) -> Self {
Self::build(text.as_bytes())
}
fn build_lcp(text: &[u8], sa: &[usize], isa: &[usize]) -> Vec<usize> {
let n = text.len();
let mut lcp = vec![0usize; n];
let mut h = 0usize;
for i in 0..n {
if isa[i] > 0 {
let j = sa[isa[i] - 1];
while i + h < n && j + h < n && text[i + h] == text[j + h] {
h += 1;
}
lcp[isa[i]] = h;
if h > 0 {
h -= 1;
}
}
}
lcp
}
pub fn search(&self, text: &[u8], pattern: &[u8]) -> Option<(usize, usize)> {
if pattern.is_empty() || self.text_len == 0 {
return None;
}
let n = self.text_len;
let m = pattern.len();
let lo = {
let mut low = 0usize;
let mut high = n;
while low < high {
let mid = low + (high - low) / 2;
let start = self.sa[mid];
let end = (start + m).min(n);
if text[start..end] < *pattern {
low = mid + 1;
} else {
high = mid;
}
}
low
};
let hi = {
let mut low = lo;
let mut high = n;
while low < high {
let mid = low + (high - low) / 2;
let start = self.sa[mid];
let end = (start + m).min(n);
if text[start..end] <= *pattern {
low = mid + 1;
} else {
high = mid;
}
}
low
};
if lo < hi {
Some((lo, hi))
} else {
None
}
}
pub fn count_occurrences(&self, text: &[u8], pattern: &[u8]) -> usize {
self.search(text, pattern)
.map(|(lo, hi)| hi - lo)
.unwrap_or(0)
}
pub fn find_all(&self, text: &[u8], pattern: &[u8]) -> Vec<usize> {
match self.search(text, pattern) {
Some((lo, hi)) => {
let mut positions: Vec<usize> = self.sa[lo..hi].to_vec();
positions.sort_unstable();
positions
}
None => Vec::new(),
}
}
pub fn longest_repeated_substring<'a>(&self, text: &'a [u8]) -> &'a [u8] {
if self.lcp.is_empty() {
return &text[0..0];
}
let (max_lcp, max_i) = self
.lcp
.iter()
.enumerate()
.max_by_key(|&(_, &v)| v)
.map(|(i, &v)| (v, i))
.unwrap_or((0, 0));
if max_lcp == 0 {
return &text[0..0];
}
let start = self.sa[max_i];
&text[start..start + max_lcp]
}
pub fn longest_common_substring<'a>(text1: &'a [u8], text2: &[u8]) -> &'a [u8] {
if text1.is_empty() || text2.is_empty() {
return &text1[0..0];
}
let sep = 0x01u8;
let mut combined = Vec::with_capacity(text1.len() + 1 + text2.len());
combined.extend_from_slice(text1);
combined.push(sep);
combined.extend_from_slice(text2);
let n1 = text1.len();
let sa = SuffixArray::build(&combined);
let mut best_len = 0usize;
let mut best_start = 0usize;
let n = sa.sa.len();
for i in 1..n {
let lcp_val = sa.lcp[i];
if lcp_val == 0 {
continue;
}
let a = sa.sa[i - 1];
let b = sa.sa[i];
let a_in_t1 = a < n1;
let b_in_t1 = b < n1;
if a_in_t1 != b_in_t1 && lcp_val > best_len {
let start1 = if a_in_t1 { a } else { b };
if start1 + lcp_val <= n1 {
best_len = lcp_val;
best_start = start1;
}
}
}
&text1[best_start..best_start + best_len]
}
}
pub fn bwt_encode(text: &[u8]) -> (Vec<u8>, usize) {
let n = text.len();
if n == 0 {
return (Vec::new(), 0);
}
let sa = SuffixArray::build(text);
let mut bwt = Vec::with_capacity(n);
let mut original_pos = 0usize;
for (i, &start) in sa.sa.iter().enumerate() {
if start == 0 {
bwt.push(text[n - 1]);
original_pos = i;
} else {
bwt.push(text[start - 1]);
}
}
(bwt, original_pos)
}
pub fn bwt_decode(bwt: &[u8], original_pos: usize) -> Vec<u8> {
let n = bwt.len();
if n == 0 {
return Vec::new();
}
let mut sorted = bwt.to_vec();
sorted.sort_unstable();
let mut count = [0usize; 256];
for &b in bwt {
count[b as usize] += 1;
}
let mut first_occ = [0usize; 256];
let mut total = 0usize;
for (b, c) in count.iter().enumerate() {
first_occ[b] = total;
total += c;
}
let mut byte_rank = [0usize; 256]; let mut lf = vec![0usize; n];
for (i, &b) in bwt.iter().enumerate() {
lf[i] = first_occ[b as usize] + byte_rank[b as usize];
byte_rank[b as usize] += 1;
}
let mut result = vec![0u8; n];
let mut row = original_pos;
for slot in result.iter_mut().rev() {
*slot = bwt[row];
row = lf[row];
}
result
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_sa_build_banana() {
let sa = SuffixArray::build(b"banana");
assert_eq!(sa.sa, vec![5, 3, 1, 0, 4, 2]);
}
#[test]
fn test_sa_inverse() {
let sa = SuffixArray::build(b"banana");
for (i, &s) in sa.sa.iter().enumerate() {
assert_eq!(sa.isa[s], i);
}
}
#[test]
fn test_sa_lcp_banana() {
let sa = SuffixArray::build(b"banana");
assert_eq!(sa.lcp[0], 0);
assert_eq!(sa.lcp[1], 1); assert_eq!(sa.lcp[2], 3); assert_eq!(sa.lcp[3], 0); assert_eq!(sa.lcp[4], 0); assert_eq!(sa.lcp[5], 2); }
#[test]
fn test_sa_search_found() {
let text = b"banana";
let sa = SuffixArray::build(text);
let range = sa.search(text, b"ana");
assert!(range.is_some());
}
#[test]
fn test_sa_search_not_found() {
let text = b"banana";
let sa = SuffixArray::build(text);
assert!(sa.search(text, b"xyz").is_none());
}
#[test]
fn test_sa_count_occurrences() {
let text = b"banana";
let sa = SuffixArray::build(text);
assert_eq!(sa.count_occurrences(text, b"ana"), 2);
assert_eq!(sa.count_occurrences(text, b"na"), 2);
assert_eq!(sa.count_occurrences(text, b"banana"), 1);
}
#[test]
fn test_sa_find_all() {
let text = b"banana";
let sa = SuffixArray::build(text);
let mut pos = sa.find_all(text, b"ana");
pos.sort_unstable();
assert_eq!(pos, vec![1, 3]);
}
#[test]
fn test_sa_empty_text() {
let sa = SuffixArray::build(b"");
assert!(sa.sa.is_empty());
}
#[test]
fn test_sa_single_char() {
let text = b"aaaa";
let sa = SuffixArray::build(text);
assert_eq!(sa.count_occurrences(text, b"a"), 4);
assert_eq!(sa.count_occurrences(text, b"aa"), 3);
}
#[test]
fn test_sa_longest_repeated_substring() {
let text = b"banana";
let sa = SuffixArray::build(text);
let lrs = sa.longest_repeated_substring(text);
assert_eq!(lrs, b"ana");
}
#[test]
fn test_sa_longest_common_substring() {
let lcs = SuffixArray::longest_common_substring(b"xabcy", b"zabc");
assert_eq!(lcs, b"abc");
}
#[test]
fn test_sa_lcs_empty() {
let lcs = SuffixArray::longest_common_substring(b"abc", b"xyz");
assert_eq!(lcs, b"");
}
#[test]
fn test_sa_lcs_full_overlap() {
let text = b"hello";
let lcs = SuffixArray::longest_common_substring(text, text);
assert_eq!(lcs, b"hello");
}
#[test]
fn test_bwt_encode_decode_banana() {
let (bwt, pos) = bwt_encode(b"banana");
let decoded = bwt_decode(&bwt, pos);
assert_eq!(decoded, b"banana");
}
#[test]
fn test_bwt_encode_decode_empty() {
let (bwt, pos) = bwt_encode(b"");
let decoded = bwt_decode(&bwt, pos);
assert_eq!(decoded, b"");
}
#[test]
fn test_bwt_encode_decode_single() {
let (bwt, pos) = bwt_encode(b"a");
let decoded = bwt_decode(&bwt, pos);
assert_eq!(decoded, b"a");
}
#[test]
fn test_bwt_encode_decode_repeated() {
let original = b"aaaaaa";
let (bwt, pos) = bwt_encode(original);
let decoded = bwt_decode(&bwt, pos);
assert_eq!(decoded, original.as_ref());
}
#[test]
fn test_bwt_encode_decode_general() {
let original = b"mississippi";
let (bwt, pos) = bwt_encode(original);
let decoded = bwt_decode(&bwt, pos);
assert_eq!(decoded, original.as_ref());
}
}