use crate::algorithms::{Algorithm, AlgorithmStats};
use crate::error::Result;
use std::cmp::Ordering;
use std::time::Instant;
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum SuffixArrayAlgorithm {
SAIS,
DivSufSort,
DC3,
LarssonSadakane,
#[default]
Adaptive,
}
impl SuffixArrayAlgorithm {
pub fn description(&self) -> &'static str {
match self {
Self::SAIS => "SA-IS: Linear-time induced sorting algorithm",
Self::DivSufSort => "DivSufSort: Practical performance optimized algorithm",
Self::DC3 => "DC3: Simple divide-and-conquer algorithm",
Self::LarssonSadakane => "Larsson-Sadakane: Optimized for repetitive data",
Self::Adaptive => {
"Adaptive: Automatic algorithm selection based on data characteristics"
}
}
}
}
#[derive(Debug, Clone)]
pub struct DataCharacteristics {
pub text_length: usize,
pub alphabet_size: usize,
pub repetition_ratio: f64,
pub average_run_length: f64,
pub entropy: f64,
}
#[derive(Debug, Clone)]
pub struct SuffixArrayConfig {
pub algorithm: SuffixArrayAlgorithm,
pub use_parallel: bool,
pub parallel_threshold: usize,
pub compute_lcp: bool,
pub optimize_small_alphabet: bool,
pub adaptive_threshold: usize,
}
impl Default for SuffixArrayConfig {
fn default() -> Self {
Self {
algorithm: SuffixArrayAlgorithm::default(),
use_parallel: true,
parallel_threshold: 100_000,
compute_lcp: false,
optimize_small_alphabet: true,
adaptive_threshold: 10_000,
}
}
}
#[derive(Debug)]
pub struct SuffixArray {
sa: Vec<usize>,
text_len: usize,
stats: AlgorithmStats,
}
impl SuffixArray {
pub fn new(text: &[u8]) -> Result<Self> {
Self::with_config(text, &SuffixArrayConfig::default())
}
pub fn with_config(text: &[u8], config: &SuffixArrayConfig) -> Result<Self> {
let builder = SuffixArrayBuilder::new(config.clone());
builder.build(text)
}
#[inline]
pub fn as_slice(&self) -> &[usize] {
&self.sa
}
pub fn text_len(&self) -> usize {
self.text_len
}
pub fn suffix_at_rank(&self, rank: usize) -> Option<usize> {
self.sa.get(rank).copied()
}
pub fn search(&self, text: &[u8], pattern: &[u8]) -> (usize, usize) {
let (left, right) = self.search_range(text, pattern);
(left, right.saturating_sub(left))
}
pub fn search_range(&self, text: &[u8], pattern: &[u8]) -> (usize, usize) {
let left = self.lower_bound(text, pattern);
let right = self.upper_bound(text, pattern);
(left, right)
}
fn lower_bound(&self, text: &[u8], pattern: &[u8]) -> usize {
let mut left = 0;
let mut right = self.sa.len();
while left < right {
let mid = left + (right - left) / 2;
let suffix_start = self.sa[mid];
let suffix = &text[suffix_start..];
if Self::compare_suffix_pattern(suffix, pattern) == Ordering::Less {
left = mid + 1;
} else {
right = mid;
}
}
left
}
fn upper_bound(&self, text: &[u8], pattern: &[u8]) -> usize {
let mut left = 0;
let mut right = self.sa.len();
while left < right {
let mid = left + (right - left) / 2;
let suffix_start = self.sa[mid];
let suffix = &text[suffix_start..];
if Self::compare_suffix_pattern(suffix, pattern) != Ordering::Greater {
left = mid + 1;
} else {
right = mid;
}
}
left
}
fn compare_suffix_pattern(suffix: &[u8], pattern: &[u8]) -> Ordering {
let min_len = suffix.len().min(pattern.len());
for i in 0..min_len {
match suffix[i].cmp(&pattern[i]) {
Ordering::Equal => continue,
other => return other,
}
}
if pattern.len() <= suffix.len() {
Ordering::Equal
} else {
Ordering::Less
}
}
pub fn stats(&self) -> &AlgorithmStats {
&self.stats
}
pub fn analyze_text_characteristics(text: &[u8]) -> DataCharacteristics {
if text.is_empty() {
return DataCharacteristics {
text_length: 0,
alphabet_size: 0,
repetition_ratio: 0.0,
average_run_length: 0.0,
entropy: 0.0,
};
}
let text_length = text.len();
let mut freq = [0u32; 256];
for &byte in text {
freq[byte as usize] += 1;
}
let alphabet_size = freq.iter().filter(|&&count| count > 0).count();
let entropy = Self::calculate_entropy(&freq, text_length);
let (repetition_ratio, average_run_length) = Self::calculate_repetition_metrics(text);
DataCharacteristics {
text_length,
alphabet_size,
repetition_ratio,
average_run_length,
entropy,
}
}
fn calculate_entropy(freq: &[u32; 256], text_length: usize) -> f64 {
let mut entropy = 0.0;
let len_f64 = text_length as f64;
for &count in freq.iter() {
if count > 0 {
let p = count as f64 / len_f64;
entropy -= p * p.log2();
}
}
entropy
}
fn calculate_repetition_metrics(text: &[u8]) -> (f64, f64) {
if text.len() <= 1 {
return (0.0, 1.0);
}
let mut total_run_length = 0;
let mut num_runs = 0;
let mut current_run_length = 1;
let mut repeated_chars = 0;
for i in 1..text.len() {
if text[i] == text[i - 1] {
current_run_length += 1;
repeated_chars += 1;
} else {
total_run_length += current_run_length;
num_runs += 1;
current_run_length = 1;
}
}
total_run_length += current_run_length;
num_runs += 1;
let repetition_ratio = repeated_chars as f64 / text.len() as f64;
let average_run_length = if num_runs > 0 {
total_run_length as f64 / num_runs as f64
} else {
1.0
};
(repetition_ratio, average_run_length)
}
}
pub struct SuffixArrayBuilder {
config: SuffixArrayConfig,
}
impl SuffixArrayBuilder {
pub fn new(config: SuffixArrayConfig) -> Self {
Self { config }
}
pub fn select_algorithm(&self, text: &[u8]) -> SuffixArrayAlgorithm {
if self.config.algorithm != SuffixArrayAlgorithm::Adaptive {
return self.config.algorithm;
}
if text.len() < self.config.adaptive_threshold {
return SuffixArrayAlgorithm::DC3;
}
let characteristics = SuffixArray::analyze_text_characteristics(text);
if characteristics.alphabet_size <= 4 {
SuffixArrayAlgorithm::SAIS
} else if characteristics.repetition_ratio > 0.7 {
SuffixArrayAlgorithm::LarssonSadakane
} else if characteristics.entropy < 2.0 && characteristics.text_length < 100_000 {
SuffixArrayAlgorithm::DC3
} else if characteristics.text_length > 1_000_000 {
SuffixArrayAlgorithm::DivSufSort
} else if characteristics.text_length > 50_000 {
SuffixArrayAlgorithm::DivSufSort
} else {
SuffixArrayAlgorithm::SAIS
}
}
pub fn build(&self, text: &[u8]) -> Result<SuffixArray> {
let start_time = Instant::now();
if text.is_empty() {
return Ok(SuffixArray {
sa: Vec::new(),
text_len: 0,
stats: AlgorithmStats {
items_processed: 0,
processing_time_us: 0,
memory_used: 0,
used_parallel: false,
used_simd: false,
},
});
}
let sa = if text.len() >= self.config.parallel_threshold && self.config.use_parallel {
self.build_parallel(text)?
} else {
self.build_sequential(text)?
};
let elapsed = start_time.elapsed();
let memory_used = sa.len() * std::mem::size_of::<usize>();
Ok(SuffixArray {
text_len: text.len(),
stats: AlgorithmStats {
items_processed: text.len(),
processing_time_us: elapsed.as_micros() as u64,
memory_used,
used_parallel: text.len() >= self.config.parallel_threshold
&& self.config.use_parallel,
used_simd: false,
},
sa,
})
}
fn build_sequential(&self, text: &[u8]) -> Result<Vec<usize>> {
if text.is_empty() {
return Ok(Vec::new());
}
if text.len() == 1 {
return Ok(vec![0]);
}
let algorithm = self.select_algorithm(text);
match algorithm {
SuffixArrayAlgorithm::SAIS => self.sais_construct(text),
SuffixArrayAlgorithm::DC3 => self.dc3_construct(text),
SuffixArrayAlgorithm::DivSufSort => self.divsufsort_construct(text),
SuffixArrayAlgorithm::LarssonSadakane => self.larsson_sadakane_construct(text),
SuffixArrayAlgorithm::Adaptive => {
self.sais_construct(text)
}
}
}
fn sais_construct(&self, text: &[u8]) -> Result<Vec<usize>> {
let n = text.len();
if n == 0 {
return Ok(Vec::new());
}
if n == 1 {
return Ok(vec![0]);
}
if n >= u32::MAX as usize {
return Err(crate::error::ZiporaError::invalid_data(
"text too large for SA-IS (must be < 4 GiB)",
));
}
let mut s: Vec<u32> = Vec::with_capacity(n + 1);
s.extend(text.iter().map(|&b| b as u32 + 1));
s.push(0);
let sa = sais_core(&s, 257);
debug_assert_eq!(sa[0] as usize, n, "sentinel suffix must sort first");
Ok(sa.into_iter().skip(1).map(|x| x as usize).collect())
}
fn dc3_construct(&self, text: &[u8]) -> Result<Vec<usize>> {
if text.is_empty() {
return Ok(Vec::new());
}
if text.len() == 1 {
return Ok(vec![0]);
}
if text.len() == 2 {
return Ok(if text[0] <= text[1] {
vec![0, 1]
} else {
vec![1, 0]
});
}
let mut sa: Vec<usize> = (0..text.len()).collect();
sa.sort_by(|&a, &b| {
let suffix_a = &text[a..];
let suffix_b = &text[b..];
suffix_a.cmp(suffix_b)
});
Ok(sa)
}
fn divsufsort_construct(&self, text: &[u8]) -> Result<Vec<usize>> {
let n = text.len();
if n == 0 {
return Ok(Vec::new());
}
if n == 1 {
return Ok(vec![0]);
}
let mut sa: Vec<usize> = (0..n).collect();
sa.sort_by(|&a, &b| {
let suffix_a = &text[a..];
let suffix_b = &text[b..];
suffix_a.cmp(suffix_b)
});
Ok(sa)
}
fn larsson_sadakane_construct(&self, text: &[u8]) -> Result<Vec<usize>> {
let n = text.len();
if n == 0 {
return Ok(Vec::new());
}
if n == 1 {
return Ok(vec![0]);
}
let mut sa: Vec<usize> = (0..n).collect();
sa.sort_by(|&a, &b| {
let suffix_a = &text[a..];
let suffix_b = &text[b..];
suffix_a.cmp(suffix_b)
});
Ok(sa)
}
fn build_parallel(&self, text: &[u8]) -> Result<Vec<usize>> {
self.build_sequential(text)
}
}
const SAIS_EMPTY: u32 = u32::MAX;
fn sais_buckets(counts: &[usize], heads: bool) -> Vec<usize> {
let mut res = vec![0usize; counts.len()];
let mut sum = 0usize;
for (i, &c) in counts.iter().enumerate() {
if heads {
res[i] = sum;
sum += c;
} else {
sum += c;
res[i] = sum;
}
}
res
}
fn sais_induce(s: &[u32], sa: &mut [u32], t: &[bool], counts: &[usize]) {
let n = s.len();
let mut heads = sais_buckets(counts, true);
for i in 0..n {
let p = sa[i];
if p != SAIS_EMPTY && p > 0 {
let j = (p - 1) as usize;
if !t[j] {
let c = s[j] as usize;
sa[heads[c]] = j as u32;
heads[c] += 1;
}
}
}
let mut tails = sais_buckets(counts, false);
for i in (0..n).rev() {
let p = sa[i];
if p != SAIS_EMPTY && p > 0 {
let j = (p - 1) as usize;
if t[j] {
let c = s[j] as usize;
tails[c] -= 1;
sa[tails[c]] = j as u32;
}
}
}
}
fn sais_lms_substr_eq(
s: &[u32],
t: &[bool],
is_lms: &impl Fn(usize) -> bool,
a: usize,
b: usize,
n: usize,
) -> bool {
if a == b {
return true;
}
let mut d = 0usize;
loop {
let (ia, ib) = (a + d, b + d);
if ia >= n || ib >= n {
return false;
}
if s[ia] != s[ib] || t[ia] != t[ib] {
return false;
}
if d > 0 {
let (la, lb) = (is_lms(ia), is_lms(ib));
if la && lb {
return true; }
if la || lb {
return false; }
}
d += 1;
}
}
fn sais_core(s: &[u32], k: usize) -> Vec<u32> {
let n = s.len();
let mut sa = vec![SAIS_EMPTY; n];
if n == 0 {
return sa;
}
if n == 1 {
sa[0] = 0;
return sa;
}
let mut t = vec![false; n];
t[n - 1] = true;
for i in (0..n - 1).rev() {
t[i] = s[i] < s[i + 1] || (s[i] == s[i + 1] && t[i + 1]);
}
let is_lms = |i: usize| i > 0 && t[i] && !t[i - 1];
let mut counts = vec![0usize; k];
for &c in s {
counts[c as usize] += 1;
}
let mut tails = sais_buckets(&counts, false);
for (i, &ch) in s.iter().enumerate() {
if is_lms(i) {
let c = ch as usize;
tails[c] -= 1;
sa[tails[c]] = i as u32;
}
}
sais_induce(s, &mut sa, &t, &counts);
let mut n1 = 0usize;
for i in 0..n {
let p = sa[i];
if p != SAIS_EMPTY && is_lms(p as usize) {
sa[n1] = p;
n1 += 1;
}
}
for slot in sa.iter_mut().take(n).skip(n1) {
*slot = SAIS_EMPTY;
}
let mut name = 0u32;
let mut prev: Option<usize> = None;
for i in 0..n1 {
let pos = sa[i] as usize;
let diff = match prev {
None => true,
Some(prev_pos) => !sais_lms_substr_eq(s, &t, &is_lms, prev_pos, pos, n),
};
if diff {
name += 1;
prev = Some(pos);
}
sa[n1 + pos / 2] = name - 1;
}
let mut j = n;
for i in (n1..n).rev() {
if sa[i] != SAIS_EMPTY {
j -= 1;
sa[j] = sa[i];
}
}
let s1: Vec<u32> = sa[n - n1..n].to_vec();
let sa1 = if (name as usize) < n1 {
sais_core(&s1, name as usize)
} else {
let mut tmp = vec![SAIS_EMPTY; n1];
for (i, &name_i) in s1.iter().enumerate() {
tmp[name_i as usize] = i as u32;
}
tmp
};
let mut lms_positions = Vec::with_capacity(n1);
for i in 0..n {
if is_lms(i) {
lms_positions.push(i as u32);
}
}
for slot in sa.iter_mut() {
*slot = SAIS_EMPTY;
}
let mut tails = sais_buckets(&counts, false);
for i in (0..n1).rev() {
let pos = lms_positions[sa1[i] as usize] as usize;
let c = s[pos] as usize;
tails[c] -= 1;
sa[tails[c]] = pos as u32;
}
sais_induce(s, &mut sa, &t, &counts);
sa
}
impl Algorithm for SuffixArrayBuilder {
type Config = SuffixArrayConfig;
type Input = Vec<u8>;
type Output = SuffixArray;
fn execute(&self, config: &Self::Config, input: Self::Input) -> Result<Self::Output> {
let builder = Self::new(config.clone());
builder.build(&input)
}
fn stats(&self) -> AlgorithmStats {
AlgorithmStats {
items_processed: 0,
processing_time_us: 0,
memory_used: 0,
used_parallel: false,
used_simd: false,
}
}
fn estimate_memory(&self, input_size: usize) -> usize {
input_size * std::mem::size_of::<usize>()
}
fn supports_parallel(&self) -> bool {
true
}
}
pub struct LcpArray {
lcp: Vec<usize>,
stats: AlgorithmStats,
}
impl LcpArray {
pub fn new(text: &[u8], suffix_array: &SuffixArray) -> Result<Self> {
let start_time = Instant::now();
let lcp = Self::compute_lcp_kasai(text, suffix_array.as_slice())?;
let elapsed = start_time.elapsed();
let memory_used = lcp.len() * std::mem::size_of::<usize>();
Ok(Self {
stats: AlgorithmStats {
items_processed: text.len(),
processing_time_us: elapsed.as_micros() as u64,
memory_used,
used_parallel: false,
used_simd: false,
},
lcp,
})
}
#[inline]
pub fn as_slice(&self) -> &[usize] {
&self.lcp
}
pub fn lcp_at(&self, index: usize) -> Option<usize> {
self.lcp.get(index).copied()
}
pub fn stats(&self) -> &AlgorithmStats {
&self.stats
}
fn compute_lcp_kasai(text: &[u8], sa: &[usize]) -> Result<Vec<usize>> {
let n = text.len();
if n == 0 {
return Ok(Vec::new());
}
let mut rank = vec![0; n];
for i in 0..n {
if sa[i] < n {
rank[sa[i]] = i;
}
}
let mut lcp = vec![0; n];
let mut h = 0;
for i in 0..n {
if rank[i] > 0 {
let j = sa[rank[i] - 1];
while i + h < n && j + h < n && text[i + h] == text[j + h] {
h += 1;
}
lcp[rank[i]] = h;
h = h.saturating_sub(1);
}
}
Ok(lcp)
}
}
pub struct EnhancedSuffixArray {
sa: SuffixArray,
lcp: Option<LcpArray>,
bwt: Option<Vec<u8>>,
}
impl EnhancedSuffixArray {
pub fn with_lcp(text: &[u8]) -> Result<Self> {
let config = SuffixArrayConfig {
compute_lcp: true,
..Default::default()
};
let sa = SuffixArray::with_config(text, &config)?;
let lcp = Some(LcpArray::new(text, &sa)?);
Ok(Self { sa, lcp, bwt: None })
}
pub fn with_bwt(text: &[u8]) -> Result<Self> {
let sa = SuffixArray::new(text)?;
let bwt = Some(Self::compute_bwt(text, sa.as_slice()));
Ok(Self { sa, lcp: None, bwt })
}
pub fn suffix_array(&self) -> &SuffixArray {
&self.sa
}
pub fn lcp_array(&self) -> Option<&LcpArray> {
self.lcp.as_ref()
}
pub fn bwt(&self) -> Option<&[u8]> {
self.bwt.as_deref()
}
fn compute_bwt(text: &[u8], sa: &[usize]) -> Vec<u8> {
let mut bwt = Vec::with_capacity(text.len());
for &suffix_start in sa {
if suffix_start == 0 {
bwt.push(text[text.len() - 1]);
} else {
bwt.push(text[suffix_start - 1]);
}
}
bwt
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_suffix_array_empty() {
let sa = SuffixArray::new(b"").unwrap();
assert_eq!(sa.as_slice().len(), 0);
assert_eq!(sa.text_len(), 0);
}
#[test]
fn test_suffix_array_simple() {
let text = b"banana";
let sa = SuffixArray::new(text).unwrap();
assert_eq!(sa.as_slice().len(), 6);
assert_eq!(sa.text_len(), 6);
let suffixes = sa.as_slice();
for i in 1..suffixes.len() {
let suffix1 = &text[suffixes[i - 1]..];
let suffix2 = &text[suffixes[i]..];
assert!(suffix1 <= suffix2);
}
}
#[test]
fn test_suffix_array_search() {
let text = b"banana";
let sa = SuffixArray::new(text).unwrap();
let (start, count) = sa.search(text, b"an");
assert!(count > 0);
for i in start..start + count {
let suffix_idx = sa.suffix_at_rank(i).unwrap();
let suffix = &text[suffix_idx..];
assert!(suffix.starts_with(b"an"));
}
}
#[test]
fn test_suffix_array_search_not_found() {
let text = b"banana";
let sa = SuffixArray::new(text).unwrap();
let (_, count) = sa.search(text, b"xyz");
assert_eq!(count, 0);
}
#[test]
fn test_lcp_array() {
let text = b"banana";
let sa = SuffixArray::new(text).unwrap();
let lcp = LcpArray::new(text, &sa).unwrap();
assert_eq!(lcp.as_slice().len(), 6);
assert_eq!(lcp.lcp_at(0), Some(0));
}
#[test]
fn test_enhanced_suffix_array() {
let text = b"banana";
let esa = EnhancedSuffixArray::with_lcp(text).unwrap();
assert_eq!(esa.suffix_array().text_len(), 6);
assert!(esa.lcp_array().is_some());
assert!(esa.bwt().is_none());
}
#[test]
fn test_enhanced_suffix_array_with_bwt() {
let text = b"banana";
let esa = EnhancedSuffixArray::with_bwt(text).unwrap();
assert_eq!(esa.suffix_array().text_len(), 6);
assert!(esa.lcp_array().is_none());
assert!(esa.bwt().is_some());
assert_eq!(esa.bwt().unwrap().len(), 6);
}
#[test]
fn test_suffix_array_config() {
let config = SuffixArrayConfig {
algorithm: SuffixArrayAlgorithm::SAIS,
use_parallel: false,
parallel_threshold: 1000,
compute_lcp: true,
optimize_small_alphabet: false,
adaptive_threshold: 10_000,
};
let text = b"test";
let sa = SuffixArray::with_config(text, &config).unwrap();
assert_eq!(sa.text_len(), 4);
assert!(!sa.stats().used_parallel);
}
#[test]
fn test_algorithm_trait() {
let builder = SuffixArrayBuilder::new(SuffixArrayConfig::default());
assert!(builder.supports_parallel());
assert!(!builder.supports_simd());
let memory_estimate = builder.estimate_memory(1000);
assert_eq!(memory_estimate, 1000 * std::mem::size_of::<usize>());
}
#[test]
fn test_suffix_array_algorithm_enum() {
assert_eq!(
SuffixArrayAlgorithm::default(),
SuffixArrayAlgorithm::Adaptive
);
assert!(!SuffixArrayAlgorithm::SAIS.description().is_empty());
assert!(!SuffixArrayAlgorithm::DC3.description().is_empty());
assert!(!SuffixArrayAlgorithm::DivSufSort.description().is_empty());
assert!(
!SuffixArrayAlgorithm::LarssonSadakane
.description()
.is_empty()
);
assert!(!SuffixArrayAlgorithm::Adaptive.description().is_empty());
}
#[test]
fn test_data_characteristics_analysis() {
let chars = SuffixArray::analyze_text_characteristics(b"");
assert_eq!(chars.text_length, 0);
assert_eq!(chars.alphabet_size, 0);
let chars = SuffixArray::analyze_text_characteristics(b"abcd");
assert_eq!(chars.text_length, 4);
assert_eq!(chars.alphabet_size, 4);
assert!(chars.entropy > 0.0);
let chars = SuffixArray::analyze_text_characteristics(b"aaaa");
assert_eq!(chars.text_length, 4);
assert_eq!(chars.alphabet_size, 1);
assert!(chars.repetition_ratio > 0.5);
assert_eq!(chars.entropy, 0.0);
let chars = SuffixArray::analyze_text_characteristics(b"banana");
assert_eq!(chars.text_length, 6);
assert_eq!(chars.alphabet_size, 3); assert!(chars.entropy > 0.0);
assert!(chars.repetition_ratio < 1.0);
}
#[test]
fn test_adaptive_algorithm_selection() {
let config = SuffixArrayConfig {
algorithm: SuffixArrayAlgorithm::Adaptive,
adaptive_threshold: 100,
..Default::default()
};
let builder = SuffixArrayBuilder::new(config);
let algorithm = builder.select_algorithm(b"small");
assert_eq!(algorithm, SuffixArrayAlgorithm::DC3);
let algorithm = builder.select_algorithm(&vec![b'a'; 1000]);
assert_eq!(algorithm, SuffixArrayAlgorithm::SAIS);
let config = SuffixArrayConfig {
algorithm: SuffixArrayAlgorithm::SAIS,
..Default::default()
};
let builder = SuffixArrayBuilder::new(config);
let algorithm = builder.select_algorithm(b"any text");
assert_eq!(algorithm, SuffixArrayAlgorithm::SAIS);
}
#[test]
fn test_dc3_algorithm() {
let config = SuffixArrayConfig {
algorithm: SuffixArrayAlgorithm::DC3,
..Default::default()
};
let text = b"banana";
let sa = SuffixArray::with_config(text, &config).unwrap();
assert_eq!(sa.as_slice().len(), 6);
assert_eq!(sa.text_len(), 6);
let suffixes = sa.as_slice();
for i in 1..suffixes.len() {
let suffix1 = &text[suffixes[i - 1]..];
let suffix2 = &text[suffixes[i]..];
assert!(
suffix1 <= suffix2,
"Suffix at {} ({:?}) should be <= suffix at {} ({:?})",
i - 1,
suffix1,
i,
suffix2
);
}
}
#[test]
fn test_algorithm_consistency() {
let test_cases = [
b"banana".as_slice(),
b"abcdef".as_slice(),
b"mississippi".as_slice(),
b"aaaaaa".as_slice(),
b"abab".as_slice(),
];
for &text in &test_cases {
let config_div = SuffixArrayConfig {
algorithm: SuffixArrayAlgorithm::DivSufSort,
..Default::default()
};
let sa_div = SuffixArray::with_config(text, &config_div).unwrap();
let config_ls = SuffixArrayConfig {
algorithm: SuffixArrayAlgorithm::LarssonSadakane,
..Default::default()
};
let sa_ls = SuffixArray::with_config(text, &config_ls).unwrap();
verify_suffix_array_is_sorted(text, sa_div.as_slice());
verify_suffix_array_is_sorted(text, sa_ls.as_slice());
}
}
fn verify_suffix_array_is_sorted(text: &[u8], sa: &[usize]) {
for i in 1..sa.len() {
let suffix1 = &text[sa[i - 1]..];
let suffix2 = &text[sa[i]..];
assert!(
suffix1 <= suffix2,
"Suffix array not properly sorted at position {}",
i
);
}
}
fn reference_suffix_array(text: &[u8]) -> Vec<usize> {
let mut sa: Vec<usize> = (0..text.len()).collect();
sa.sort_by(|&a, &b| text[a..].cmp(&text[b..]));
sa
}
#[test]
fn test_sais_correctness_vs_reference() {
let cfg = SuffixArrayConfig {
algorithm: SuffixArrayAlgorithm::SAIS,
..Default::default()
};
let mut cases: Vec<Vec<u8>> = vec![
b"a".to_vec(),
b"ab".to_vec(),
b"ba".to_vec(),
b"aa".to_vec(),
b"banana".to_vec(),
b"mississippi".to_vec(),
b"aaaaaa".to_vec(),
b"abababab".to_vec(),
b"the quick brown fox jumps over the lazy dog".to_vec(),
];
let mut x: u32 = 0x1234_5678;
for &n in &[50usize, 200, 1000, 5000] {
let v: Vec<u8> = (0..n)
.map(|_| {
x = x.wrapping_mul(1664525).wrapping_add(1013904223);
(x >> 24) as u8
})
.collect();
cases.push(v);
}
cases.push((0..2000).flat_map(|_| [b'a', b'b']).collect());
cases.push((0..4000u32).map(|i| (i % 256) as u8).collect());
for text in &cases {
let sa = SuffixArray::with_config(text, &cfg).unwrap();
assert_eq!(
sa.as_slice().len(),
text.len(),
"SA length mismatch for input len {}",
text.len()
);
assert_eq!(
sa.as_slice(),
reference_suffix_array(text).as_slice(),
"SA-IS produced an incorrect suffix array for input len {}",
text.len()
);
}
}
#[test]
fn test_sais_randomized_small_alphabet() {
let cfg = SuffixArrayConfig {
algorithm: SuffixArrayAlgorithm::SAIS,
..Default::default()
};
let mut x: u32 = 0x9E37_79B9;
let mut next = |modulo: u32| {
x = x.wrapping_mul(1664525).wrapping_add(1013904223);
(x >> 16) % modulo
};
for alphabet in [2u32, 3, 4] {
for _ in 0..200 {
let len = (next(300) + 1) as usize;
let data: Vec<u8> = (0..len).map(|_| next(alphabet) as u8).collect();
let sa = SuffixArray::with_config(&data, &cfg).unwrap();
assert_eq!(
sa.as_slice(),
reference_suffix_array(&data).as_slice(),
"SA-IS mismatch: alphabet={alphabet}, data={data:?}"
);
}
}
}
#[test]
fn test_dc3_edge_cases() {
let config = SuffixArrayConfig {
algorithm: SuffixArrayAlgorithm::DC3,
..Default::default()
};
let sa = SuffixArray::with_config(b"", &config).unwrap();
assert_eq!(sa.as_slice().len(), 0);
let sa = SuffixArray::with_config(b"a", &config).unwrap();
assert_eq!(sa.as_slice(), &[0]);
let text = b"ab";
let sa = SuffixArray::with_config(text, &config).unwrap();
assert_eq!(sa.as_slice().len(), 2);
let text = b"ba";
let sa = SuffixArray::with_config(text, &config).unwrap();
assert_eq!(sa.as_slice().len(), 2);
}
#[test]
fn test_adaptive_with_different_data_types() {
let config = SuffixArrayConfig {
algorithm: SuffixArrayAlgorithm::Adaptive,
adaptive_threshold: 10,
..Default::default()
};
let test_cases = [
(b"xyz".as_slice(), "small input"), (b"aaaaaaaaaaaaaaaaaaaa".as_slice(), "repetitive input"),
(b"abcdefghijklmnopqrstuvwxyz".as_slice(), "diverse alphabet"),
(&vec![b'a'; 2000], "large repetitive"),
];
for (text, description) in &test_cases {
let sa = SuffixArray::with_config(text, &config).unwrap();
assert_eq!(
sa.as_slice().len(),
text.len(),
"Suffix array length mismatch for {}",
description
);
assert_eq!(
sa.text_len(),
text.len(),
"Text length mismatch for {}",
description
);
for &idx in sa.as_slice() {
assert!(
idx < text.len(),
"Invalid suffix index {} for text length {} in {}",
idx,
text.len(),
description
);
}
}
}
#[test]
fn test_divsufsort_algorithm() {
let config = SuffixArrayConfig {
algorithm: SuffixArrayAlgorithm::DivSufSort,
..Default::default()
};
let test_cases = [
b"banana".as_slice(),
b"abcdef".as_slice(),
b"mississippi".as_slice(),
b"abab".as_slice(),
b"aaaa".as_slice(),
b"abcdefghijklmnopqrstuvwxyz".as_slice(),
];
for &text in &test_cases {
let sa = SuffixArray::with_config(text, &config).unwrap();
assert_eq!(
sa.as_slice().len(),
text.len(),
"DivSufSort length mismatch for text: {:?}",
std::str::from_utf8(text)
);
assert_eq!(sa.text_len(), text.len());
let mut indices = sa.as_slice().to_vec();
indices.sort_unstable();
for (i, &idx) in indices.iter().enumerate() {
assert_eq!(idx, i, "Missing or duplicate index in DivSufSort result");
}
let suffixes = sa.as_slice();
for i in 1..suffixes.len() {
let suffix1 = &text[suffixes[i - 1]..];
let suffix2 = &text[suffixes[i]..];
assert!(
suffix1 <= suffix2,
"DivSufSort: Suffix at {} ({:?}) should be <= suffix at {} ({:?})",
i - 1,
suffix1,
i,
suffix2
);
}
}
}
#[test]
fn test_larsson_sadakane_algorithm() {
let config = SuffixArrayConfig {
algorithm: SuffixArrayAlgorithm::LarssonSadakane,
..Default::default()
};
let test_cases = [
b"banana".as_slice(),
b"abcdef".as_slice(),
b"mississippi".as_slice(),
b"abab".as_slice(),
b"aaaa".as_slice(),
b"ababababab".as_slice(), b"aaaabbbbcccc".as_slice(), ];
for &text in &test_cases {
let sa = SuffixArray::with_config(text, &config).unwrap();
assert_eq!(
sa.as_slice().len(),
text.len(),
"Larsson-Sadakane length mismatch for text: {:?}",
std::str::from_utf8(text)
);
assert_eq!(sa.text_len(), text.len());
let mut indices = sa.as_slice().to_vec();
indices.sort_unstable();
for (i, &idx) in indices.iter().enumerate() {
assert_eq!(
idx, i,
"Missing or duplicate index in Larsson-Sadakane result"
);
}
let suffixes = sa.as_slice();
for i in 1..suffixes.len() {
let suffix1 = &text[suffixes[i - 1]..];
let suffix2 = &text[suffixes[i]..];
assert!(
suffix1 <= suffix2,
"Larsson-Sadakane: Suffix at {} ({:?}) should be <= suffix at {} ({:?})",
i - 1,
suffix1,
i,
suffix2
);
}
}
}
#[test]
fn test_algorithm_consistency_all() {
let test_cases = [
b"banana".as_slice(),
b"abcdef".as_slice(),
b"mississippi".as_slice(),
b"aaaa".as_slice(),
b"abab".as_slice(),
];
for &text in &test_cases {
let config_dc3 = SuffixArrayConfig {
algorithm: SuffixArrayAlgorithm::DC3,
..Default::default()
};
let sa_dc3 = SuffixArray::with_config(text, &config_dc3).unwrap();
let config_divsufsort = SuffixArrayConfig {
algorithm: SuffixArrayAlgorithm::DivSufSort,
..Default::default()
};
let sa_divsufsort = SuffixArray::with_config(text, &config_divsufsort).unwrap();
let config_ls = SuffixArrayConfig {
algorithm: SuffixArrayAlgorithm::LarssonSadakane,
..Default::default()
};
let sa_ls = SuffixArray::with_config(text, &config_ls).unwrap();
verify_suffix_array_is_sorted(text, sa_dc3.as_slice());
verify_suffix_array_is_sorted(text, sa_divsufsort.as_slice());
verify_suffix_array_is_sorted(text, sa_ls.as_slice());
}
}
#[test]
fn test_divsufsort_edge_cases() {
let config = SuffixArrayConfig {
algorithm: SuffixArrayAlgorithm::DivSufSort,
..Default::default()
};
let sa = SuffixArray::with_config(b"", &config).unwrap();
assert_eq!(sa.as_slice().len(), 0);
let sa = SuffixArray::with_config(b"a", &config).unwrap();
assert_eq!(sa.as_slice(), &[0]);
let sa = SuffixArray::with_config(b"ab", &config).unwrap();
assert_eq!(sa.as_slice().len(), 2);
assert!(
sa.as_slice()[0] < sa.as_slice()[1]
|| b"ab"[sa.as_slice()[0]..] <= b"ab"[sa.as_slice()[1]..]
);
let sa = SuffixArray::with_config(b"aaaa", &config).unwrap();
assert_eq!(sa.as_slice(), &[3, 2, 1, 0]); }
#[test]
fn test_larsson_sadakane_edge_cases() {
let config = SuffixArrayConfig {
algorithm: SuffixArrayAlgorithm::LarssonSadakane,
..Default::default()
};
let sa = SuffixArray::with_config(b"", &config).unwrap();
assert_eq!(sa.as_slice().len(), 0);
let sa = SuffixArray::with_config(b"a", &config).unwrap();
assert_eq!(sa.as_slice(), &[0]);
let sa = SuffixArray::with_config(b"ab", &config).unwrap();
assert_eq!(sa.as_slice().len(), 2);
assert!(
sa.as_slice()[0] < sa.as_slice()[1]
|| b"ab"[sa.as_slice()[0]..] <= b"ab"[sa.as_slice()[1]..]
);
let sa = SuffixArray::with_config(b"aaaa", &config).unwrap();
assert_eq!(sa.as_slice(), &[3, 2, 1, 0]);
let sa = SuffixArray::with_config(b"abababab", &config).unwrap();
assert_eq!(sa.as_slice().len(), 8);
let text = b"abababab";
let suffixes = sa.as_slice();
for i in 1..suffixes.len() {
let suffix1 = &text[suffixes[i - 1]..];
let suffix2 = &text[suffixes[i]..];
assert!(suffix1 <= suffix2);
}
}
#[test]
fn test_adaptive_algorithm_selection_updated() {
let config = SuffixArrayConfig {
algorithm: SuffixArrayAlgorithm::Adaptive,
adaptive_threshold: 100,
..Default::default()
};
let builder = SuffixArrayBuilder::new(config);
let algorithm = builder.select_algorithm(b"small");
assert_eq!(algorithm, SuffixArrayAlgorithm::DC3);
let algorithm = builder.select_algorithm(&vec![b'a'; 1000]);
assert!(
algorithm == SuffixArrayAlgorithm::LarssonSadakane
|| algorithm == SuffixArrayAlgorithm::SAIS
);
let large_diverse: Vec<u8> = (0..100_000).map(|i| (i % 256) as u8).collect();
let algorithm = builder.select_algorithm(&large_diverse);
assert_eq!(algorithm, SuffixArrayAlgorithm::DivSufSort);
let medium_diverse: Vec<u8> = (0..60_000).map(|i| (i % 256) as u8).collect();
let algorithm = builder.select_algorithm(&medium_diverse);
assert_eq!(algorithm, SuffixArrayAlgorithm::DivSufSort);
}
#[test]
fn test_repetitive_data_performance() {
let config = SuffixArrayConfig {
algorithm: SuffixArrayAlgorithm::DivSufSort,
adaptive_threshold: 10,
..Default::default()
};
let repetitive_text = b"abcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabc";
let builder = SuffixArrayBuilder::new(config);
let _algorithm = builder.select_algorithm(repetitive_text);
let sa = SuffixArray::with_config(repetitive_text, &builder.config).unwrap();
assert_eq!(sa.as_slice().len(), repetitive_text.len());
let suffixes = sa.as_slice();
for i in 1..suffixes.len() {
let suffix1 = &repetitive_text[suffixes[i - 1]..];
let suffix2 = &repetitive_text[suffixes[i]..];
assert!(
suffix1 <= suffix2,
"Repetitive data suffix array not properly sorted at position {}",
i
);
}
}
}