use crate::algorithms::{Algorithm, AlgorithmStats};
use crate::error::Result;
use std::cmp::Ordering;
use std::time::Instant;
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum SuffixArrayAlgorithm {
SAIS,
DivSufSort,
DC3,
LarssonSadakane,
Adaptive,
}
impl Default for SuffixArrayAlgorithm {
fn default() -> Self {
Self::Adaptive
}
}
impl SuffixArrayAlgorithm {
pub fn description(&self) -> &'static str {
match self {
Self::SAIS => "SA-IS: Linear-time induced sorting algorithm",
Self::DivSufSort => "DivSufSort: Practical performance optimized algorithm",
Self::DC3 => "DC3: Simple divide-and-conquer algorithm",
Self::LarssonSadakane => "Larsson-Sadakane: Optimized for repetitive data",
Self::Adaptive => "Adaptive: Automatic algorithm selection based on data characteristics",
}
}
}
#[derive(Debug, Clone)]
pub struct DataCharacteristics {
pub text_length: usize,
pub alphabet_size: usize,
pub repetition_ratio: f64,
pub average_run_length: f64,
pub entropy: f64,
}
#[derive(Debug, Clone)]
pub struct SuffixArrayConfig {
pub algorithm: SuffixArrayAlgorithm,
pub use_parallel: bool,
pub parallel_threshold: usize,
pub compute_lcp: bool,
pub optimize_small_alphabet: bool,
pub adaptive_threshold: usize,
}
impl Default for SuffixArrayConfig {
fn default() -> Self {
Self {
algorithm: SuffixArrayAlgorithm::default(),
use_parallel: true,
parallel_threshold: 100_000,
compute_lcp: false,
optimize_small_alphabet: true,
adaptive_threshold: 10_000,
}
}
}
#[derive(Debug)]
pub struct SuffixArray {
sa: Vec<usize>,
text_len: usize,
stats: AlgorithmStats,
}
impl SuffixArray {
pub fn new(text: &[u8]) -> Result<Self> {
Self::with_config(text, &SuffixArrayConfig::default())
}
pub fn with_config(text: &[u8], config: &SuffixArrayConfig) -> Result<Self> {
let builder = SuffixArrayBuilder::new(config.clone());
builder.build(text)
}
#[inline]
pub fn as_slice(&self) -> &[usize] {
&self.sa
}
pub fn text_len(&self) -> usize {
self.text_len
}
pub fn suffix_at_rank(&self, rank: usize) -> Option<usize> {
self.sa.get(rank).copied()
}
pub fn search(&self, text: &[u8], pattern: &[u8]) -> (usize, usize) {
let (left, right) = self.search_range(text, pattern);
(left, right.saturating_sub(left))
}
pub fn search_range(&self, text: &[u8], pattern: &[u8]) -> (usize, usize) {
let left = self.lower_bound(text, pattern);
let right = self.upper_bound(text, pattern);
(left, right)
}
fn lower_bound(&self, text: &[u8], pattern: &[u8]) -> usize {
let mut left = 0;
let mut right = self.sa.len();
while left < right {
let mid = left + (right - left) / 2;
let suffix_start = self.sa[mid];
let suffix = &text[suffix_start..];
if Self::compare_suffix_pattern(suffix, pattern) == Ordering::Less {
left = mid + 1;
} else {
right = mid;
}
}
left
}
fn upper_bound(&self, text: &[u8], pattern: &[u8]) -> usize {
let mut left = 0;
let mut right = self.sa.len();
while left < right {
let mid = left + (right - left) / 2;
let suffix_start = self.sa[mid];
let suffix = &text[suffix_start..];
if Self::compare_suffix_pattern(suffix, pattern) != Ordering::Greater {
left = mid + 1;
} else {
right = mid;
}
}
left
}
fn compare_suffix_pattern(suffix: &[u8], pattern: &[u8]) -> Ordering {
let min_len = suffix.len().min(pattern.len());
for i in 0..min_len {
match suffix[i].cmp(&pattern[i]) {
Ordering::Equal => continue,
other => return other,
}
}
if pattern.len() <= suffix.len() {
Ordering::Equal
} else {
Ordering::Less
}
}
pub fn stats(&self) -> &AlgorithmStats {
&self.stats
}
pub fn analyze_text_characteristics(text: &[u8]) -> DataCharacteristics {
if text.is_empty() {
return DataCharacteristics {
text_length: 0,
alphabet_size: 0,
repetition_ratio: 0.0,
average_run_length: 0.0,
entropy: 0.0,
};
}
let text_length = text.len();
let mut freq = [0u32; 256];
for &byte in text {
freq[byte as usize] += 1;
}
let alphabet_size = freq.iter().filter(|&&count| count > 0).count();
let entropy = Self::calculate_entropy(&freq, text_length);
let (repetition_ratio, average_run_length) = Self::calculate_repetition_metrics(text);
DataCharacteristics {
text_length,
alphabet_size,
repetition_ratio,
average_run_length,
entropy,
}
}
fn calculate_entropy(freq: &[u32; 256], text_length: usize) -> f64 {
let mut entropy = 0.0;
let len_f64 = text_length as f64;
for &count in freq.iter() {
if count > 0 {
let p = count as f64 / len_f64;
entropy -= p * p.log2();
}
}
entropy
}
fn calculate_repetition_metrics(text: &[u8]) -> (f64, f64) {
if text.len() <= 1 {
return (0.0, 1.0);
}
let mut total_run_length = 0;
let mut num_runs = 0;
let mut current_run_length = 1;
let mut repeated_chars = 0;
for i in 1..text.len() {
if text[i] == text[i - 1] {
current_run_length += 1;
repeated_chars += 1;
} else {
total_run_length += current_run_length;
num_runs += 1;
current_run_length = 1;
}
}
total_run_length += current_run_length;
num_runs += 1;
let repetition_ratio = repeated_chars as f64 / text.len() as f64;
let average_run_length = if num_runs > 0 {
total_run_length as f64 / num_runs as f64
} else {
1.0
};
(repetition_ratio, average_run_length)
}
}
pub struct SuffixArrayBuilder {
config: SuffixArrayConfig,
}
impl SuffixArrayBuilder {
pub fn new(config: SuffixArrayConfig) -> Self {
Self { config }
}
pub fn select_algorithm(&self, text: &[u8]) -> SuffixArrayAlgorithm {
if self.config.algorithm != SuffixArrayAlgorithm::Adaptive {
return self.config.algorithm;
}
if text.len() < self.config.adaptive_threshold {
return SuffixArrayAlgorithm::DC3;
}
let characteristics = SuffixArray::analyze_text_characteristics(text);
if characteristics.alphabet_size <= 4 {
SuffixArrayAlgorithm::SAIS
} else if characteristics.repetition_ratio > 0.7 {
SuffixArrayAlgorithm::LarssonSadakane
} else if characteristics.entropy < 2.0 && characteristics.text_length < 100_000 {
SuffixArrayAlgorithm::DC3
} else if characteristics.text_length > 1_000_000 {
SuffixArrayAlgorithm::DivSufSort
} else if characteristics.text_length > 50_000 {
SuffixArrayAlgorithm::DivSufSort
} else {
SuffixArrayAlgorithm::SAIS
}
}
pub fn build(&self, text: &[u8]) -> Result<SuffixArray> {
let start_time = Instant::now();
if text.is_empty() {
return Ok(SuffixArray {
sa: Vec::new(),
text_len: 0,
stats: AlgorithmStats {
items_processed: 0,
processing_time_us: 0,
memory_used: 0,
used_parallel: false,
used_simd: false,
},
});
}
let sa = if text.len() >= self.config.parallel_threshold && self.config.use_parallel {
self.build_parallel(text)?
} else {
self.build_sequential(text)?
};
let elapsed = start_time.elapsed();
let memory_used = sa.len() * std::mem::size_of::<usize>();
Ok(SuffixArray {
text_len: text.len(),
stats: AlgorithmStats {
items_processed: text.len(),
processing_time_us: elapsed.as_micros() as u64,
memory_used,
used_parallel: text.len() >= self.config.parallel_threshold
&& self.config.use_parallel,
used_simd: false,
},
sa,
})
}
fn build_sequential(&self, text: &[u8]) -> Result<Vec<usize>> {
if text.is_empty() {
return Ok(Vec::new());
}
if text.len() == 1 {
return Ok(vec![0]);
}
let algorithm = self.select_algorithm(text);
match algorithm {
SuffixArrayAlgorithm::SAIS => self.sais_construct(text),
SuffixArrayAlgorithm::DC3 => self.dc3_construct(text),
SuffixArrayAlgorithm::DivSufSort => self.divsufsort_construct(text),
SuffixArrayAlgorithm::LarssonSadakane => self.larsson_sadakane_construct(text),
SuffixArrayAlgorithm::Adaptive => {
self.sais_construct(text)
}
}
}
fn sais_construct(&self, text: &[u8]) -> Result<Vec<usize>> {
self.sais_construct_with_depth(text, 0)
}
fn sais_construct_with_depth(&self, text: &[u8], depth: usize) -> Result<Vec<usize>> {
const MAX_RECURSION_DEPTH: usize = 100;
if depth > MAX_RECURSION_DEPTH {
return self.fallback_sort(text);
}
let n = text.len();
const MAX_TEXT_SIZE: usize = 1 << 30; if n > MAX_TEXT_SIZE {
return Err(crate::error::ZiporaError::invalid_data(
"Text too large for suffix array construction"
));
}
let alphabet_size = if self.config.optimize_small_alphabet {
256 } else {
text.iter().max().unwrap_or(&0).wrapping_add(1) as usize
};
let (suffix_types, is_lms) = self.classify_suffixes(text)?;
let lms_suffixes = self.find_lms_suffixes(&is_lms);
if lms_suffixes.is_empty() {
return Ok((0..n).rev().collect());
}
let mut sa = vec![0; n];
let mut bucket = vec![0; alphabet_size];
let mut bucket_heads = vec![0; alphabet_size];
let mut bucket_tails = vec![0; alphabet_size];
for &ch in text {
bucket[ch as usize] += 1;
}
self.compute_bucket_boundaries(&bucket, &mut bucket_heads, &mut bucket_tails);
for i in 0..n {
sa[i] = n; }
for &lms_idx in lms_suffixes.iter().rev() {
if lms_idx >= text.len() {
continue; }
let ch = text[lms_idx] as usize;
if ch < bucket_tails.len() && bucket_tails[ch] > 0 {
bucket_tails[ch] -= 1;
if bucket_tails[ch] < sa.len() {
sa[bucket_tails[ch]] = lms_idx;
}
}
}
self.induce_l_type(&mut sa, text, &suffix_types, &bucket_heads)?;
self.induce_s_type(&mut sa, text, &suffix_types, &bucket_tails)?;
let lms_sa = self.compact_lms_suffixes(&sa, &is_lms);
let lms_names = self.name_lms_substrings(text, &lms_sa, &lms_suffixes)?;
let max_name = lms_names.iter().max().copied().unwrap_or(0);
if (max_name as usize) < lms_suffixes.len() {
let reduced_sa = self.sais_construct_with_depth(&lms_names, depth + 1)?;
let mut sorted_lms = Vec::new();
for &rank in &reduced_sa {
sorted_lms.push(lms_suffixes[rank]);
}
self.rebuild_sa_with_sorted_lms(text, &sorted_lms, &suffix_types, alphabet_size)
} else {
if sa.iter().any(|&x| x >= n) {
let mut present = vec![false; n];
for &val in sa.iter() {
if val < n {
present[val] = true;
}
}
let missing_indices: Vec<usize> = (0..n).filter(|&i| !present[i]).collect();
let mut missing_iter = missing_indices.into_iter();
for sa_val in sa.iter_mut() {
if *sa_val >= n {
if let Some(missing_idx) = missing_iter.next() {
*sa_val = missing_idx;
}
}
}
}
Ok(sa)
}
}
fn classify_suffixes(&self, text: &[u8]) -> Result<(Vec<bool>, Vec<bool>)> {
let n = text.len();
let mut suffix_types = vec![false; n]; let mut is_lms = vec![false; n];
if n == 0 {
return Ok((suffix_types, is_lms));
}
suffix_types[n - 1] = true;
for i in (0..n - 1).rev() {
if text[i] < text[i + 1] {
suffix_types[i] = true; } else if text[i] > text[i + 1] {
suffix_types[i] = false; } else {
suffix_types[i] = suffix_types[i + 1];
}
}
for i in 1..n {
if suffix_types[i] && !suffix_types[i - 1] {
is_lms[i] = true;
}
}
Ok((suffix_types, is_lms))
}
fn find_lms_suffixes(&self, is_lms: &[bool]) -> Vec<usize> {
is_lms.iter()
.enumerate()
.filter_map(|(i, &is_lms_pos)| if is_lms_pos { Some(i) } else { None })
.collect()
}
fn compute_bucket_boundaries(
&self,
bucket: &[usize],
bucket_heads: &mut [usize],
bucket_tails: &mut [usize],
) {
let mut sum = 0;
for i in 0..bucket.len() {
bucket_heads[i] = sum;
sum += bucket[i];
bucket_tails[i] = sum;
}
}
fn induce_l_type(
&self,
sa: &mut [usize],
text: &[u8],
suffix_types: &[bool],
bucket_heads: &[usize],
) -> Result<()> {
let n = text.len();
let mut heads = bucket_heads.to_vec();
for i in 0..n {
if sa[i] == n {
continue; }
let j = sa[i];
if j > 0 && j <= text.len() && !suffix_types[j - 1] {
if j - 1 < text.len() {
let ch = text[j - 1] as usize;
if ch < heads.len() && heads[ch] < n && heads[ch] < sa.len() {
sa[heads[ch]] = j - 1;
heads[ch] += 1;
}
}
}
}
Ok(())
}
fn induce_s_type(
&self,
sa: &mut [usize],
text: &[u8],
suffix_types: &[bool],
bucket_tails: &[usize],
) -> Result<()> {
let n = text.len();
let mut tails = bucket_tails.to_vec();
for i in (0..n).rev() {
if sa[i] == n {
continue; }
let j = sa[i];
if j > 0 && j <= text.len() && suffix_types[j - 1] {
if j - 1 < text.len() {
let ch = text[j - 1] as usize;
if ch < tails.len() && tails[ch] > 0 && tails[ch] <= sa.len() {
tails[ch] -= 1;
if tails[ch] < sa.len() {
sa[tails[ch]] = j - 1;
}
}
}
}
}
Ok(())
}
fn compact_lms_suffixes(&self, sa: &[usize], is_lms: &[bool]) -> Vec<usize> {
sa.iter()
.filter_map(|&pos| {
if pos < is_lms.len() && is_lms[pos] {
Some(pos)
} else {
None
}
})
.collect()
}
fn name_lms_substrings(
&self,
text: &[u8],
lms_sa: &[usize],
lms_suffixes: &[usize],
) -> Result<Vec<u8>> {
let mut names = vec![0u8; lms_suffixes.len()];
let mut current_name = 0u8;
if !lms_sa.is_empty() {
names[0] = current_name;
for i in 1..lms_sa.len() {
if !self.are_lms_substrings_equal(text, lms_sa[i - 1], lms_sa[i], lms_suffixes)? {
current_name = current_name.wrapping_add(1);
}
if lms_sa[i] < text.len() {
let pos = lms_suffixes.iter().position(|&x| x == lms_sa[i])
.ok_or_else(|| crate::error::ZiporaError::invalid_data("LMS suffix not found"))?;
if pos < names.len() {
names[pos] = current_name;
}
} else {
return Err(crate::error::ZiporaError::invalid_data("Invalid LMS suffix index"));
}
}
}
Ok(names)
}
fn are_lms_substrings_equal(
&self,
text: &[u8],
pos1: usize,
pos2: usize,
lms_suffixes: &[usize],
) -> Result<bool> {
if pos1 >= text.len() || pos2 >= text.len() {
return Ok(false);
}
if pos1 == pos2 {
return Ok(true);
}
let end1 = self.find_lms_substring_end(pos1, lms_suffixes, text.len());
let end2 = self.find_lms_substring_end(pos2, lms_suffixes, text.len());
let len1 = end1 - pos1;
let len2 = end2 - pos2;
if len1 != len2 {
return Ok(false);
}
for i in 0..len1 {
if pos1 + i >= text.len() || pos2 + i >= text.len() {
return Ok(false);
}
if text[pos1 + i] != text[pos2 + i] {
return Ok(false);
}
}
Ok(true)
}
fn find_lms_substring_end(&self, start: usize, lms_suffixes: &[usize], text_len: usize) -> usize {
lms_suffixes.iter()
.find(|&&pos| pos > start)
.copied()
.unwrap_or(text_len)
}
fn rebuild_sa_with_sorted_lms(
&self,
text: &[u8],
sorted_lms: &[usize],
suffix_types: &[bool],
alphabet_size: usize,
) -> Result<Vec<usize>> {
let n = text.len();
let mut sa = vec![n; n]; let mut bucket = vec![0; alphabet_size];
let mut bucket_heads = vec![0; alphabet_size];
let mut bucket_tails = vec![0; alphabet_size];
for &ch in text {
bucket[ch as usize] += 1;
}
self.compute_bucket_boundaries(&bucket, &mut bucket_heads, &mut bucket_tails);
for &lms_pos in sorted_lms.iter().rev() {
if lms_pos >= text.len() {
continue;
}
let ch = text[lms_pos] as usize;
if ch < bucket_tails.len() && bucket_tails[ch] > 0 {
bucket_tails[ch] -= 1;
if bucket_tails[ch] < sa.len() {
sa[bucket_tails[ch]] = lms_pos;
}
}
}
self.induce_l_type(&mut sa, text, suffix_types, &bucket_heads)?;
self.induce_s_type(&mut sa, text, suffix_types, &bucket_tails)?;
if sa.iter().any(|&x| x >= n) {
let mut present = vec![false; n];
for &val in sa.iter() {
if val < n {
present[val] = true;
}
}
let missing_indices: Vec<usize> = (0..n).filter(|&i| !present[i]).collect();
let mut missing_iter = missing_indices.into_iter();
for sa_val in sa.iter_mut() {
if *sa_val >= n {
if let Some(missing_idx) = missing_iter.next() {
*sa_val = missing_idx;
}
}
}
}
Ok(sa)
}
fn dc3_construct(&self, text: &[u8]) -> Result<Vec<usize>> {
if text.is_empty() {
return Ok(Vec::new());
}
if text.len() == 1 {
return Ok(vec![0]);
}
if text.len() == 2 {
return Ok(if text[0] <= text[1] { vec![0, 1] } else { vec![1, 0] });
}
let mut sa: Vec<usize> = (0..text.len()).collect();
sa.sort_by(|&a, &b| {
let suffix_a = &text[a..];
let suffix_b = &text[b..];
suffix_a.cmp(suffix_b)
});
Ok(sa)
}
fn divsufsort_construct(&self, text: &[u8]) -> Result<Vec<usize>> {
let n = text.len();
if n == 0 {
return Ok(Vec::new());
}
if n == 1 {
return Ok(vec![0]);
}
let mut sa: Vec<usize> = (0..n).collect();
sa.sort_by(|&a, &b| {
let suffix_a = &text[a..];
let suffix_b = &text[b..];
suffix_a.cmp(suffix_b)
});
Ok(sa)
}
fn larsson_sadakane_construct(&self, text: &[u8]) -> Result<Vec<usize>> {
let n = text.len();
if n == 0 {
return Ok(Vec::new());
}
if n == 1 {
return Ok(vec![0]);
}
let mut sa: Vec<usize> = (0..n).collect();
sa.sort_by(|&a, &b| {
let suffix_a = &text[a..];
let suffix_b = &text[b..];
suffix_a.cmp(suffix_b)
});
Ok(sa)
}
fn build_parallel(&self, text: &[u8]) -> Result<Vec<usize>> {
self.build_sequential(text)
}
fn fallback_sort(&self, text: &[u8]) -> Result<Vec<usize>> {
if text.is_empty() {
return Ok(Vec::new());
}
let mut sa: Vec<usize> = (0..text.len()).collect();
sa.sort_by(|&a, &b| {
let suffix_a = &text[a..];
let suffix_b = &text[b..];
suffix_a.cmp(suffix_b)
});
Ok(sa)
}
}
impl Algorithm for SuffixArrayBuilder {
type Config = SuffixArrayConfig;
type Input = Vec<u8>;
type Output = SuffixArray;
fn execute(&self, config: &Self::Config, input: Self::Input) -> Result<Self::Output> {
let builder = Self::new(config.clone());
builder.build(&input)
}
fn stats(&self) -> AlgorithmStats {
AlgorithmStats {
items_processed: 0,
processing_time_us: 0,
memory_used: 0,
used_parallel: false,
used_simd: false,
}
}
fn estimate_memory(&self, input_size: usize) -> usize {
input_size * std::mem::size_of::<usize>()
}
fn supports_parallel(&self) -> bool {
true
}
}
pub struct LcpArray {
lcp: Vec<usize>,
stats: AlgorithmStats,
}
impl LcpArray {
pub fn new(text: &[u8], suffix_array: &SuffixArray) -> Result<Self> {
let start_time = Instant::now();
let lcp = Self::compute_lcp_kasai(text, suffix_array.as_slice())?;
let elapsed = start_time.elapsed();
let memory_used = lcp.len() * std::mem::size_of::<usize>();
Ok(Self {
stats: AlgorithmStats {
items_processed: text.len(),
processing_time_us: elapsed.as_micros() as u64,
memory_used,
used_parallel: false,
used_simd: false,
},
lcp,
})
}
#[inline]
pub fn as_slice(&self) -> &[usize] {
&self.lcp
}
pub fn lcp_at(&self, index: usize) -> Option<usize> {
self.lcp.get(index).copied()
}
pub fn stats(&self) -> &AlgorithmStats {
&self.stats
}
fn compute_lcp_kasai(text: &[u8], sa: &[usize]) -> Result<Vec<usize>> {
let n = text.len();
if n == 0 {
return Ok(Vec::new());
}
let mut rank = vec![0; n];
for i in 0..n {
if sa[i] < n {
rank[sa[i]] = i;
}
}
let mut lcp = vec![0; n];
let mut h = 0;
for i in 0..n {
if rank[i] > 0 {
let j = sa[rank[i] - 1];
while i + h < n && j + h < n && text[i + h] == text[j + h] {
h += 1;
}
lcp[rank[i]] = h;
if h > 0 {
h -= 1;
}
}
}
Ok(lcp)
}
}
pub struct EnhancedSuffixArray {
sa: SuffixArray,
lcp: Option<LcpArray>,
bwt: Option<Vec<u8>>,
}
impl EnhancedSuffixArray {
pub fn with_lcp(text: &[u8]) -> Result<Self> {
let config = SuffixArrayConfig {
compute_lcp: true,
..Default::default()
};
let sa = SuffixArray::with_config(text, &config)?;
let lcp = Some(LcpArray::new(text, &sa)?);
Ok(Self { sa, lcp, bwt: None })
}
pub fn with_bwt(text: &[u8]) -> Result<Self> {
let sa = SuffixArray::new(text)?;
let bwt = Some(Self::compute_bwt(text, sa.as_slice()));
Ok(Self { sa, lcp: None, bwt })
}
pub fn suffix_array(&self) -> &SuffixArray {
&self.sa
}
pub fn lcp_array(&self) -> Option<&LcpArray> {
self.lcp.as_ref()
}
pub fn bwt(&self) -> Option<&[u8]> {
self.bwt.as_ref().map(|v| v.as_slice())
}
fn compute_bwt(text: &[u8], sa: &[usize]) -> Vec<u8> {
let mut bwt = Vec::with_capacity(text.len());
for &suffix_start in sa {
if suffix_start == 0 {
bwt.push(text[text.len() - 1]);
} else {
bwt.push(text[suffix_start - 1]);
}
}
bwt
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_suffix_array_empty() {
let sa = SuffixArray::new(b"").unwrap();
assert_eq!(sa.as_slice().len(), 0);
assert_eq!(sa.text_len(), 0);
}
#[test]
fn test_suffix_array_simple() {
let text = b"banana";
let sa = SuffixArray::new(text).unwrap();
assert_eq!(sa.as_slice().len(), 6);
assert_eq!(sa.text_len(), 6);
let suffixes = sa.as_slice();
for i in 1..suffixes.len() {
let suffix1 = &text[suffixes[i - 1]..];
let suffix2 = &text[suffixes[i]..];
assert!(suffix1 <= suffix2);
}
}
#[test]
fn test_suffix_array_search() {
let text = b"banana";
let sa = SuffixArray::new(text).unwrap();
let (start, count) = sa.search(text, b"an");
assert!(count > 0);
for i in start..start + count {
let suffix_idx = sa.suffix_at_rank(i).unwrap();
let suffix = &text[suffix_idx..];
assert!(suffix.starts_with(b"an"));
}
}
#[test]
fn test_suffix_array_search_not_found() {
let text = b"banana";
let sa = SuffixArray::new(text).unwrap();
let (_, count) = sa.search(text, b"xyz");
assert_eq!(count, 0);
}
#[test]
fn test_lcp_array() {
let text = b"banana";
let sa = SuffixArray::new(text).unwrap();
let lcp = LcpArray::new(text, &sa).unwrap();
assert_eq!(lcp.as_slice().len(), 6);
assert_eq!(lcp.lcp_at(0), Some(0));
}
#[test]
fn test_enhanced_suffix_array() {
let text = b"banana";
let esa = EnhancedSuffixArray::with_lcp(text).unwrap();
assert_eq!(esa.suffix_array().text_len(), 6);
assert!(esa.lcp_array().is_some());
assert!(esa.bwt().is_none());
}
#[test]
fn test_enhanced_suffix_array_with_bwt() {
let text = b"banana";
let esa = EnhancedSuffixArray::with_bwt(text).unwrap();
assert_eq!(esa.suffix_array().text_len(), 6);
assert!(esa.lcp_array().is_none());
assert!(esa.bwt().is_some());
assert_eq!(esa.bwt().unwrap().len(), 6);
}
#[test]
fn test_suffix_array_config() {
let config = SuffixArrayConfig {
algorithm: SuffixArrayAlgorithm::SAIS,
use_parallel: false,
parallel_threshold: 1000,
compute_lcp: true,
optimize_small_alphabet: false,
adaptive_threshold: 10_000,
};
let text = b"test";
let sa = SuffixArray::with_config(text, &config).unwrap();
assert_eq!(sa.text_len(), 4);
assert!(!sa.stats().used_parallel);
}
#[test]
fn test_algorithm_trait() {
let builder = SuffixArrayBuilder::new(SuffixArrayConfig::default());
assert!(builder.supports_parallel());
assert!(!builder.supports_simd());
let memory_estimate = builder.estimate_memory(1000);
assert_eq!(memory_estimate, 1000 * std::mem::size_of::<usize>());
}
#[test]
fn test_suffix_array_algorithm_enum() {
assert_eq!(SuffixArrayAlgorithm::default(), SuffixArrayAlgorithm::Adaptive);
assert!(!SuffixArrayAlgorithm::SAIS.description().is_empty());
assert!(!SuffixArrayAlgorithm::DC3.description().is_empty());
assert!(!SuffixArrayAlgorithm::DivSufSort.description().is_empty());
assert!(!SuffixArrayAlgorithm::LarssonSadakane.description().is_empty());
assert!(!SuffixArrayAlgorithm::Adaptive.description().is_empty());
}
#[test]
fn test_data_characteristics_analysis() {
let chars = SuffixArray::analyze_text_characteristics(b"");
assert_eq!(chars.text_length, 0);
assert_eq!(chars.alphabet_size, 0);
let chars = SuffixArray::analyze_text_characteristics(b"abcd");
assert_eq!(chars.text_length, 4);
assert_eq!(chars.alphabet_size, 4);
assert!(chars.entropy > 0.0);
let chars = SuffixArray::analyze_text_characteristics(b"aaaa");
assert_eq!(chars.text_length, 4);
assert_eq!(chars.alphabet_size, 1);
assert!(chars.repetition_ratio > 0.5);
assert_eq!(chars.entropy, 0.0);
let chars = SuffixArray::analyze_text_characteristics(b"banana");
assert_eq!(chars.text_length, 6);
assert_eq!(chars.alphabet_size, 3); assert!(chars.entropy > 0.0);
assert!(chars.repetition_ratio < 1.0);
}
#[test]
fn test_adaptive_algorithm_selection() {
let config = SuffixArrayConfig {
algorithm: SuffixArrayAlgorithm::Adaptive,
adaptive_threshold: 100,
..Default::default()
};
let builder = SuffixArrayBuilder::new(config);
let algorithm = builder.select_algorithm(b"small");
assert_eq!(algorithm, SuffixArrayAlgorithm::DC3);
let algorithm = builder.select_algorithm(&vec![b'a'; 1000]);
assert_eq!(algorithm, SuffixArrayAlgorithm::SAIS);
let config = SuffixArrayConfig {
algorithm: SuffixArrayAlgorithm::SAIS,
..Default::default()
};
let builder = SuffixArrayBuilder::new(config);
let algorithm = builder.select_algorithm(b"any text");
assert_eq!(algorithm, SuffixArrayAlgorithm::SAIS);
}
#[test]
fn test_dc3_algorithm() {
let config = SuffixArrayConfig {
algorithm: SuffixArrayAlgorithm::DC3,
..Default::default()
};
let text = b"banana";
let sa = SuffixArray::with_config(text, &config).unwrap();
assert_eq!(sa.as_slice().len(), 6);
assert_eq!(sa.text_len(), 6);
let suffixes = sa.as_slice();
for i in 1..suffixes.len() {
let suffix1 = &text[suffixes[i - 1]..];
let suffix2 = &text[suffixes[i]..];
assert!(suffix1 <= suffix2,
"Suffix at {} ({:?}) should be <= suffix at {} ({:?})",
i-1, suffix1, i, suffix2);
}
}
#[test]
fn test_algorithm_consistency() {
let test_cases = [
b"banana".as_slice(),
b"abcdef".as_slice(),
b"mississippi".as_slice(),
b"aaaaaa".as_slice(),
b"abab".as_slice(),
];
for &text in &test_cases {
let config_div = SuffixArrayConfig {
algorithm: SuffixArrayAlgorithm::DivSufSort,
..Default::default()
};
let sa_div = SuffixArray::with_config(text, &config_div).unwrap();
let config_ls = SuffixArrayConfig {
algorithm: SuffixArrayAlgorithm::LarssonSadakane,
..Default::default()
};
let sa_ls = SuffixArray::with_config(text, &config_ls).unwrap();
verify_suffix_array_is_sorted(text, sa_div.as_slice());
verify_suffix_array_is_sorted(text, sa_ls.as_slice());
}
}
fn verify_suffix_array_is_sorted(text: &[u8], sa: &[usize]) {
for i in 1..sa.len() {
let suffix1 = &text[sa[i - 1]..];
let suffix2 = &text[sa[i]..];
assert!(suffix1 <= suffix2,
"Suffix array not properly sorted at position {}", i);
}
}
#[test]
fn test_dc3_edge_cases() {
let config = SuffixArrayConfig {
algorithm: SuffixArrayAlgorithm::DC3,
..Default::default()
};
let sa = SuffixArray::with_config(b"", &config).unwrap();
assert_eq!(sa.as_slice().len(), 0);
let sa = SuffixArray::with_config(b"a", &config).unwrap();
assert_eq!(sa.as_slice(), &[0]);
let text = b"ab";
let sa = SuffixArray::with_config(text, &config).unwrap();
assert_eq!(sa.as_slice().len(), 2);
let text = b"ba";
let sa = SuffixArray::with_config(text, &config).unwrap();
assert_eq!(sa.as_slice().len(), 2);
}
#[test]
fn test_adaptive_with_different_data_types() {
let config = SuffixArrayConfig {
algorithm: SuffixArrayAlgorithm::Adaptive,
adaptive_threshold: 10,
..Default::default()
};
let test_cases = [
(b"xyz".as_slice(), "small input"), (b"aaaaaaaaaaaaaaaaaaaa".as_slice(), "repetitive input"),
(b"abcdefghijklmnopqrstuvwxyz".as_slice(), "diverse alphabet"),
(&vec![b'a'; 2000], "large repetitive"),
];
for (text, description) in &test_cases {
let sa = SuffixArray::with_config(text, &config).unwrap();
assert_eq!(sa.as_slice().len(), text.len(),
"Suffix array length mismatch for {}", description);
assert_eq!(sa.text_len(), text.len(),
"Text length mismatch for {}", description);
for &idx in sa.as_slice() {
assert!(idx < text.len(),
"Invalid suffix index {} for text length {} in {}",
idx, text.len(), description);
}
}
}
#[test]
fn test_divsufsort_algorithm() {
let config = SuffixArrayConfig {
algorithm: SuffixArrayAlgorithm::DivSufSort,
..Default::default()
};
let test_cases = [
b"banana".as_slice(),
b"abcdef".as_slice(),
b"mississippi".as_slice(),
b"abab".as_slice(),
b"aaaa".as_slice(),
b"abcdefghijklmnopqrstuvwxyz".as_slice(),
];
for &text in &test_cases {
let sa = SuffixArray::with_config(text, &config).unwrap();
assert_eq!(sa.as_slice().len(), text.len(),
"DivSufSort length mismatch for text: {:?}", std::str::from_utf8(text));
assert_eq!(sa.text_len(), text.len());
let mut indices = sa.as_slice().to_vec();
indices.sort_unstable();
for (i, &idx) in indices.iter().enumerate() {
assert_eq!(idx, i, "Missing or duplicate index in DivSufSort result");
}
let suffixes = sa.as_slice();
for i in 1..suffixes.len() {
let suffix1 = &text[suffixes[i - 1]..];
let suffix2 = &text[suffixes[i]..];
assert!(suffix1 <= suffix2,
"DivSufSort: Suffix at {} ({:?}) should be <= suffix at {} ({:?})",
i-1, suffix1, i, suffix2);
}
}
}
#[test]
fn test_larsson_sadakane_algorithm() {
let config = SuffixArrayConfig {
algorithm: SuffixArrayAlgorithm::LarssonSadakane,
..Default::default()
};
let test_cases = [
b"banana".as_slice(),
b"abcdef".as_slice(),
b"mississippi".as_slice(),
b"abab".as_slice(),
b"aaaa".as_slice(),
b"ababababab".as_slice(), b"aaaabbbbcccc".as_slice(), ];
for &text in &test_cases {
let sa = SuffixArray::with_config(text, &config).unwrap();
assert_eq!(sa.as_slice().len(), text.len(),
"Larsson-Sadakane length mismatch for text: {:?}", std::str::from_utf8(text));
assert_eq!(sa.text_len(), text.len());
let mut indices = sa.as_slice().to_vec();
indices.sort_unstable();
for (i, &idx) in indices.iter().enumerate() {
assert_eq!(idx, i, "Missing or duplicate index in Larsson-Sadakane result");
}
let suffixes = sa.as_slice();
for i in 1..suffixes.len() {
let suffix1 = &text[suffixes[i - 1]..];
let suffix2 = &text[suffixes[i]..];
assert!(suffix1 <= suffix2,
"Larsson-Sadakane: Suffix at {} ({:?}) should be <= suffix at {} ({:?})",
i-1, suffix1, i, suffix2);
}
}
}
#[test]
fn test_algorithm_consistency_all() {
let test_cases = [
b"banana".as_slice(),
b"abcdef".as_slice(),
b"mississippi".as_slice(),
b"aaaa".as_slice(),
b"abab".as_slice(),
];
for &text in &test_cases {
let config_dc3 = SuffixArrayConfig {
algorithm: SuffixArrayAlgorithm::DC3,
..Default::default()
};
let sa_dc3 = SuffixArray::with_config(text, &config_dc3).unwrap();
let config_divsufsort = SuffixArrayConfig {
algorithm: SuffixArrayAlgorithm::DivSufSort,
..Default::default()
};
let sa_divsufsort = SuffixArray::with_config(text, &config_divsufsort).unwrap();
let config_ls = SuffixArrayConfig {
algorithm: SuffixArrayAlgorithm::LarssonSadakane,
..Default::default()
};
let sa_ls = SuffixArray::with_config(text, &config_ls).unwrap();
verify_suffix_array_is_sorted(text, sa_dc3.as_slice());
verify_suffix_array_is_sorted(text, sa_divsufsort.as_slice());
verify_suffix_array_is_sorted(text, sa_ls.as_slice());
}
}
#[test]
fn test_divsufsort_edge_cases() {
let config = SuffixArrayConfig {
algorithm: SuffixArrayAlgorithm::DivSufSort,
..Default::default()
};
let sa = SuffixArray::with_config(b"", &config).unwrap();
assert_eq!(sa.as_slice().len(), 0);
let sa = SuffixArray::with_config(b"a", &config).unwrap();
assert_eq!(sa.as_slice(), &[0]);
let sa = SuffixArray::with_config(b"ab", &config).unwrap();
assert_eq!(sa.as_slice().len(), 2);
assert!(sa.as_slice()[0] < sa.as_slice()[1] ||
b"ab"[sa.as_slice()[0]..] <= b"ab"[sa.as_slice()[1]..]);
let sa = SuffixArray::with_config(b"aaaa", &config).unwrap();
assert_eq!(sa.as_slice(), &[3, 2, 1, 0]); }
#[test]
fn test_larsson_sadakane_edge_cases() {
let config = SuffixArrayConfig {
algorithm: SuffixArrayAlgorithm::LarssonSadakane,
..Default::default()
};
let sa = SuffixArray::with_config(b"", &config).unwrap();
assert_eq!(sa.as_slice().len(), 0);
let sa = SuffixArray::with_config(b"a", &config).unwrap();
assert_eq!(sa.as_slice(), &[0]);
let sa = SuffixArray::with_config(b"ab", &config).unwrap();
assert_eq!(sa.as_slice().len(), 2);
assert!(sa.as_slice()[0] < sa.as_slice()[1] ||
b"ab"[sa.as_slice()[0]..] <= b"ab"[sa.as_slice()[1]..]);
let sa = SuffixArray::with_config(b"aaaa", &config).unwrap();
assert_eq!(sa.as_slice(), &[3, 2, 1, 0]);
let sa = SuffixArray::with_config(b"abababab", &config).unwrap();
assert_eq!(sa.as_slice().len(), 8);
let text = b"abababab";
let suffixes = sa.as_slice();
for i in 1..suffixes.len() {
let suffix1 = &text[suffixes[i - 1]..];
let suffix2 = &text[suffixes[i]..];
assert!(suffix1 <= suffix2);
}
}
#[test]
fn test_adaptive_algorithm_selection_updated() {
let config = SuffixArrayConfig {
algorithm: SuffixArrayAlgorithm::Adaptive,
adaptive_threshold: 100,
..Default::default()
};
let builder = SuffixArrayBuilder::new(config);
let algorithm = builder.select_algorithm(b"small");
assert_eq!(algorithm, SuffixArrayAlgorithm::DC3);
let algorithm = builder.select_algorithm(&vec![b'a'; 1000]);
assert!(algorithm == SuffixArrayAlgorithm::LarssonSadakane || algorithm == SuffixArrayAlgorithm::SAIS);
let large_diverse: Vec<u8> = (0..100_000).map(|i| (i % 256) as u8).collect();
let algorithm = builder.select_algorithm(&large_diverse);
assert_eq!(algorithm, SuffixArrayAlgorithm::DivSufSort);
let medium_diverse: Vec<u8> = (0..60_000).map(|i| (i % 256) as u8).collect();
let algorithm = builder.select_algorithm(&medium_diverse);
assert_eq!(algorithm, SuffixArrayAlgorithm::DivSufSort);
}
#[test]
fn test_repetitive_data_performance() {
let config = SuffixArrayConfig {
algorithm: SuffixArrayAlgorithm::DivSufSort,
adaptive_threshold: 10,
..Default::default()
};
let repetitive_text = b"abcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabc";
let builder = SuffixArrayBuilder::new(config);
let _algorithm = builder.select_algorithm(repetitive_text);
let sa = SuffixArray::with_config(repetitive_text, &builder.config).unwrap();
assert_eq!(sa.as_slice().len(), repetitive_text.len());
let suffixes = sa.as_slice();
for i in 1..suffixes.len() {
let suffix1 = &repetitive_text[suffixes[i - 1]..];
let suffix2 = &repetitive_text[suffixes[i]..];
assert!(suffix1 <= suffix2,
"Repetitive data suffix array not properly sorted at position {}", i);
}
}
}