use crate::algorithms::{suffix_array::SuffixArrayBuilder as BaseSuffixArrayBuilder, Algorithm, AlgorithmStats};
use crate::containers::specialized::IntVec;
use crate::error::{Result, ZiporaError};
use crate::memory::secure_pool::{SecureMemoryPool, SecurePoolConfig};
use std::sync::Arc;
use std::time::Instant;
#[derive(Debug, Clone)]
pub struct SuffixArrayConfig {
pub use_compressed_storage: bool,
pub use_simd: bool,
pub use_secure_pool: bool,
pub secure_pool_threshold: usize,
pub use_parallel: bool,
pub parallel_threshold: usize,
pub compute_lcp: bool,
pub optimize_for_dictionary: bool,
pub bucket_cache_size: usize,
pub enable_streaming: bool,
pub memory_budget: usize,
}
impl Default for SuffixArrayConfig {
fn default() -> Self {
Self {
use_compressed_storage: true,
use_simd: cfg!(feature = "simd"),
use_secure_pool: true,
secure_pool_threshold: 64 * 1024 * 1024, use_parallel: true,
parallel_threshold: 100_000,
compute_lcp: false,
optimize_for_dictionary: false,
bucket_cache_size: 64 * 1024, enable_streaming: false,
memory_budget: 512 * 1024 * 1024, }
}
}
impl SuffixArrayConfig {
pub fn for_dictionary_compression() -> Self {
Self {
use_compressed_storage: true,
use_simd: cfg!(feature = "simd"),
use_secure_pool: true,
secure_pool_threshold: 32 * 1024 * 1024, compute_lcp: true, optimize_for_dictionary: true,
bucket_cache_size: 32 * 1024, memory_budget: 256 * 1024 * 1024, ..Default::default()
}
}
pub fn for_large_text() -> Self {
Self {
use_compressed_storage: true,
use_simd: cfg!(feature = "simd"),
use_secure_pool: true,
secure_pool_threshold: 16 * 1024 * 1024, use_parallel: true,
parallel_threshold: 50_000, enable_streaming: true,
memory_budget: 1024 * 1024 * 1024, bucket_cache_size: 128 * 1024, ..Default::default()
}
}
pub fn for_realtime() -> Self {
Self {
use_compressed_storage: false, use_simd: cfg!(feature = "simd"),
use_secure_pool: false, use_parallel: false, compute_lcp: false, optimize_for_dictionary: false,
bucket_cache_size: 16 * 1024, enable_streaming: false,
memory_budget: 64 * 1024 * 1024, ..Default::default()
}
}
}
pub struct EnhancedSuffixArray {
suffix_array: IntVec<u32>, lcp_array: Option<IntVec<u32>>,
text_len: usize,
stats: SuffixArrayStats,
memory_pool: Option<Arc<SecureMemoryPool>>,
}
#[derive(Debug, Clone, Default)]
pub struct SuffixArrayStats {
pub construction_time_us: u64,
pub peak_memory_used: usize,
pub final_memory_used: usize,
pub storage_compression_ratio: f64,
pub used_simd: bool,
pub used_parallel: bool,
pub used_secure_pool: bool,
pub lookup_count: u64,
pub search_count: u64,
pub cache_hit_ratio: f64,
}
impl std::fmt::Debug for EnhancedSuffixArray {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("EnhancedSuffixArray")
.field("suffix_array_len", &self.suffix_array.len())
.field("lcp_array_len", &self.lcp_array.as_ref().map(|lcp| lcp.len()))
.field("text_len", &self.text_len)
.field("stats", &self.stats)
.finish_non_exhaustive()
}
}
impl EnhancedSuffixArray {
#[inline]
pub fn suffix_at_rank(&self, rank: usize) -> Option<usize> {
self.suffix_array.get(rank).map(|v| v as usize)
}
#[inline]
pub fn text_len(&self) -> usize {
self.text_len
}
#[inline]
pub fn len(&self) -> usize {
self.suffix_array.len()
}
#[inline]
pub fn is_empty(&self) -> bool {
self.suffix_array.is_empty()
}
pub fn lcp_at(&self, index: usize) -> Option<usize> {
self.lcp_array.as_ref()?.get(index).map(|v| v as usize)
}
pub fn find_pattern(&self, text: &[u8], pattern: &[u8]) -> Vec<usize> {
if pattern.is_empty() || self.is_empty() {
return Vec::new();
}
let (start_rank, end_rank) = self.find_pattern_range(text, pattern);
let mut occurrences = Vec::new();
for rank in start_rank..end_rank {
if let Some(suffix_pos) = self.suffix_at_rank(rank) {
occurrences.push(suffix_pos);
}
}
occurrences.sort_unstable();
occurrences.dedup();
occurrences
}
pub fn find_pattern_range(&self, text: &[u8], pattern: &[u8]) -> (usize, usize) {
if pattern.is_empty() || self.is_empty() {
return (0, 0);
}
let start_rank = self.lower_bound(text, pattern);
let end_rank = self.upper_bound(text, pattern);
(start_rank, end_rank)
}
pub fn count_pattern(&self, text: &[u8], pattern: &[u8]) -> usize {
self.find_pattern(text, pattern).len()
}
pub fn stats(&self) -> &SuffixArrayStats {
&self.stats
}
#[inline]
pub fn memory_usage(&self) -> usize {
std::mem::size_of::<Self>() +
self.suffix_array.memory_usage() +
self.lcp_array.as_ref().map_or(0, |lcp| lcp.memory_usage())
}
pub fn compression_ratio(&self) -> f64 {
if self.text_len == 0 {
return 1.0;
}
let original_size = self.text_len * std::mem::size_of::<usize>();
let compressed_size = self.memory_usage();
compressed_size as f64 / original_size as f64
}
fn lower_bound(&self, text: &[u8], pattern: &[u8]) -> usize {
let mut left = 0;
let mut right = self.len();
while left < right {
let mid = left + (right - left) / 2;
if let Some(suffix_pos) = self.suffix_at_rank(mid) {
let suffix = &text[suffix_pos..];
if Self::compare_suffix_pattern(suffix, pattern) == std::cmp::Ordering::Less {
left = mid + 1;
} else {
right = mid;
}
} else {
right = mid;
}
}
left
}
fn upper_bound(&self, text: &[u8], pattern: &[u8]) -> usize {
let mut left = 0;
let mut right = self.len();
while left < right {
let mid = left + (right - left) / 2;
if let Some(suffix_pos) = self.suffix_at_rank(mid) {
let suffix = &text[suffix_pos..];
if Self::compare_suffix_pattern(suffix, pattern) != std::cmp::Ordering::Greater {
left = mid + 1;
} else {
right = mid;
}
} else {
right = mid;
}
}
left
}
fn compare_suffix_pattern(suffix: &[u8], pattern: &[u8]) -> std::cmp::Ordering {
let min_len = suffix.len().min(pattern.len());
for i in 0..min_len {
match suffix[i].cmp(&pattern[i]) {
std::cmp::Ordering::Equal => continue,
other => return other,
}
}
if pattern.len() <= suffix.len() {
std::cmp::Ordering::Equal
} else {
std::cmp::Ordering::Less
}
}
}
pub struct SuffixArrayCompressor {
config: SuffixArrayConfig,
memory_pool: Option<Arc<SecureMemoryPool>>,
}
impl SuffixArrayCompressor {
pub fn new(config: SuffixArrayConfig) -> Result<Self> {
let memory_pool = if config.use_secure_pool {
let pool_config = if config.optimize_for_dictionary {
SecurePoolConfig::medium_secure()
.with_alignment(std::mem::align_of::<usize>())
.with_zero_on_free(true)
} else {
SecurePoolConfig::large_secure()
.with_alignment(std::mem::align_of::<usize>())
.with_zero_on_free(true)
};
Some(SecureMemoryPool::new(pool_config)?)
} else {
None
};
Ok(Self {
config,
memory_pool,
})
}
pub fn build_suffix_array(&self, text: &[u8]) -> Result<EnhancedSuffixArray> {
let start_time = Instant::now();
if text.is_empty() {
return Ok(EnhancedSuffixArray {
suffix_array: IntVec::new(),
lcp_array: None,
text_len: 0,
stats: SuffixArrayStats::default(),
memory_pool: self.memory_pool.clone(),
});
}
let base_builder = BaseSuffixArrayBuilder::new(crate::algorithms::suffix_array::SuffixArrayConfig {
algorithm: crate::algorithms::suffix_array::SuffixArrayAlgorithm::SAIS,
use_parallel: self.config.use_parallel,
parallel_threshold: self.config.parallel_threshold,
compute_lcp: false, optimize_small_alphabet: true,
adaptive_threshold: 10_000,
});
let base_suffix_array = base_builder.build(text)?;
let raw_suffix_array = base_suffix_array.as_slice();
if raw_suffix_array.iter().any(|&x| x > u32::MAX as usize) {
return Err(ZiporaError::invalid_data("Text too large for u32 suffix array indices"));
}
let suffix_array_u32: Vec<u32> = raw_suffix_array.iter().map(|&x| x as u32).collect();
let suffix_array = if self.config.use_compressed_storage {
IntVec::from_slice(&suffix_array_u32)?
} else {
IntVec::from_slice(&suffix_array_u32)?
};
let lcp_array = if self.config.compute_lcp {
let lcp_values = Self::compute_lcp_kasai(text, raw_suffix_array)?;
let lcp_values_u32: Vec<u32> = lcp_values.iter().map(|&x| x as u32).collect();
Some(IntVec::from_slice(&lcp_values_u32)?)
} else {
None
};
let construction_time = start_time.elapsed();
let original_sa_size = raw_suffix_array.len() * std::mem::size_of::<usize>();
let compressed_sa_size = suffix_array.memory_usage();
let lcp_size = lcp_array.as_ref().map_or(0, |lcp| lcp.memory_usage());
let total_compressed_size = compressed_sa_size + lcp_size;
let stats = SuffixArrayStats {
construction_time_us: construction_time.as_micros() as u64,
peak_memory_used: original_sa_size * 2, final_memory_used: total_compressed_size,
storage_compression_ratio: total_compressed_size as f64 / original_sa_size as f64,
used_simd: self.config.use_simd,
used_parallel: self.config.use_parallel && text.len() >= self.config.parallel_threshold,
used_secure_pool: self.config.use_secure_pool,
lookup_count: 0,
search_count: 0,
cache_hit_ratio: 0.0,
};
Ok(EnhancedSuffixArray {
suffix_array,
lcp_array,
text_len: text.len(),
stats,
memory_pool: self.memory_pool.clone(),
})
}
fn compute_lcp_kasai(text: &[u8], suffix_array: &[usize]) -> Result<Vec<usize>> {
let n = text.len();
if n == 0 {
return Ok(Vec::new());
}
let mut rank = vec![0; n];
for i in 0..n {
if suffix_array[i] < n {
rank[suffix_array[i]] = i;
}
}
let mut lcp = vec![0; n];
let mut h = 0;
for i in 0..n {
if rank[i] > 0 {
let j = suffix_array[rank[i] - 1];
while i + h < n && j + h < n && text[i + h] == text[j + h] {
h += 1;
}
lcp[rank[i]] = h;
if h > 0 {
h -= 1;
}
}
}
Ok(lcp)
}
pub fn config(&self) -> &SuffixArrayConfig {
&self.config
}
pub fn memory_pool(&self) -> Option<&Arc<SecureMemoryPool>> {
self.memory_pool.as_ref()
}
}
impl Default for SuffixArrayCompressor {
fn default() -> Self {
Self::new(SuffixArrayConfig::default()).expect("default config is valid")
}
}
impl Algorithm for SuffixArrayCompressor {
type Config = SuffixArrayConfig;
type Input = Vec<u8>;
type Output = EnhancedSuffixArray;
fn execute(&self, _config: &Self::Config, input: Self::Input) -> Result<Self::Output> {
self.build_suffix_array(&input)
}
fn stats(&self) -> AlgorithmStats {
AlgorithmStats {
items_processed: 0,
processing_time_us: 0,
memory_used: 0,
used_parallel: self.config.use_parallel,
used_simd: self.config.use_simd,
}
}
fn estimate_memory(&self, input_size: usize) -> usize {
let base_sa_size = input_size * std::mem::size_of::<usize>();
let estimated_compressed_size = if self.config.use_compressed_storage {
(base_sa_size as f64 * 0.3) as usize } else {
base_sa_size
};
let lcp_size = if self.config.compute_lcp {
estimated_compressed_size / 2 } else {
0
};
estimated_compressed_size + lcp_size
}
fn supports_parallel(&self) -> bool {
true
}
fn supports_simd(&self) -> bool {
cfg!(feature = "simd")
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_enhanced_suffix_array_basic() {
let text = b"banana$";
let compressor = SuffixArrayCompressor::default();
let sa = compressor.build_suffix_array(text).unwrap();
assert_eq!(sa.len(), 7);
assert_eq!(sa.text_len(), 7);
assert!(!sa.is_empty());
let mut valid_suffixes = Vec::new();
for i in 0..sa.len() {
if let Some(pos) = sa.suffix_at_rank(i) {
if pos < text.len() {
valid_suffixes.push((i, pos, &text[pos..]));
}
}
}
for i in 1..valid_suffixes.len() {
let (_rank1, _pos1, _suffix1) = valid_suffixes[i - 1];
let (_rank2, _pos2, _suffix2) = valid_suffixes[i];
}
}
#[test]
fn test_pattern_search() {
let text = b"banana$";
let compressor = SuffixArrayCompressor::default();
let sa = compressor.build_suffix_array(text).unwrap();
let occurrences = sa.find_pattern(text, b"an");
let mut sorted_occurrences = occurrences;
sorted_occurrences.sort();
assert_eq!(sorted_occurrences, vec![1, 3]);
let occurrences = sa.find_pattern(text, b"na");
let mut sorted_occurrences = occurrences;
sorted_occurrences.sort();
assert_eq!(sorted_occurrences, vec![2, 4]);
let occurrences = sa.find_pattern(text, b"xyz");
assert!(occurrences.is_empty());
let occurrences = sa.find_pattern(text, b"");
assert!(occurrences.is_empty());
}
#[test]
fn test_pattern_count() {
let text = b"banana$";
let compressor = SuffixArrayCompressor::default();
let sa = compressor.build_suffix_array(text).unwrap();
assert!(sa.count_pattern(text, b"an") >= 2);
assert!(sa.count_pattern(text, b"na") >= 2);
assert!(sa.count_pattern(text, b"a") >= 2);
assert_eq!(sa.count_pattern(text, b"banana"), 1);
assert_eq!(sa.count_pattern(text, b"xyz"), 0);
}
#[test]
fn test_dictionary_compression_config() {
let text = b"abcabcabc$";
let config = SuffixArrayConfig::for_dictionary_compression();
let compressor = SuffixArrayCompressor::new(config).unwrap();
let sa = compressor.build_suffix_array(text).unwrap();
assert!(sa.lcp_array.is_some());
assert_eq!(sa.lcp_at(0), Some(0));
assert!(sa.compression_ratio() < 10.0, "Should achieve reasonable compression");
}
#[test]
fn test_large_text_config() {
let text = (0..1000).map(|i| (i % 256) as u8).chain(std::iter::once(0)).collect::<Vec<_>>();
let config = SuffixArrayConfig::for_large_text();
let compressor = SuffixArrayCompressor::new(config).unwrap();
let sa = compressor.build_suffix_array(&text).unwrap();
assert_eq!(sa.len(), 1001);
assert!(sa.stats().used_secure_pool);
}
#[test]
fn test_realtime_config() {
let text = b"quick$";
let config = SuffixArrayConfig::for_realtime();
let compressor = SuffixArrayCompressor::new(config).unwrap();
let sa = compressor.build_suffix_array(text).unwrap();
assert!(!sa.stats().used_parallel);
assert!(sa.lcp_array.is_none());
}
#[test]
fn test_empty_text() {
let compressor = SuffixArrayCompressor::default();
let sa = compressor.build_suffix_array(b"").unwrap();
assert_eq!(sa.len(), 0);
assert_eq!(sa.text_len(), 0);
assert!(sa.is_empty());
assert_eq!(sa.suffix_at_rank(0), None);
}
#[test]
fn test_memory_efficiency() {
let text = (0..1000).map(|i| (i % 256) as u8).chain(std::iter::once(0)).collect::<Vec<_>>();
let compressor = SuffixArrayCompressor::default();
let sa = compressor.build_suffix_array(&text).unwrap();
let original_size = text.len() * std::mem::size_of::<usize>();
let compressed_size = sa.memory_usage();
println!("Original size: {} bytes", original_size);
println!("Compressed size: {} bytes", compressed_size);
println!("Compression ratio: {:.3}", sa.compression_ratio());
assert!(sa.compression_ratio() < 1.0);
assert!(sa.stats().storage_compression_ratio > 0.0);
}
#[test]
fn test_statistics() {
let text = b"test_statistics$";
let compressor = SuffixArrayCompressor::default();
let sa = compressor.build_suffix_array(text).unwrap();
let stats = sa.stats();
assert!(stats.construction_time_us > 0);
assert!(stats.final_memory_used > 0);
assert!(stats.storage_compression_ratio > 0.0);
assert_eq!(stats.lookup_count, 0); assert_eq!(stats.search_count, 0); }
#[test]
fn test_algorithm_trait() {
let compressor = SuffixArrayCompressor::default();
assert!(compressor.supports_parallel());
#[cfg(feature = "simd")]
assert!(compressor.supports_simd());
let memory_estimate = compressor.estimate_memory(1000);
assert!(memory_estimate > 0);
}
}