use crate::algorithms::suffix_array::{SuffixArray, SuffixArrayConfig};
use crate::compression::dict_zip::dfa_cache::{DfaCache, DfaCacheConfig};
use crate::compression::dict_zip::matcher::{Match, PatternMatcher};
use crate::error::{Result, ZiporaError};
use crate::memory::{SecureMemoryPool, SecurePoolConfig};
#[cfg(feature = "serde")]
use serde::{Deserialize, Serialize};
use std::sync::Arc;
/// Tuning parameters for building and querying a [`SuffixArrayDictionary`].
#[derive(Debug, Clone)]
pub struct SuffixArrayDictionaryConfig {
/// Upper bound on dictionary size in bytes.
pub max_dict_size: usize,
/// Minimum pattern frequency used when building/pruning the DFA cache.
pub min_frequency: u32,
/// Maximum BFS depth used while building the DFA cache.
pub max_bfs_depth: u32,
/// Cap on the number of cached DFA states.
pub max_cache_states: usize,
/// Whether the dictionary operates in external mode.
// NOTE(review): only stored and reported here; semantics live in callers — confirm.
pub external_mode: bool,
/// Allocate through a SecureMemoryPool instead of the global allocator.
pub use_memory_pool: bool,
/// Enable SIMD acceleration (defaults to the "simd" cargo feature).
pub enable_simd: bool,
/// Fraction of the training data to keep when subsampling (1.0 = keep all).
pub sample_ratio: f64,
/// Shortest pattern length eligible for matching.
pub min_pattern_length: usize,
/// Longest pattern length eligible for matching.
pub max_pattern_length: usize,
/// Configuration forwarded to the DFA cache builder.
pub dfa_cache_config: DfaCacheConfig,
/// Configuration forwarded to the suffix array builder.
pub suffix_array_config: SuffixArrayConfig,
}
impl Default for SuffixArrayDictionaryConfig {
    /// Defaults tuned for in-memory dictionaries: 64 MiB cap, patterns of
    /// 4..=256 bytes, full (unsampled) training data, pooled allocation,
    /// and SIMD whenever the "simd" feature is compiled in.
    fn default() -> Self {
        Self {
            max_dict_size: 64 * 1024 * 1024,
            min_frequency: 4,
            max_bfs_depth: 6,
            max_cache_states: 65536,
            external_mode: false,
            use_memory_pool: true,
            enable_simd: cfg!(feature = "simd"),
            sample_ratio: 1.0,
            min_pattern_length: 4,
            max_pattern_length: 256,
            dfa_cache_config: DfaCacheConfig::default(),
            suffix_array_config: SuffixArrayConfig::default(),
        }
    }
}
/// Result of a dictionary lookup: a half-open suffix-array row range plus the
/// number of input bytes matched so far.
#[derive(Debug, Clone, PartialEq)]
pub struct MatchStatus {
/// First suffix-array row (inclusive) sharing the matched prefix.
pub lo: usize,
/// One past the last matching suffix-array row (exclusive).
pub hi: usize,
/// Number of input bytes matched.
pub depth: usize,
}
impl MatchStatus {
    /// Creates a status covering suffix-array rows `lo..hi` at depth `depth`.
    pub fn new(lo: usize, hi: usize, depth: usize) -> Self {
        Self { lo, hi, depth }
    }
    /// `true` when the range contains no suffix-array rows.
    #[inline]
    pub fn is_empty(&self) -> bool {
        !(self.lo < self.hi)
    }
    /// Number of dictionary suffixes sharing the matched prefix
    /// (zero for an empty or inverted range).
    pub fn match_count(&self) -> usize {
        self.hi.saturating_sub(self.lo)
    }
}
/// Running counters describing dictionary search behavior.
#[derive(Debug, Clone, Default)]
pub struct MatchStats {
/// Total number of searches performed.
pub total_searches: u64,
/// Searches served by the DFA cache.
pub cache_hits: u64,
/// Searches that fell back to suffix-array lookups.
pub suffix_array_lookups: u64,
/// Total bytes matched across all searches.
pub bytes_matched: u64,
/// Mean match length (bytes_matched / total_searches).
pub avg_match_length: f64,
/// Cumulative search time in microseconds.
pub total_search_time_us: u64,
}
impl MatchStats {
    /// Fraction of searches served by the DFA cache; 0.0 before any search.
    pub fn cache_hit_ratio(&self) -> f64 {
        match self.total_searches {
            0 => 0.0,
            n => self.cache_hits as f64 / n as f64,
        }
    }
    /// Mean wall-clock time per search in microseconds; 0.0 before any search.
    pub fn avg_search_time_us(&self) -> f64 {
        match self.total_searches {
            0 => 0.0,
            n => self.total_search_time_us as f64 / n as f64,
        }
    }
    /// Records one completed search and refreshes the running averages.
    pub fn update_search(&mut self, used_cache: bool, match_length: usize, search_time_us: u64) {
        self.total_searches += 1;
        if used_cache {
            self.cache_hits += 1;
        } else {
            self.suffix_array_lookups += 1;
        }
        self.bytes_matched += match_length as u64;
        self.total_search_time_us += search_time_us;
        // Recompute the running mean from the cumulative totals.
        self.avg_match_length = self.bytes_matched as f64 / self.total_searches as f64;
    }
}
/// Compression dictionary combining a suffix array with a DFA transition cache
/// for two-level longest-match queries against a fixed training corpus.
#[derive(Debug, Clone)]
pub struct SuffixArrayDictionary {
// Sorted suffix index over `dictionary_text`; shared with the matcher.
suffix_array: Arc<SuffixArray>,
// Cached DFA states accelerating the first characters of each search.
dfa_cache: DfaCache,
// The (possibly subsampled) training corpus the suffix array indexes.
dictionary_text: Arc<Vec<u8>>,
// Secondary matcher backing `find_all_matches`.
matcher: PatternMatcher,
// Kept alive for pool-backed allocations; None when pooling is disabled.
memory_pool: Option<Arc<SecureMemoryPool>>,
// Construction-time configuration (used by queries and validation).
config: SuffixArrayDictionaryConfig,
// Mutable search statistics updated by `find_longest_match`.
stats: MatchStats,
}
impl SuffixArrayDictionary {
/// Builds a dictionary from `training_data`: optionally subsamples the corpus,
/// constructs the suffix array, the BFS-built DFA cache, and the matcher.
pub fn new(training_data: &[u8], config: SuffixArrayDictionaryConfig) -> Result<Self> {
    // Optional secure pool; held only to keep pooled allocations alive.
    let memory_pool = if config.use_memory_pool {
        let pool_cfg = SecurePoolConfig::medium_secure().with_local_cache_size(32);
        Some(SecureMemoryPool::new(pool_cfg)?)
    } else {
        None
    };
    // Subsample large corpora when a ratio below 1.0 is requested.
    let corpus = if config.sample_ratio < 1.0 && training_data.len() > 10000 {
        Self::sample_training_data(training_data, config.sample_ratio)
    } else {
        training_data.to_vec()
    };
    let suffix_array = Arc::new(SuffixArray::with_config(&corpus, &config.suffix_array_config)?);
    let dictionary_text = Arc::new(corpus);
    // BFS-built DFA cache accelerates the hot prefix transitions.
    let dfa_cache = DfaCache::build_from_suffix_array(
        &suffix_array,
        &dictionary_text,
        &config.dfa_cache_config,
        config.min_frequency,
        config.max_bfs_depth,
    )?;
    let matcher = PatternMatcher::new(
        Arc::clone(&suffix_array),
        Arc::clone(&dictionary_text),
        config.min_pattern_length,
        config.max_pattern_length,
    );
    Ok(Self {
        suffix_array,
        dfa_cache,
        dictionary_text,
        matcher,
        memory_pool,
        config,
        stats: MatchStats::default(),
    })
}
/// Finds the longest dictionary match for `input[position..]`.
///
/// Runs the two-level matcher and returns a [`Match`] when the matched depth
/// reaches `min_pattern_length`; search statistics are updated either way.
pub fn find_longest_match(
&mut self,
input: &[u8],
position: usize,
max_length: usize,
) -> Result<Option<Match>> {
let start_time = std::time::Instant::now();
if position >= input.len() {
return Ok(None);
}
let search_slice = &input[position..];
// NOTE(review): this cap is computed but never applied — the match below
// can exceed `max_length`. TODO confirm whether truncation is intended.
let _max_search_len = max_length.min(search_slice.len()).min(self.config.max_pattern_length);
let match_status = self.da_match_max_length(search_slice);
let result = if match_status.depth >= self.config.min_pattern_length && !match_status.is_empty() {
// Dictionary offset of the first suffix sharing the matched prefix.
let dict_position = if match_status.lo < self.suffix_array.as_slice().len() {
self.suffix_array.as_slice()[match_status.lo]
} else {
// NOTE(review): falling back to offset 0 points at an arbitrary
// dictionary location; this branch should be unreachable — confirm.
0
};
let match_result = Match::new(
match_status.depth,
dict_position,
position,
// NOTE(review): `depth <= 10` looks like a placeholder heuristic for
// "served from cache" — verify against how Match::from_cache is used.
match_status.depth <= 10, );
self.stats.update_search(
match_result.from_cache,
match_result.length,
start_time.elapsed().as_micros() as u64
);
Some(match_result)
} else {
// Match too short or range empty: record a miss.
self.stats.update_search(false, 0, start_time.elapsed().as_micros() as u64);
None
};
Ok(result)
}
/// Returns up to `max_matches` dictionary matches for `pattern`, or an empty
/// vector when the pattern length is outside the configured window.
pub fn find_all_matches(&self, pattern: &[u8], max_matches: usize) -> Result<Vec<Match>> {
    let window = self.config.min_pattern_length..=self.config.max_pattern_length;
    if !window.contains(&pattern.len()) {
        return Ok(Vec::new());
    }
    self.matcher.find_all_matches(pattern, max_matches)
}
/// Size of the indexed dictionary text in bytes.
pub fn dictionary_size(&self) -> usize {
self.dictionary_text.len()
}
/// Borrow of the raw dictionary text.
pub fn dictionary_text(&self) -> &[u8] {
&self.dictionary_text
}
/// Number of states currently held by the DFA cache.
pub fn cache_states(&self) -> usize {
self.dfa_cache.state_count()
}
/// Approximate total memory footprint in bytes (suffix array + text + cache).
#[inline]
pub fn memory_usage(&self) -> usize {
let sa_memory = self.suffix_array.as_slice().len() * std::mem::size_of::<usize>();
let dict_memory = self.dictionary_text.len();
let cache_memory = self.dfa_cache.memory_usage();
sa_memory + dict_memory + cache_memory
}
/// Read-only view of the accumulated search statistics.
pub fn match_stats(&self) -> &MatchStats {
&self.stats
}
/// Clears all accumulated search statistics.
pub fn reset_stats(&mut self) {
self.stats = MatchStats::default();
}
/// The configuration this dictionary was built with.
pub fn config(&self) -> &SuffixArrayDictionaryConfig {
&self.config
}
/// Whether the dictionary was configured for external mode.
pub fn is_external_mode(&self) -> bool {
self.config.external_mode
}
/// Serializes the dictionary with bincode.
///
/// Only the corpus text, the DFA cache bytes, and the pattern-length bounds
/// are persisted; the suffix array is rebuilt during [`Self::deserialize`].
#[cfg(feature = "serde")]
pub fn serialize(&self) -> Result<Vec<u8>> {
use bincode;
let serializable = SerializableDictionary {
dictionary_text: (*self.dictionary_text).clone(),
dfa_cache_data: self.dfa_cache.serialize()?,
min_pattern_length: self.config.min_pattern_length,
max_pattern_length: self.config.max_pattern_length,
};
bincode::serialize(&serializable)
.map_err(|e| ZiporaError::invalid_data(&format!("Serialization failed: {}", e)))
}
/// Reconstructs a dictionary from bytes produced by [`Self::serialize`].
///
/// The suffix array is rebuilt from the stored text with default settings.
/// NOTE(review): every config field other than the pattern-length bounds is
/// reset to `Default`, so e.g. a non-default `min_frequency` does not survive
/// a round trip — confirm this is intended.
#[cfg(feature = "serde")]
pub fn deserialize(data: &[u8]) -> Result<Self> {
use bincode;
let serializable: SerializableDictionary = bincode::deserialize(data)
.map_err(|e| ZiporaError::invalid_data(&format!("Deserialization failed: {}", e)))?;
let dictionary_text = Arc::new(serializable.dictionary_text);
let suffix_array = Arc::new(SuffixArray::new(&dictionary_text)?);
let dfa_cache = DfaCache::deserialize(&serializable.dfa_cache_data)?;
let matcher = PatternMatcher::new(
Arc::clone(&suffix_array),
Arc::clone(&dictionary_text),
serializable.min_pattern_length,
serializable.max_pattern_length,
);
let config = SuffixArrayDictionaryConfig {
min_pattern_length: serializable.min_pattern_length,
max_pattern_length: serializable.max_pattern_length,
..Default::default()
};
Ok(Self {
suffix_array,
dfa_cache,
dictionary_text,
matcher,
// No pool on the deserialization path; matching works without one.
memory_pool: None, config,
stats: MatchStats::default(),
})
}
/// Uniformly subsamples `data` by stride so roughly `ratio` of the bytes
/// survive. Returns a full copy when `ratio >= 1.0` and an empty vector when
/// the requested sample rounds down to zero bytes.
fn sample_training_data(data: &[u8], ratio: f64) -> Vec<u8> {
    if ratio >= 1.0 {
        return data.to_vec();
    }
    let sample_size = (data.len() as f64 * ratio) as usize;
    // BUG FIX: guard against sample_size == 0, which previously caused a
    // divide-by-zero panic computing the stride (tiny ratio or small input).
    if sample_size == 0 {
        return Vec::new();
    }
    let step = (data.len() / sample_size).max(1);
    let mut sampled = Vec::with_capacity(sample_size);
    for &byte in data.iter().step_by(step) {
        sampled.push(byte);
        if sampled.len() >= sample_size {
            break;
        }
    }
    sampled
}
/// Prunes the DFA cache using the configured minimum-frequency threshold.
pub fn optimize_cache(&mut self) -> Result<()> {
self.dfa_cache.optimize(self.config.min_frequency)?;
Ok(())
}
/// Convenience forwarder for [`MatchStats::cache_hit_ratio`].
pub fn cache_hit_ratio(&self) -> f64 {
self.stats.cache_hit_ratio()
}
/// Serializes the dictionary and writes it to `path`.
#[cfg(feature = "serde")]
pub fn save_to_file<P: AsRef<std::path::Path>>(&self, path: P) -> Result<()> {
use std::fs::File;
use std::io::Write;
let serialized = self.serialize()?;
let mut file = File::create(path)
.map_err(|e| ZiporaError::io_error(&format!("Failed to create dictionary file: {}", e)))?;
file.write_all(&serialized)
.map_err(|e| ZiporaError::io_error(&format!("Failed to write dictionary file: {}", e)))?;
Ok(())
}
/// Reads `path` and reconstructs a dictionary via [`Self::deserialize`].
#[cfg(feature = "serde")]
pub fn load_from_file<P: AsRef<std::path::Path>>(path: P) -> Result<Self> {
use std::fs;
let data = fs::read(path)
.map_err(|e| ZiporaError::io_error(&format!("Failed to read dictionary file: {}", e)))?;
Self::deserialize(&data)
}
/// Alias for [`Self::memory_usage`].
pub fn size_in_bytes(&self) -> usize {
self.memory_usage()
}
/// Alias for [`Self::dictionary_text`]: the raw dictionary bytes.
#[inline]
pub fn data(&self) -> &[u8] {
&self.dictionary_text
}
/// Two-level longest-prefix match: walks the DFA cache while transitions are
/// available, then falls back to suffix-array search for the remainder.
///
/// Returns the suffix-array range `[lo, hi)` of dictionary suffixes sharing
/// the longest matched prefix of `input`, together with its length (`depth`).
pub fn da_match_max_length(&self, input: &[u8]) -> MatchStatus {
    if input.is_empty() {
        return MatchStatus::new(0, 0, 0);
    }
    let mut state = 0u32;
    let mut lo = 0usize;
    let mut hi = self.suffix_array.as_slice().len();
    let mut pos = 0usize;
    while pos < input.len() {
        // Path compression: a cached state may carry a run of bytes ("zstr")
        // shared by every suffix in [lo, hi); verify the run against the
        // input using the first suffix in the range as the reference.
        if let Some(zlen) = self.dfa_cache.get_zstr_length(state) {
            let zend = input.len().min(pos + zlen);
            if lo < self.suffix_array.as_slice().len() {
                let dict_start = self.suffix_array.as_slice()[lo];
                // BUG FIX: the previous index expression
                // `zptr[pos - (dict_start + pos - dict_start)]` always
                // evaluated to `zptr[0]`, so only the FIRST byte of the
                // compressed run was ever compared, and the loop bound
                // compared the absolute `pos` with the relative slice
                // length. Anchor the slice at the run start and index
                // relative to it; checked slicing also avoids a panic when
                // `dict_start + pos` exceeds the text.
                let zstart = pos;
                if let Some(zptr) = self.dictionary_text.get(dict_start + zstart..) {
                    while pos < zend && pos - zstart < zptr.len() {
                        if zptr[pos - zstart] != input[pos] {
                            return MatchStatus::new(lo, hi, pos);
                        }
                        pos += 1;
                    }
                }
            }
        }
        if pos >= input.len() {
            break;
        }
        // Level 1: O(1) DFA-cache transition on the next byte.
        if let Some(next_state) = self.dfa_cache.transition_state(state, input[pos]) {
            if let Some(dfa_state) = self.dfa_cache.get_state(next_state) {
                state = next_state;
                lo = dfa_state.suffix_low as usize;
                hi = dfa_state.suffix_hig as usize;
                pos += 1;
            } else {
                // Transition exists but the state is not materialized:
                // continue at level 2 on the raw suffix array.
                return self.sa_match_continuation(lo, hi, pos, input);
            }
        } else {
            // No cached transition: continue on the raw suffix array.
            return self.sa_match_continuation(lo, hi, pos, input);
        }
    }
    MatchStatus::new(lo, hi, pos)
}
/// Continues matching on the raw suffix array from `pos`, narrowing `[lo, hi)`
/// one character at a time until the range would empty or input is exhausted.
pub fn sa_match_continuation(&self, lo: usize, hi: usize, pos: usize, input: &[u8]) -> MatchStatus {
    let (mut cur_lo, mut cur_hi, mut cur_pos) = (lo, hi, pos);
    while cur_pos < input.len() && cur_lo < cur_hi {
        let (next_lo, next_hi) = self.sa_equal_range(cur_lo, cur_hi, cur_pos, input[cur_pos]);
        if next_lo >= next_hi {
            // Next character narrows the range to nothing: keep the last
            // non-empty range and report the depth reached.
            break;
        }
        cur_lo = next_lo;
        cur_hi = next_hi;
        cur_pos += 1;
    }
    MatchStatus::new(cur_lo, cur_hi, cur_pos)
}
/// Narrows `[lo, hi)` to the suffixes whose byte at offset `pos` equals `ch`.
/// Returns the empty range `(lo, lo)` when the input range is invalid.
pub fn sa_equal_range(&self, lo: usize, hi: usize, pos: usize, ch: u8) -> (usize, usize) {
    let sa_len = self.suffix_array.as_slice().len();
    if lo >= hi || lo >= sa_len {
        return (lo, lo);
    }
    self.sa_equal_range_binary_optimized(lo, hi, pos, ch)
}
/// Reference O(n) scan used for tiny ranges and as ground truth in tests:
/// finds the first and last rows in `[lo, hi)` whose suffix has `ch` at
/// offset `pos`, returning the half-open hit range or `(lo, lo)` on a miss.
fn sa_equal_range_linear(&self, lo: usize, hi: usize, pos: usize, ch: u8) -> (usize, usize) {
    let sa = self.suffix_array.as_slice();
    let mut bounds: Option<(usize, usize)> = None;
    for i in lo..hi.min(sa.len()) {
        if let Some(&suffix_idx) = sa.get(i) {
            // Suffixes shorter than `pos + 1` bytes cannot match.
            if suffix_idx + pos < self.dictionary_text.len()
                && self.dictionary_text[suffix_idx + pos] == ch
            {
                bounds = match bounds {
                    None => Some((i, i)),
                    Some((first, _)) => Some((first, i)),
                };
            }
        }
    }
    match bounds {
        Some((first, last)) => (first, last + 1),
        None => (lo, lo),
    }
}
/// Binary-search implementation of [`Self::sa_equal_range`]: narrows
/// `[lo, hi)` to the rows whose suffix has byte `ch` at offset `pos`.
///
/// Phases: (1) skip leading rows whose suffix is shorter than `pos + 1`,
/// (2) binary-search for ANY matching row, (3) lower-bound search,
/// (4) upper-bound search. On a miss the returned empty range is anchored
/// at the final probe point, not necessarily at `lo`.
fn sa_equal_range_binary_optimized(&self, lo: usize, hi: usize, pos: usize, ch: u8) -> (usize, usize) {
let mut search_lo = lo;
let search_hi = hi.min(self.suffix_array.as_slice().len());
if search_lo >= search_hi {
return (search_lo, search_lo);
}
// Phase 1: advance past suffixes too short to have a byte at `pos`.
// NOTE(review): this assumes such suffixes cluster at the front of the
// range; out-of-range entries further in are handled ad hoc by the
// phases below — confirm the suffix-array ordering guarantees this.
while search_lo < search_hi {
if let Some(&suffix_idx) = self.suffix_array.as_slice().get(search_lo) {
if suffix_idx + pos >= self.dictionary_text.len() {
search_lo += 1;
} else {
break;
}
} else {
break;
}
}
if search_lo >= search_hi {
return (search_lo, search_lo);
}
// Tiny ranges: the linear reference scan is cheaper and exact.
if search_hi - search_lo <= 3 {
return self.sa_equal_range_linear(search_lo, search_hi, pos, ch);
}
// Phase 2: standard binary search for any row whose byte equals `ch`.
let mut lo_search = search_lo;
let mut hi_search = search_hi;
let mideq = loop {
if lo_search >= hi_search {
// No row matches: empty range anchored at the final probe point.
return (lo_search, lo_search); }
let mid = lo_search + (hi_search - lo_search) / 2;
let suffix_idx = match self.suffix_array.as_slice().get(mid) {
Some(&idx) => idx,
None => return (lo_search, lo_search),
};
if suffix_idx + pos >= self.dictionary_text.len() {
// Out-of-range suffix mid-range: shrink toward a comparable row.
if mid > lo_search {
hi_search = mid;
} else {
lo_search = mid + 1;
}
continue;
}
let hit_char = self.dictionary_text[suffix_idx + pos];
if hit_char < ch {
lo_search = mid + 1;
} else if hit_char > ch {
hi_search = mid;
} else {
break mid; }
};
// Phase 3: lower bound — first matching row in [search_lo, mideq].
let mut lower_lo = search_lo;
let mut lower_hi = mideq + 1;
while lower_lo < lower_hi {
let mid = lower_lo + (lower_hi - lower_lo) / 2;
let suffix_idx = match self.suffix_array.as_slice().get(mid) {
Some(&idx) => idx,
None => break,
};
if suffix_idx + pos >= self.dictionary_text.len() {
lower_lo = mid + 1;
continue;
}
let hit_char = self.dictionary_text[suffix_idx + pos];
if hit_char < ch {
lower_lo = mid + 1; } else {
lower_hi = mid; }
}
// Phase 4: upper bound — one past the last matching row in [mideq, search_hi).
let mut upper_lo = mideq;
let mut upper_hi = search_hi;
while upper_lo < upper_hi {
let mid = upper_lo + (upper_hi - upper_lo) / 2;
let suffix_idx = match self.suffix_array.as_slice().get(mid) {
Some(&idx) => idx,
None => break,
};
if suffix_idx + pos >= self.dictionary_text.len() {
upper_lo = mid + 1;
continue;
}
let hit_char = self.dictionary_text[suffix_idx + pos];
if hit_char <= ch {
upper_lo = mid + 1; } else {
upper_hi = mid; }
}
(lower_lo, upper_lo)
}
/// Checks internal consistency: the suffix array covers the full text, the
/// DFA cache validates, and the pattern-length bounds are correctly ordered.
pub fn validate(&self) -> Result<()> {
if self.suffix_array.text_len() != self.dictionary_text.len() {
return Err(ZiporaError::invalid_data(
"Suffix array length mismatch with dictionary text"
));
}
self.dfa_cache.validate()?;
if self.config.min_pattern_length > self.config.max_pattern_length {
return Err(ZiporaError::invalid_data(
"Invalid pattern length configuration"
));
}
Ok(())
}
/// Snapshot of the DFA cache statistics.
pub fn cache_stats(&self) -> crate::compression::dict_zip::dfa_cache::CacheStats {
self.dfa_cache.stats().clone()
}
}
/// On-disk form of the dictionary: only the corpus text, the serialized DFA
/// cache, and the pattern-length bounds are persisted; the suffix array is
/// rebuilt on load (see `SuffixArrayDictionary::deserialize`).
#[cfg(feature = "serde")]
#[derive(Serialize, Deserialize)]
struct SerializableDictionary {
dictionary_text: Vec<u8>,
dfa_cache_data: Vec<u8>,
min_pattern_length: usize,
max_pattern_length: usize,
}
/// Thread-safe wrapper serializing all access to an inner dictionary through
/// a single `RwLock`.
pub struct ConcurrentSuffixArrayDictionary {
inner: std::sync::RwLock<SuffixArrayDictionary>,
}
impl ConcurrentSuffixArrayDictionary {
/// Builds the inner dictionary and wraps it in an `RwLock`.
pub fn new(training_data: &[u8], config: SuffixArrayDictionaryConfig) -> Result<Self> {
let dictionary = SuffixArrayDictionary::new(training_data, config)?;
Ok(Self {
inner: std::sync::RwLock::new(dictionary),
})
}
/// Finds the longest match at `position`, updating the shared statistics.
///
/// NOTE(review): this takes the WRITE lock because the inner call mutates
/// stats, so concurrent searches are fully serialized — the RwLock buys no
/// read parallelism on the hot path. Consider interior mutability (atomics)
/// for the statistics if search concurrency matters.
pub fn find_longest_match(
&self,
input: &[u8],
position: usize,
max_length: usize,
) -> Result<Option<Match>> {
let mut dictionary = self.inner.write()
.map_err(|_| ZiporaError::invalid_data("Failed to acquire write lock"))?;
dictionary.find_longest_match(input, position, max_length)
}
/// Returns a snapshot of the current match statistics.
// A poisoned lock (panic while held) is surfaced as an invalid-data error.
pub fn match_stats(&self) -> Result<MatchStats> {
let dictionary = self.inner.read()
.map_err(|_| ZiporaError::invalid_data("Failed to acquire read lock"))?;
Ok(dictionary.match_stats().clone())
}
}
// SAFETY: NOTE(review) — these manual impls assert that every field
// (Arc<SuffixArray>, DfaCache, Arc<Vec<u8>>, PatternMatcher,
// Option<Arc<SecureMemoryPool>>, the config, and MatchStats) may be moved to
// and shared across threads. If all field types are themselves Send + Sync,
// the compiler derives this automatically and these impls are redundant; if
// any field is NOT, these impls are unsound. TODO: verify the non-std field
// types and prefer deleting these in favor of the auto traits.
unsafe impl Send for SuffixArrayDictionary {}
unsafe impl Sync for SuffixArrayDictionary {}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_dictionary_creation() {
    // Building from a small corpus must index every byte and populate the cache.
    let corpus: &[u8] = b"The quick brown fox jumps over the lazy dog. The quick brown fox jumps.";
    let cfg = SuffixArrayDictionaryConfig {
        min_frequency: 2,
        ..Default::default()
    };
    let dict = SuffixArrayDictionary::new(corpus, cfg).unwrap();
    assert_eq!(dict.dictionary_size(), corpus.len());
    assert!(dict.cache_states() > 0);
    assert!(dict.memory_usage() > 0);
}
#[test]
fn test_pattern_matching() {
    // A prefix of the corpus must yield a match of at least the minimum length.
    let corpus: &[u8] = b"abcdefghijklmnopqrstuvwxyzabcdefgh";
    let cfg = SuffixArrayDictionaryConfig {
        min_pattern_length: 3,
        max_pattern_length: 10,
        ..Default::default()
    };
    let mut dict = SuffixArrayDictionary::new(corpus, cfg).unwrap();
    let found = dict
        .find_longest_match(b"abcdefg", 0, 10)
        .unwrap()
        .expect("expected a match for an alphabet prefix");
    assert!(found.length >= 3);
}
#[test]
fn test_match_statistics() {
    // Two searches must be reflected in the aggregate statistics.
    let corpus: &[u8] = b"aaabbbcccaaabbbccc";
    let mut dict =
        SuffixArrayDictionary::new(corpus, SuffixArrayDictionaryConfig::default()).unwrap();
    for start in [0usize, 1] {
        dict.find_longest_match(b"aaabbb", start, 10).unwrap();
    }
    let stats = dict.match_stats();
    assert_eq!(stats.total_searches, 2);
    let ratio = stats.cache_hit_ratio();
    assert!((0.0..=1.0).contains(&ratio));
}
#[test]
fn test_dictionary_validation() {
    // A freshly built dictionary must pass its own consistency checks.
    let dict = SuffixArrayDictionary::new(
        b"test data for validation",
        SuffixArrayDictionaryConfig::default(),
    )
    .unwrap();
    dict.validate().expect("fresh dictionary should validate");
}
#[test]
fn test_concurrent_dictionary() {
    // The lock-wrapped dictionary must track searches performed through it.
    // (The previous `result.is_some() || result.is_none()` assertion was a
    // tautology — always true — and has been removed.)
    let dict = ConcurrentSuffixArrayDictionary::new(
        b"concurrent test data",
        SuffixArrayDictionaryConfig::default(),
    )
    .unwrap();
    let _ = dict.find_longest_match(b"concurrent", 0, 10).unwrap();
    let stats = dict.match_stats().unwrap();
    assert_eq!(stats.total_searches, 1);
}
#[test]
fn test_external_mode_config() {
    // The external-mode flag must round-trip through construction.
    let cfg = SuffixArrayDictionaryConfig {
        external_mode: true,
        ..Default::default()
    };
    let dict = SuffixArrayDictionary::new(b"external mode test", cfg).unwrap();
    assert!(dict.is_external_mode());
}
#[test]
#[cfg(feature = "serde")]
fn test_serialization() {
    // Serialize, then deserialize; dictionary size must survive the round trip.
    let dict = SuffixArrayDictionary::new(
        b"serialization test data",
        SuffixArrayDictionaryConfig::default(),
    )
    .unwrap();
    let bytes = dict.serialize().unwrap();
    assert!(!bytes.is_empty());
    let restored = SuffixArrayDictionary::deserialize(&bytes).unwrap();
    assert_eq!(restored.dictionary_size(), dict.dictionary_size());
}
#[test]
fn test_sampling() {
    // With sampling requested, the dictionary is non-empty and never larger
    // than the input corpus.
    let corpus = vec![b'a'; 10000];
    let cfg = SuffixArrayDictionaryConfig {
        sample_ratio: 0.1,
        ..Default::default()
    };
    let dict = SuffixArrayDictionary::new(&corpus, cfg).unwrap();
    let size = dict.dictionary_size();
    assert!(size > 0);
    assert!(size <= corpus.len());
}
#[test]
fn test_cache_optimization() {
    // Optimization may prune cache states but must never add any.
    let mut dict = SuffixArrayDictionary::new(
        b"optimization test data with repeated patterns",
        SuffixArrayDictionaryConfig::default(),
    )
    .unwrap();
    let before = dict.cache_states();
    dict.optimize_cache().unwrap();
    assert!(dict.cache_states() <= before);
}
#[test]
fn test_debug_pattern_matching() {
    // Regression: "test" occurs twice in the corpus and must be found.
    // (This test previously dumped the suffix array to stdout; the debug
    // printing was removed, keeping only the meaningful assertion.)
    let corpus: &[u8] = b"test data test";
    let cfg = SuffixArrayDictionaryConfig {
        min_pattern_length: 3,
        max_pattern_length: 10,
        ..Default::default()
    };
    let mut dict = SuffixArrayDictionary::new(corpus, cfg).unwrap();
    let result = dict.find_longest_match(b"test", 0, 4).unwrap();
    assert!(result.is_some(), "Should find 'test' in 'test data test'");
}
#[test]
fn test_debug_binary_search() {
    // Previously this test exercised sa_equal_range without asserting anything.
    // Now verify the optimized path agrees with the linear reference scan.
    // On a miss the two may return differently-anchored EMPTY ranges, so only
    // require tuple equality when a match exists.
    let corpus: &[u8] = b"test data test";
    let cfg = SuffixArrayDictionaryConfig {
        min_pattern_length: 3,
        max_pattern_length: 10,
        ..Default::default()
    };
    let dict = SuffixArrayDictionary::new(corpus, cfg).unwrap();
    let binary = dict.sa_equal_range(0, 5, 0, b't');
    let linear = dict.sa_equal_range_linear(0, 5, 0, b't');
    let is_empty = |r: (usize, usize)| r.0 >= r.1;
    assert_eq!(is_empty(binary), is_empty(linear));
    if !is_empty(binary) {
        assert_eq!(binary, linear, "optimized range search must match linear scan");
    }
}
#[test]
fn test_comprehensive_two_level_algorithm() {
    // End-to-end check of the two-level (DFA cache + suffix array) matcher.
    // (Condensed: the committed per-pattern println diagnostics were removed;
    // the assertions are unchanged.)
    let training_data =
        b"comprehensive test for the two-level pattern matching algorithm with various patterns";
    let config = SuffixArrayDictionaryConfig {
        min_pattern_length: 3,
        max_pattern_length: 25,
        ..Default::default()
    };
    let mut dictionary = SuffixArrayDictionary::new(training_data, config).unwrap();
    let test_cases: Vec<(&[u8], bool)> = vec![
        (b"comprehensive", true),
        (b"test", true),
        (b"pattern", true),
        (b"xyz", false),
        (b"comp", true),
    ];
    for (pattern, should_match) in &test_cases {
        let result = dictionary.find_longest_match(pattern, 0, pattern.len()).unwrap();
        if *should_match {
            assert!(
                result.is_some(),
                "Failed to find pattern: {:?}",
                std::str::from_utf8(pattern)
            );
        }
        // Patterns absent from the dictionary may legitimately return None or
        // a shorter partial match, so the negative case is not asserted.
    }
}
#[test]
fn test_focused_binary_search_bug() {
    // Regression test: the optimized binary search must agree with the linear
    // reference scan for the historical failure case — searching for 's' at
    // depth 2 within the suffix-array subrange [69, 72).
    // (Condensed: the ~100 lines of step-by-step trace printing that only ran
    // on failure were removed; the final assertion is unchanged.)
    let training_data =
        b"comprehensive test for the two-level pattern matching algorithm with various patterns";
    let config = SuffixArrayDictionaryConfig {
        min_pattern_length: 3,
        max_pattern_length: 25,
        ..Default::default()
    };
    let dictionary = SuffixArrayDictionary::new(training_data, config).unwrap();
    let (lo, hi, pos, ch) = (69, 72, 2, b's');
    let linear_result = dictionary.sa_equal_range_linear(lo, hi, pos, ch);
    let binary_result = dictionary.sa_equal_range_binary_optimized(lo, hi, pos, ch);
    assert_eq!(
        linear_result, binary_result,
        "Binary search must return same result as linear search"
    );
}
#[test]
fn test_debug_binary_search_issue() {
    // Regression checks distilled from a long diagnostic session. The original
    // version printed extensive traces and asserted nothing; it is condensed
    // here to the properties it was investigating:
    //  1. the optimized binary range search agrees with the linear scan on a
    //     query with known matches ('t' at position 0: "test", "the", ...);
    //  2. the search is deterministic and unaffected by prior calls.
    let training_data =
        b"comprehensive test for the two-level pattern matching algorithm with various patterns";
    let config = SuffixArrayDictionaryConfig {
        min_pattern_length: 3,
        max_pattern_length: 25,
        ..Default::default()
    };
    let dictionary = SuffixArrayDictionary::new(training_data, config).unwrap();
    let sa_len = dictionary.suffix_array.as_slice().len();
    let linear = dictionary.sa_equal_range_linear(0, sa_len, 0, b't');
    let binary = dictionary.sa_equal_range_binary_optimized(0, sa_len, 0, b't');
    assert_eq!(
        linear, binary,
        "binary search disagrees with linear scan for 't' at pos 0"
    );
    // Running the linear scan first must not change the binary result.
    let binary_again = dictionary.sa_equal_range_binary_optimized(0, sa_len, 0, b't');
    assert_eq!(binary, binary_again, "binary search result must be deterministic");
    // Exercise the full two-level walk for "test" (behavior under scrutiny in
    // the original diagnostics; not asserted here beyond not panicking).
    let _status = dictionary.da_match_max_length(b"test");
}
}