use crate::error::{Result, ZiporaError};
use std::collections::HashMap;
use std::mem;
use std::str;
/// Configuration for `AdvancedStringVec`: compression behavior and initial
/// allocation sizing.
#[derive(Debug, Clone)]
pub struct AdvancedStringConfig {
/// Compression strategy: 0 = plain storage, 1 = exact deduplication,
/// 2 = dedup + full-substring reuse, 3 = dedup + substring/overlap reuse.
/// Any other value falls back to level 1 (see `AdvancedStringVec::push`).
pub compression_level: u8,
/// Initial capacity, in bytes, reserved for the shared byte arena.
pub initial_arena_capacity: usize,
/// Initial capacity, in entries, reserved for the index vector.
pub initial_index_capacity: usize,
/// NOTE(review): not consulted anywhere in this file — presumably gates
/// SIMD/BMI2 fast paths elsewhere; confirm before relying on it.
pub enable_hardware_acceleration: bool,
/// Minimum overlap length considered by compression levels 2 and 3.
pub min_overlap_length: usize,
/// NOTE(review): not consulted in this file (the overlap table uses
/// unsized `HashMap::new`); verify the intended consumer.
pub hash_table_size: usize,
}
impl Default for AdvancedStringConfig {
fn default() -> Self {
Self {
compression_level: 1, initial_arena_capacity: 4096,
initial_index_capacity: 256,
enable_hardware_acceleration: true,
min_overlap_length: 3,
hash_table_size: 1024,
}
}
}
impl AdvancedStringConfig {
    /// Preset favoring raw speed: cheap deduplication-only compression with
    /// generous pre-allocations and a low overlap threshold.
    pub fn performance_optimized() -> Self {
        Self {
            initial_arena_capacity: 64 * 1024,
            initial_index_capacity: 1024,
            min_overlap_length: 2,
            hash_table_size: 4096,
            ..Self::default()
        }
    }

    /// Preset favoring minimal memory: the most aggressive compression
    /// level with moderate pre-allocations.
    pub fn memory_optimized() -> Self {
        Self {
            compression_level: 3,
            initial_arena_capacity: 8 * 1024,
            initial_index_capacity: 512,
            hash_table_size: 8192,
            ..Self::default()
        }
    }

    /// Preset trading off speed and memory: substring reuse (level 2) with
    /// mid-sized pre-allocations.
    pub fn balanced() -> Self {
        Self {
            compression_level: 2,
            initial_arena_capacity: 32 * 1024,
            initial_index_capacity: 512,
            hash_table_size: 2048,
            ..Self::default()
        }
    }
}
/// Compact index record: a 40-bit arena offset and a 24-bit length packed
/// into a single `u64` (offset in the low bits, length in the high bits).
#[repr(transparent)]
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
struct BitPackedEntry(u64);

impl BitPackedEntry {
    /// Low bits used for the arena byte offset (up to 1 TiB - 1).
    const OFFSET_BITS: u32 = 40;
    /// High bits used for the string length (up to 16 MiB - 1).
    const LENGTH_BITS: u32 = 24;
    const OFFSET_MASK: u64 = (1u64 << Self::OFFSET_BITS) - 1;
    const LENGTH_MASK: u64 = (1u64 << Self::LENGTH_BITS) - 1;
    // Derive the usize limits from the u64 masks instead of shifting a
    // usize literal: `1usize << 40` fails const evaluation on 32-bit
    // targets. There the cast truncates to usize::MAX, which is still a
    // correct upper bound for any in-process offset.
    const MAX_OFFSET: usize = Self::OFFSET_MASK as usize;
    const MAX_LENGTH: usize = Self::LENGTH_MASK as usize;

    /// Packs `offset` and `length`, failing if either exceeds its bit field.
    #[inline(always)]
    fn new(offset: usize, length: usize) -> Result<Self> {
        if offset > Self::MAX_OFFSET {
            return Err(ZiporaError::out_of_memory(offset));
        }
        if length > Self::MAX_LENGTH {
            return Err(ZiporaError::out_of_memory(length));
        }
        let packed = (offset as u64) | ((length as u64) << Self::OFFSET_BITS);
        Ok(BitPackedEntry(packed))
    }

    /// Arena byte offset stored in the low 40 bits.
    #[inline(always)]
    fn offset(&self) -> usize {
        #[cfg(all(target_feature = "bmi2", target_arch = "x86_64"))]
        {
            self.offset_bmi2()
        }
        #[cfg(not(all(target_feature = "bmi2", target_arch = "x86_64")))]
        {
            (self.0 & Self::OFFSET_MASK) as usize
        }
    }

    /// String length stored in the high 24 bits.
    #[inline(always)]
    fn length(&self) -> usize {
        #[cfg(all(target_feature = "bmi2", target_arch = "x86_64"))]
        {
            self.length_bmi2()
        }
        #[cfg(not(all(target_feature = "bmi2", target_arch = "x86_64")))]
        {
            ((self.0 >> Self::OFFSET_BITS) & Self::LENGTH_MASK) as usize
        }
    }

    /// Offset extraction via the BMI2 BEXTR bit-field-extract instruction.
    #[cfg(all(target_feature = "bmi2", target_arch = "x86_64"))]
    #[inline(always)]
    fn offset_bmi2(&self) -> usize {
        // SAFETY: this function is compiled only when the `bmi2` target
        // feature is statically enabled, so BEXTR is available.
        unsafe { std::arch::x86_64::_bextr_u64(self.0, 0, Self::OFFSET_BITS) as usize }
    }

    /// Length extraction via the BMI2 BEXTR bit-field-extract instruction.
    #[cfg(all(target_feature = "bmi2", target_arch = "x86_64"))]
    #[inline(always)]
    fn length_bmi2(&self) -> usize {
        // SAFETY: this function is compiled only when the `bmi2` target
        // feature is statically enabled, so BEXTR is available.
        unsafe {
            std::arch::x86_64::_bextr_u64(self.0, Self::OFFSET_BITS, Self::LENGTH_BITS) as usize
        }
    }

    /// One past the last arena byte referenced by this entry.
    #[inline(always)]
    fn end_offset(&self) -> usize {
        self.offset() + self.length()
    }
}
/// Running statistics about stored strings and compression effectiveness.
#[derive(Debug, Default, Clone)]
pub struct CompressionStats {
/// Total number of `push` calls, including deduplicated ones.
pub total_strings: usize,
/// Number of index entries actually created (deduplicated pushes reuse
/// an existing entry and do not count here).
pub unique_strings: usize,
/// Sum of the byte lengths of every pushed string.
pub total_bytes_stored: usize,
/// Bytes currently occupied in the shared arena.
pub arena_bytes_used: usize,
/// `arena_bytes_used / total_bytes_stored`; lower means better compression.
pub compression_ratio: f64,
/// Bytes saved by returning an existing index for duplicate pushes.
pub deduplication_savings: usize,
/// The compression level this container was configured with.
pub level_used: u8,
}
/// Prefix index mapping short string prefixes to the entry indices that
/// start with them; used to find overlap/reuse candidates quickly.
#[derive(Debug)]
struct OverlapHashTable {
    /// 3-byte prefix -> indices of strings starting with that prefix.
    prefix3_map: HashMap<[u8; 3], Vec<usize>>,
    /// 4-byte prefix -> indices of strings starting with that prefix.
    prefix4_map: HashMap<[u8; 4], Vec<usize>>,
    /// Indices in registration order (not consulted by lookups).
    insertion_order: Vec<usize>,
}

impl OverlapHashTable {
    /// Creates an empty table.
    fn new() -> Self {
        Self {
            prefix3_map: HashMap::new(),
            prefix4_map: HashMap::new(),
            insertion_order: Vec::new(),
        }
    }

    /// Removes all registered strings.
    fn clear(&mut self) {
        self.prefix3_map.clear();
        self.prefix4_map.clear();
        self.insertion_order.clear();
    }

    /// Registers the string at `index` under its 3- and 4-byte prefixes.
    /// Strings shorter than 3 bytes are only recorded in insertion order.
    fn add_string(&mut self, index: usize, bytes: &[u8]) {
        self.insertion_order.push(index);
        if bytes.len() >= 3 {
            let prefix3 = [bytes[0], bytes[1], bytes[2]];
            self.prefix3_map.entry(prefix3).or_default().push(index);
        }
        if bytes.len() >= 4 {
            let prefix4 = [bytes[0], bytes[1], bytes[2], bytes[3]];
            self.prefix4_map.entry(prefix4).or_default().push(index);
        }
    }

    /// Returns candidate indices sharing a prefix with `bytes`. The more
    /// selective 4-byte prefix is preferred; the 3-byte prefix is used only
    /// when the 4-byte lookup found nothing.
    fn find_overlaps(&self, bytes: &[u8]) -> Vec<usize> {
        let mut candidates = Vec::new();
        if bytes.len() >= 4 {
            let prefix4 = [bytes[0], bytes[1], bytes[2], bytes[3]];
            if let Some(indices) = self.prefix4_map.get(&prefix4) {
                candidates.extend_from_slice(indices);
            }
        }
        if candidates.is_empty() && bytes.len() >= 3 {
            let prefix3 = [bytes[0], bytes[1], bytes[2]];
            if let Some(indices) = self.prefix3_map.get(&prefix3) {
                candidates.extend_from_slice(indices);
            }
        }
        candidates
    }
}
/// String container that stores all bytes in one shared arena and indexes
/// them with bit-packed (offset, length) entries, optionally deduplicating
/// identical strings and reusing overlapping byte ranges.
pub struct AdvancedStringVec {
// Shared byte storage; entries reference ranges inside it.
arena: Vec<u8>,
// Bit-packed (offset, length) records, one per stored string.
entries: Vec<BitPackedEntry>,
// Active configuration (compression level, overlap threshold, ...).
config: AdvancedStringConfig,
// Running compression/deduplication statistics.
stats: CompressionStats,
// Prefix index used to locate overlap candidates at levels 2/3.
overlap_table: OverlapHashTable,
// FNV-1a hash of a stored string -> its entry index, for deduplication.
dedup_map: HashMap<u64, usize>,
}
impl AdvancedStringVec {
pub fn new() -> Self {
Self::with_config(AdvancedStringConfig::default())
}
pub fn with_config(config: AdvancedStringConfig) -> Self {
let mut vec = Self {
arena: Vec::with_capacity(config.initial_arena_capacity),
entries: Vec::with_capacity(config.initial_index_capacity),
config,
stats: CompressionStats::default(),
overlap_table: OverlapHashTable::new(),
dedup_map: HashMap::new(),
};
vec.stats.level_used = vec.config.compression_level;
vec
}
pub fn with_capacity(capacity: usize) -> Self {
let mut config = AdvancedStringConfig::default();
config.initial_index_capacity = capacity;
config.initial_arena_capacity = capacity * 16; Self::with_config(config)
}
pub fn push(&mut self, s: &str) -> Result<usize> {
let s_bytes = s.as_bytes();
self.stats.total_strings += 1;
self.stats.total_bytes_stored += s_bytes.len();
match self.config.compression_level {
0 => self.push_level0(s_bytes),
1 => self.push_level1(s_bytes),
2 => self.push_level2(s_bytes),
3 => self.push_level3(s_bytes),
_ => self.push_level1(s_bytes), }
}
fn push_level0(&mut self, s_bytes: &[u8]) -> Result<usize> {
let offset = self.arena.len();
let length = s_bytes.len();
self.arena.extend_from_slice(s_bytes);
let entry = BitPackedEntry::new(offset, length)?;
let index = self.entries.len();
self.entries.push(entry);
self.update_stats();
Ok(index)
}
fn push_level1(&mut self, s_bytes: &[u8]) -> Result<usize> {
let hash = self.simple_hash(s_bytes);
if let Some(&existing_index) = self.dedup_map.get(&hash) {
if let Some(existing_str) = self.get_bytes(existing_index) {
if existing_str == s_bytes {
self.stats.deduplication_savings += s_bytes.len();
self.update_stats();
return Ok(existing_index);
}
}
}
let offset = self.arena.len();
let length = s_bytes.len();
self.arena.extend_from_slice(s_bytes);
let entry = BitPackedEntry::new(offset, length)?;
let index = self.entries.len();
self.entries.push(entry);
self.dedup_map.insert(hash, index);
self.update_stats();
Ok(index)
}
fn push_level2(&mut self, s_bytes: &[u8]) -> Result<usize> {
if let Ok(index) = self.try_deduplication(s_bytes) {
return Ok(index);
}
if s_bytes.len() >= self.config.min_overlap_length {
if let Some(overlap_result) = self.find_non_overlapping_match(s_bytes) {
let (existing_offset, _existing_len) = overlap_result;
let entry = BitPackedEntry::new(existing_offset, s_bytes.len())?;
let index = self.entries.len();
self.entries.push(entry);
self.overlap_table.add_string(index, s_bytes);
self.update_stats();
return Ok(index);
}
}
self.store_new_string(s_bytes)
}
fn push_level3(&mut self, s_bytes: &[u8]) -> Result<usize> {
if let Ok(index) = self.try_deduplication(s_bytes) {
return Ok(index);
}
if s_bytes.len() >= self.config.min_overlap_length {
if let Some(overlap_result) = self.find_overlapping_match(s_bytes) {
let (existing_offset, _overlap_len) = overlap_result;
let entry = BitPackedEntry::new(existing_offset, s_bytes.len())?;
let index = self.entries.len();
self.entries.push(entry);
self.overlap_table.add_string(index, s_bytes);
self.update_stats();
return Ok(index);
}
}
self.store_new_string(s_bytes)
}
fn try_deduplication(&mut self, s_bytes: &[u8]) -> Result<usize> {
let hash = self.simple_hash(s_bytes);
if let Some(&existing_index) = self.dedup_map.get(&hash) {
if let Some(existing_str) = self.get_bytes(existing_index) {
if existing_str == s_bytes {
self.stats.deduplication_savings += s_bytes.len();
self.update_stats();
return Ok(existing_index);
}
}
}
Err(ZiporaError::invalid_data("No deduplication match found".to_string()))
}
fn find_non_overlapping_match(&self, s_bytes: &[u8]) -> Option<(usize, usize)> {
let candidates = self.overlap_table.find_overlaps(s_bytes);
for &candidate_idx in &candidates {
if let Some(candidate_bytes) = self.get_bytes(candidate_idx) {
if let Some(pos) = self.find_substring(candidate_bytes, s_bytes) {
let candidate_entry = self.entries[candidate_idx];
let match_offset = candidate_entry.offset() + pos;
return Some((match_offset, s_bytes.len()));
}
}
}
None
}
fn find_overlapping_match(&self, s_bytes: &[u8]) -> Option<(usize, usize)> {
let candidates = self.overlap_table.find_overlaps(s_bytes);
for &candidate_idx in &candidates {
if let Some(candidate_bytes) = self.get_bytes(candidate_idx) {
if let Some(overlap_info) = self.find_best_overlap(candidate_bytes, s_bytes) {
let candidate_entry = self.entries[candidate_idx];
let match_offset = candidate_entry.offset() + overlap_info.0;
return Some((match_offset, overlap_info.1));
}
}
}
None
}
fn store_new_string(&mut self, s_bytes: &[u8]) -> Result<usize> {
let offset = self.arena.len();
let length = s_bytes.len();
self.arena.extend_from_slice(s_bytes);
let entry = BitPackedEntry::new(offset, length)?;
let index = self.entries.len();
self.entries.push(entry);
self.overlap_table.add_string(index, s_bytes);
let hash = self.simple_hash(s_bytes);
self.dedup_map.insert(hash, index);
self.update_stats();
Ok(index)
}
pub fn get(&self, index: usize) -> Option<&str> {
if index >= self.entries.len() {
return None;
}
let entry = self.entries[index];
let offset = entry.offset();
let length = entry.length();
if offset + length <= self.arena.len() {
let slice = &self.arena[offset..offset + length];
str::from_utf8(slice).ok()
} else {
None
}
}
pub fn get_bytes(&self, index: usize) -> Option<&[u8]> {
if index >= self.entries.len() {
return None;
}
let entry = self.entries[index];
let offset = entry.offset();
let length = entry.length();
if offset + length <= self.arena.len() {
Some(&self.arena[offset..offset + length])
} else {
None
}
}
#[inline]
pub fn len(&self) -> usize {
self.entries.len()
}
#[inline]
pub fn is_empty(&self) -> bool {
self.entries.is_empty()
}
pub fn stats(&self) -> &CompressionStats {
&self.stats
}
pub fn memory_info(&self) -> (usize, usize, f64) {
let arena_bytes = self.arena.len();
let entries_bytes = self.entries.len() * mem::size_of::<BitPackedEntry>();
let overhead_bytes = mem::size_of::<Self>();
let total_bytes = arena_bytes + entries_bytes + overhead_bytes;
let vec_string_overhead = self.stats.total_strings * mem::size_of::<String>();
let vec_string_heap = self.stats.total_bytes_stored;
let vec_string_alloc_overhead = self.stats.total_strings * 16; let vec_struct_overhead = mem::size_of::<Vec<String>>();
let vec_string_bytes = vec_string_overhead + vec_string_heap + vec_string_alloc_overhead + vec_struct_overhead;
let ratio = if vec_string_bytes > 0 {
total_bytes as f64 / vec_string_bytes as f64
} else {
1.0
};
(total_bytes, vec_string_bytes, ratio)
}
fn simple_hash(&self, bytes: &[u8]) -> u64 {
let mut hash = 0xcbf29ce484222325u64;
for &byte in bytes {
hash ^= byte as u64;
hash = hash.wrapping_mul(0x100000001b3);
}
hash
}
fn find_substring(&self, haystack: &[u8], needle: &[u8]) -> Option<usize> {
if needle.len() > haystack.len() {
return None;
}
for i in 0..=(haystack.len() - needle.len()) {
if &haystack[i..i + needle.len()] == needle {
return Some(i);
}
}
None
}
fn find_best_overlap(&self, existing: &[u8], new: &[u8]) -> Option<(usize, usize)> {
let min_overlap = self.config.min_overlap_length;
if let Some(pos) = self.find_substring(existing, new) {
return Some((pos, new.len()));
}
for overlap_len in (min_overlap..existing.len().min(new.len())).rev() {
if existing[existing.len() - overlap_len..] == new[..overlap_len] {
return Some((existing.len() - overlap_len, new.len()));
}
}
None
}
fn update_stats(&mut self) {
self.stats.arena_bytes_used = self.arena.len();
self.stats.unique_strings = self.entries.len();
if self.stats.total_bytes_stored > 0 {
self.stats.compression_ratio =
self.stats.arena_bytes_used as f64 / self.stats.total_bytes_stored as f64;
}
}
}
impl Default for AdvancedStringVec {
fn default() -> Self {
Self::new()
}
}
impl Clone for AdvancedStringVec {
    /// Deep-copies the container. The overlap table is rebuilt from the
    /// cloned entries (it was previously reset to empty, which silently
    /// disabled level-2/3 overlap matching on clones).
    fn clone(&self) -> Self {
        let mut overlap_table = OverlapHashTable::new();
        for (index, entry) in self.entries.iter().enumerate() {
            // Skip entries that do not resolve to valid arena ranges.
            if let Some(bytes) = self.arena.get(entry.offset()..entry.end_offset()) {
                overlap_table.add_string(index, bytes);
            }
        }
        Self {
            arena: self.arena.clone(),
            entries: self.entries.clone(),
            config: self.config.clone(),
            stats: self.stats.clone(),
            overlap_table,
            dedup_map: self.dedup_map.clone(),
        }
    }
}
/// Borrowing iterator over the strings of an `AdvancedStringVec`, yielded
/// in index order.
pub struct AdvancedStringIter<'a> {
// Container being iterated.
vec: &'a AdvancedStringVec,
// Index of the next entry to yield.
current: usize,
}
impl<'a> Iterator for AdvancedStringIter<'a> {
    type Item = &'a str;

    /// Yields entries in index order. Note: the cursor advances even when
    /// an in-range entry fails to resolve, matching `get`'s semantics.
    fn next(&mut self) -> Option<Self::Item> {
        if self.current >= self.vec.len() {
            return None;
        }
        let item = self.vec.get(self.current);
        self.current += 1;
        item
    }
}
impl AdvancedStringVec {
    /// Returns a borrowing iterator over the stored strings in index order.
    pub fn iter(&self) -> AdvancedStringIter {
        AdvancedStringIter {
            vec: self,
            current: 0,
        }
    }
}
#[cfg(test)]
mod tests {
use super::*;
// Round-trips an offset/length pair through the packed u64 representation.
#[test]
fn test_bit_packed_entry() {
let entry = BitPackedEntry::new(0x123456789, 0x234567).unwrap();
assert_eq!(entry.offset(), 0x123456789);
assert_eq!(entry.length(), 0x234567);
assert_eq!(entry.end_offset(), 0x123456789 + 0x234567);
}
// The exact field maxima must pack; one past either maximum must fail.
#[test]
fn test_bit_packed_entry_limits() {
let max_entry = BitPackedEntry::new(BitPackedEntry::MAX_OFFSET, BitPackedEntry::MAX_LENGTH);
assert!(max_entry.is_ok());
let overflow_offset = BitPackedEntry::new(BitPackedEntry::MAX_OFFSET + 1, 0);
assert!(overflow_offset.is_err());
let overflow_length = BitPackedEntry::new(0, BitPackedEntry::MAX_LENGTH + 1);
assert!(overflow_length.is_err());
}
// Level 0 stores every push verbatim, so duplicates occupy separate entries.
#[test]
fn test_level0_compression() {
let mut vec = AdvancedStringVec::with_config(AdvancedStringConfig {
compression_level: 0,
..AdvancedStringConfig::default()
});
let idx1 = vec.push("hello").unwrap();
let idx2 = vec.push("world").unwrap();
let idx3 = vec.push("hello").unwrap();
assert_eq!(vec.get(idx1), Some("hello"));
assert_eq!(vec.get(idx2), Some("world"));
assert_eq!(vec.get(idx3), Some("hello"));
assert_eq!(vec.len(), 3); }
// Level 1 must return the original index for a duplicate push and record
// the bytes saved.
#[test]
fn test_level1_deduplication() {
let mut vec = AdvancedStringVec::with_config(AdvancedStringConfig {
compression_level: 1,
..AdvancedStringConfig::default()
});
let idx1 = vec.push("hello").unwrap();
let idx2 = vec.push("world").unwrap();
let idx3 = vec.push("hello").unwrap();
assert_eq!(vec.get(idx1), Some("hello"));
assert_eq!(vec.get(idx2), Some("world"));
assert_eq!(vec.get(idx3), Some("hello"));
assert_eq!(idx1, idx3); assert_eq!(vec.len(), 2); assert!(vec.stats().deduplication_savings > 0);
}
// The container should be meaningfully smaller than the Vec<String>
// estimate produced by memory_info (ratio under 0.8).
#[test]
fn test_memory_efficiency() {
let mut vec = AdvancedStringVec::with_capacity(1000);
for i in 0..1000 {
vec.push(&format!("test_string_{:04}", i)).unwrap();
}
let (our_bytes, vec_string_bytes, ratio) = vec.memory_info();
println!("Advanced string container memory test:");
println!(" Our size: {} bytes", our_bytes);
println!(" Vec<String> equivalent: {} bytes", vec_string_bytes);
println!(" Memory ratio: {:.3}", ratio);
println!(" Memory savings: {:.1}%", (1.0 - ratio) * 100.0);
assert!(our_bytes < vec_string_bytes);
assert!(ratio < 0.8); }
// Level 2 reuses full substrings ("hello" is a prefix of the stored
// "hello world") without corrupting any retrieval.
#[test]
fn test_level2_overlap_detection() {
let mut vec = AdvancedStringVec::with_config(AdvancedStringConfig {
compression_level: 2,
min_overlap_length: 3,
..AdvancedStringConfig::default()
});
vec.push("hello world").unwrap();
vec.push("world").unwrap(); vec.push("hello").unwrap();
assert_eq!(vec.get(0), Some("hello world"));
assert_eq!(vec.get(1), Some("world"));
assert_eq!(vec.get(2), Some("hello"));
}
// Level 3: "program" and "gram" are substrings of "programming" and must
// still read back exactly after aggressive overlap reuse.
#[test]
fn test_level3_aggressive_compression() {
let mut vec = AdvancedStringVec::with_config(AdvancedStringConfig {
compression_level: 3,
min_overlap_length: 2,
..AdvancedStringConfig::default()
});
vec.push("programming").unwrap();
vec.push("program").unwrap(); vec.push("gram").unwrap();
assert_eq!(vec.get(0), Some("programming"));
assert_eq!(vec.get(1), Some("program"));
assert_eq!(vec.get(2), Some("gram"));
}
// Iteration yields the stored strings in insertion (index) order.
#[test]
fn test_iterator() {
let mut vec = AdvancedStringVec::new();
vec.push("first").unwrap();
vec.push("second").unwrap();
vec.push("third").unwrap();
let collected: Vec<&str> = vec.iter().collect();
assert_eq!(collected, vec!["first", "second", "third"]);
}
// Smoke test: 10k distinct strings under the memory-optimized preset
// (level 3) all round-trip, and the stats stay populated.
#[test]
fn test_large_dataset() {
let mut vec = AdvancedStringVec::with_config(AdvancedStringConfig::memory_optimized());
for i in 0..10000 {
let s = format!("item_{:08}", i);
vec.push(&s).unwrap();
}
assert_eq!(vec.len(), 10000);
assert_eq!(vec.get(0), Some("item_00000000"));
assert_eq!(vec.get(5000), Some("item_00005000"));
assert_eq!(vec.get(9999), Some("item_00009999"));
let stats = vec.stats();
println!("Large dataset compression stats:");
println!(" Total strings: {}", stats.total_strings);
println!(" Unique strings: {}", stats.unique_strings);
println!(" Compression ratio: {:.3}", stats.compression_ratio);
println!(" Deduplication savings: {} bytes", stats.deduplication_savings);
}
// Pins the compression level chosen by each configuration preset.
#[test]
fn test_configuration_presets() {
let perf_config = AdvancedStringConfig::performance_optimized();
assert_eq!(perf_config.compression_level, 1);
assert!(perf_config.enable_hardware_acceleration);
let mem_config = AdvancedStringConfig::memory_optimized();
assert_eq!(mem_config.compression_level, 3);
let balanced_config = AdvancedStringConfig::balanced();
assert_eq!(balanced_config.compression_level, 2);
}
}