use crate::error::{Result, ZiporaError};
use std::cmp;
/// How the element payload in `UintVector::data` is encoded.
#[derive(Debug, Clone, Copy, PartialEq)]
enum CompressionStrategy {
    /// Plain little-endian `u32`s, 4 bytes per element (no compression).
    Raw,
    /// Each element stored as `value - min_val`, packed `bit_width` bits apart.
    MinMaxBitPacked { min_val: u32, bit_width: u8 },
    /// Consecutive equal values encoded as `(value: u32, run_length: u32)` pairs.
    RunLength,
}
// Tuning constants for the adaptive compressor.
// Inputs shorter than this never amortize compression overhead.
const MIN_COMPRESSION_ELEMENTS: usize = 4;
// compressed/raw size ratio must beat this for compression to be chosen.
const MIN_COMPRESSION_RATIO: f64 = 0.8;
// Floor (in bytes) for any encoded buffer estimate/allocation.
const MIN_ALLOCATION_SIZE: usize = 32;
// 103/64 ≈ 1.609, a golden-ratio-like growth factor.
// NOTE(review): unused in this file — confirm whether referenced elsewhere.
const GOLDEN_RATIO_NUMERATOR: usize = 103;
const GOLDEN_RATIO_DENOMINATOR: usize = 64;
/// A compressed vector of `u32` values that adaptively picks between raw,
/// min-max bit-packed, and run-length encodings.
#[derive(Debug)]
pub struct UintVector {
    /// Encoding currently used for the bytes in `data`.
    strategy: CompressionStrategy,
    /// Encoded payload for the first `len - temp_values.len()` elements.
    data: Vec<u8>,
    /// Total logical element count (compressed region + pending buffer).
    len: usize,
    /// Size bookkeeping backing `compression_ratio()` / `stats()`.
    stats: CompressionStats,
    /// Recently pushed values awaiting the next recompression pass.
    temp_values: Vec<u32>,
}
/// Internal size statistics, refreshed on every (re)compression.
#[derive(Debug, Default)]
struct CompressionStats {
    /// Logical uncompressed size in bytes (4 per element).
    original_size: usize,
    /// Bytes currently used to store the elements.
    compressed_size: usize,
    /// Number of recompression passes recorded (see `recompress_all`).
    strategy_switches: usize,
}
impl UintVector {
pub fn new() -> Self {
Self {
strategy: CompressionStrategy::Raw,
data: Vec::new(),
len: 0,
stats: CompressionStats::default(),
temp_values: Vec::new(),
}
}
/// Creates an empty vector whose pending-value buffer can hold `capacity`
/// elements without reallocating.
pub fn with_capacity(capacity: usize) -> Self {
    let mut vector = Self::new();
    // Only the staging buffer is pre-sized; `data` grows on compression.
    vector.temp_values.reserve(capacity);
    vector
}
/// Builds a compressed vector from a slice in one pass, choosing the
/// best strategy for the whole input.
///
/// # Errors
/// Propagates any encoding failure from the chosen strategy.
pub fn build_from(values: &[u32]) -> Result<Self> {
    if values.is_empty() {
        return Ok(Self::new());
    }
    let strategy = Self::analyze_optimal_strategy(values);
    let mut vector = Self::new();
    vector.len = values.len();
    vector.strategy = strategy;
    vector.compress_with_strategy(values, strategy)?;
    // Record sizes for compression_ratio()/stats().
    vector.stats.original_size = values.len() * 4;
    vector.stats.compressed_size = vector.data.len();
    Ok(vector)
}
/// Appends a value. The value is buffered uncompressed and the whole
/// vector is re-compressed once every 64 pushes.
///
/// # Errors
/// Propagates any error from the periodic recompression pass.
pub fn push(&mut self, value: u32) -> Result<()> {
    self.temp_values.push(value);
    self.len += 1;
    // Recompress once 64 values have accumulated. The buffer is drained on
    // each pass, so it can never exceed 64 entries; the previous extra
    // `len() > 1000` guard was unreachable and has been removed.
    if self.temp_values.len() % 64 == 0 {
        self.recompress_all()?;
    } else {
        // O(1) path: the value stays in `temp_values` and `get` serves it
        // from there; only the statistics are refreshed.
        self.quick_append(value)?;
    }
    Ok(())
}
/// Returns the element at `index`, or `None` if out of bounds.
///
/// Values newer than the last recompression are served straight from the
/// pending buffer; older ones are decoded from the compressed region.
pub fn get(&self, index: usize) -> Option<u32> {
    if index >= self.len {
        return None;
    }
    let compressed_len = self.len - self.temp_values.len();
    if !self.temp_values.is_empty() && index >= compressed_len {
        return self.temp_values.get(index - compressed_len).copied();
    }
    match self.strategy {
        CompressionStrategy::Raw => self.get_raw(index),
        CompressionStrategy::MinMaxBitPacked { min_val, bit_width } => {
            self.get_min_max_bit_packed(index, min_val, bit_width)
        }
        CompressionStrategy::RunLength => self.get_run_length(index),
    }
}
/// Number of logical elements (compressed + buffered).
#[inline]
pub fn len(&self) -> usize {
    self.len
}
/// `true` if the vector holds no elements.
#[inline]
pub fn is_empty(&self) -> bool {
    self.len == 0
}
/// Ratio of compressed bytes to original bytes (lower is better).
/// Returns 1.0 for an empty vector to avoid division by zero.
pub fn compression_ratio(&self) -> f64 {
    match self.stats.original_size {
        0 => 1.0,
        original => self.stats.compressed_size as f64 / original as f64,
    }
}
/// Returns `(original_bytes, compressed_bytes, compression_ratio)`.
pub fn stats(&self) -> (usize, usize, f64) {
    (
        self.stats.original_size,
        self.stats.compressed_size,
        self.compression_ratio(),
    )
}
#[inline]
pub fn memory_usage(&self) -> usize {
std::mem::size_of::<Self>() +
self.data.capacity() +
self.temp_values.capacity() * 4 +
std::mem::size_of::<CompressionStats>()
}
/// Fraction of elements that belong to a run of two or more equal,
/// adjacent values. Returns 0.0 for inputs shorter than two elements.
fn calculate_run_ratio(values: &[u32]) -> f64 {
    if values.len() < 2 {
        return 0.0;
    }
    let mut repeated_elements = 0;
    let mut run_len = 1;
    // Walk adjacent pairs; whenever a run ends, credit its full length
    // (runs of length 1 contribute nothing).
    for pair in values.windows(2) {
        if pair[0] == pair[1] {
            run_len += 1;
        } else {
            if run_len > 1 {
                repeated_elements += run_len;
            }
            run_len = 1;
        }
    }
    // Account for a run that extends to the end of the slice.
    if run_len > 1 {
        repeated_elements += run_len;
    }
    repeated_elements as f64 / values.len() as f64
}
/// Picks the cheapest encoding for `values`.
///
/// Preference order: run-length (when most elements sit in runs and the
/// estimate beats the compression threshold), then min-max bit packing,
/// falling back to raw storage.
fn analyze_optimal_strategy(values: &[u32]) -> CompressionStrategy {
    // Tiny inputs never amortize the overhead; this also covers empty
    // input, so the previous separate `is_empty` check was redundant.
    if values.len() < MIN_COMPRESSION_ELEMENTS {
        return CompressionStrategy::Raw;
    }
    let raw_bytes = values.len() * 4;
    // More than half the elements in runs → try run-length first.
    if Self::calculate_run_ratio(values) > 0.5 {
        let rle_estimate = Self::estimate_run_length_size(values);
        if Self::should_compress(values.len(), raw_bytes, rle_estimate) {
            return CompressionStrategy::RunLength;
        }
    }
    let min_val = *values.iter().min().expect("non-empty input");
    let max_val = *values.iter().max().expect("non-empty input");
    // Bits needed for offsets 0..=range. A constant input (range == 0)
    // degenerates to 1 bit per element — the previous dedicated
    // `min_val == max_val` branch computed exactly this and was removed
    // as duplicated code.
    let range = max_val - min_val;
    let bit_width = if range == 0 {
        1
    } else {
        (32 - range.leading_zeros()) as u8
    };
    let packed_estimate = Self::compute_compressed_size(bit_width, values.len());
    if Self::should_compress(values.len(), raw_bytes, packed_estimate) {
        CompressionStrategy::MinMaxBitPacked { min_val, bit_width }
    } else {
        CompressionStrategy::Raw
    }
}
/// Decides whether a candidate encoding is worth using: enough elements
/// to amortize overhead, and a size ratio under the threshold.
fn should_compress(num_elements: usize, raw_bytes: usize, compressed_bytes: usize) -> bool {
    num_elements >= MIN_COMPRESSION_ELEMENTS
        && (compressed_bytes as f64 / raw_bytes as f64) < MIN_COMPRESSION_RATIO
}
/// Buffer size (in bytes) for `num_elements` values packed at
/// `bit_width` bits each, including safety padding.
fn compute_compressed_size(bit_width: u8, num_elements: usize) -> usize {
    // Payload bits, rounded up to whole bytes.
    let payload_bytes = (bit_width as usize * num_elements + 7) / 8;
    // Pad by 7 bytes so the 8-byte-wide fast bit readers/writers never
    // run off the end, then round up to a 16-byte boundary.
    let padded = payload_bytes + 7;
    let aligned = (padded + 15) & !15;
    aligned.max(MIN_ALLOCATION_SIZE)
}
/// Estimated encoded size of `values` under run-length encoding.
fn estimate_run_length_size(values: &[u32]) -> usize {
    if values.is_empty() {
        return 0;
    }
    // One run per adjacent-unequal boundary, plus the initial run.
    let runs = 1 + values.windows(2).filter(|pair| pair[0] != pair[1]).count();
    // Each run costs 8 bytes: (value: u32, length: u32).
    (runs * 8).max(MIN_ALLOCATION_SIZE)
}
fn compress_with_strategy(
&mut self,
values: &[u32],
strategy: CompressionStrategy,
) -> Result<()> {
self.data.clear();
match strategy {
CompressionStrategy::Raw => self.compress_raw(values),
CompressionStrategy::MinMaxBitPacked { min_val, bit_width } => {
self.compress_min_max_bit_packed(values, min_val, bit_width)
}
CompressionStrategy::RunLength => self.compress_run_length(values),
}
}
/// Stores every value as 4 little-endian bytes, uncompressed.
fn compress_raw(&mut self, values: &[u32]) -> Result<()> {
    self.data.reserve(values.len() * 4);
    self.data
        .extend(values.iter().flat_map(|value| value.to_le_bytes()));
    Ok(())
}
/// Encodes `values` as `bit_width`-bit offsets from `min_val`.
///
/// # Errors
/// Returns an error if `bit_width` is outside 1..=32 or any value is
/// smaller than `min_val` (the offset would underflow).
fn compress_min_max_bit_packed(
    &mut self,
    values: &[u32],
    min_val: u32,
    bit_width: u8,
) -> Result<()> {
    if bit_width == 0 || bit_width > 32 {
        return Err(ZiporaError::invalid_data("Invalid bit width"));
    }
    // Zero-filled, padded, 16-byte-aligned buffer. Zeroing matters: the
    // bit writers OR bits into place and never clear existing bits.
    let aligned_size = Self::compute_compressed_size(bit_width, values.len());
    self.data.resize(aligned_size, 0);
    let mut bit_offset = 0;
    for &value in values {
        if value < min_val {
            return Err(ZiporaError::invalid_data("Value below minimum"));
        }
        // Store the offset; write_bits_fast masks it to `bit_width` bits.
        let offset_value = value - min_val;
        self.write_bits_fast(offset_value, bit_offset, bit_width)?;
        bit_offset += bit_width as usize;
    }
    Ok(())
}
fn compress_run_length(&mut self, values: &[u32]) -> Result<()> {
if values.is_empty() {
return Ok(());
}
let mut current_value = values[0];
let mut run_length = 1u32;
for &value in values.iter().skip(1) {
if value == current_value && run_length < u32::MAX {
run_length += 1;
} else {
self.data.extend_from_slice(¤t_value.to_le_bytes());
self.data.extend_from_slice(&run_length.to_le_bytes());
current_value = value;
run_length = 1;
}
}
self.data.extend_from_slice(¤t_value.to_le_bytes());
self.data.extend_from_slice(&run_length.to_le_bytes());
Ok(())
}
/// Writes the low `bits` bits of `value` at absolute bit position
/// `bit_offset`, using one 8-byte read-modify-write when the buffer
/// allows it and a bit-by-bit fallback otherwise.
///
/// Assumes the destination bits are currently zero (the buffer is
/// zero-initialized by `compress_min_max_bit_packed`) — bits are OR-ed
/// in, never cleared.
fn write_bits_fast(&mut self, value: u32, bit_offset: usize, bits: u8) -> Result<()> {
    let byte_offset = bit_offset / 8;
    let bit_in_byte = bit_offset % 8;
    // `1u32 << 32` would overflow, so full width needs an explicit mask.
    let mask = if bits == 32 {
        u32::MAX
    } else {
        (1u32 << bits) - 1
    };
    let masked_value = value & mask;
    let bits_needed = bit_in_byte + bits as usize;
    let bytes_needed = (bits_needed + 7) / 8;
    if byte_offset + bytes_needed > self.data.len() {
        return Err(ZiporaError::invalid_data("Bit offset out of bounds"));
    }
    // Fast path: load 8 bytes, OR the shifted value in, store 8 bytes.
    // bit_in_byte <= 7 and bits <= 32, so at most 39 bits are touched
    // and the write always fits in a u64 window.
    if bytes_needed <= 8 && byte_offset + 8 <= self.data.len() {
        let mut bytes = [0u8; 8];
        let available = std::cmp::min(8, self.data.len() - byte_offset);
        bytes[..available].copy_from_slice(&self.data[byte_offset..byte_offset + available]);
        let current = u64::from_le_bytes(bytes);
        let shifted_value = (masked_value as u64) << bit_in_byte;
        let result = current | shifted_value;
        let result_bytes = result.to_le_bytes();
        let copy_len = std::cmp::min(8, self.data.len() - byte_offset);
        self.data[byte_offset..byte_offset + copy_len]
            .copy_from_slice(&result_bytes[..copy_len]);
    } else {
        // Too close to the end of the buffer for a full 8-byte window.
        self.write_bits_slow(masked_value, bit_offset, bits)?;
    }
    Ok(())
}
/// Bit-by-bit fallback writer: sets each of the low `bits` bits of
/// `value` at consecutive bit positions starting at `bit_offset`.
fn write_bits_slow(&mut self, value: u32, bit_offset: usize, bits: u8) -> Result<()> {
    for i in 0..bits as usize {
        let pos = bit_offset + i;
        let byte_idx = pos / 8;
        let bit_idx = pos % 8;
        if byte_idx >= self.data.len() {
            return Err(ZiporaError::invalid_data("Bit position out of bounds"));
        }
        // Only set bits are written; zeros rely on the pre-zeroed buffer.
        if (value >> i) & 1 != 0 {
            self.data[byte_idx] |= 1 << bit_idx;
        }
    }
    Ok(())
}
/// Decodes element `index` from the raw (4-bytes-per-element) layout.
fn get_raw(&self, index: usize) -> Option<u32> {
    let start = index * 4;
    let end = start + 4;
    if end > self.data.len() {
        return None;
    }
    let mut word = [0u8; 4];
    word.copy_from_slice(&self.data[start..end]);
    Some(u32::from_le_bytes(word))
}
/// Decodes element `index` from the bit-packed layout: reads the stored
/// `bit_width`-bit offset and adds `min_val` back.
fn get_min_max_bit_packed(&self, index: usize, min_val: u32, bit_width: u8) -> Option<u32> {
    self.read_bits_fast(index * bit_width as usize, bit_width)
        .ok()
        .map(|offset| min_val + offset)
}
/// Reads `bits` bits starting at absolute bit position `bit_offset`.
///
/// Loads up to 8 bytes (zero-padding past the end of `data`), shifts the
/// wanted bits down, and masks. Out-of-range offsets and widths are
/// delegated to `read_bits_slow`, which reports them as errors.
fn read_bits_fast(&self, bit_offset: usize, bits: u8) -> Result<u32> {
    let byte_offset = bit_offset / 8;
    let bit_in_byte = bit_offset % 8;
    if byte_offset >= self.data.len() || bits > 32 {
        // The slow path repeats this validation and returns the error.
        return self.read_bits_slow(bit_offset, bits);
    }
    let mut bytes = [0u8; 8];
    let available = std::cmp::min(8, self.data.len() - byte_offset);
    bytes[..available].copy_from_slice(&self.data[byte_offset..byte_offset + available]);
    let value = u64::from_le_bytes(bytes);
    // bit_in_byte <= 7 and bits <= 32, so all wanted bits fit in the u64.
    let shifted = value >> bit_in_byte;
    let mask = if bits == 32 {
        u32::MAX
    } else {
        (1u32 << bits) - 1
    };
    Ok((shifted as u32) & mask)
}
/// Byte-at-a-time fallback reader: accumulates `bits` bits starting at
/// `bit_offset` into the low bits of the result (LSB-first).
///
/// # Errors
/// Rejects reads that start past the end of `data` or request more than
/// 32 bits. A read that starts in range but runs off the end returns the
/// bits gathered so far (missing high bits are zero).
fn read_bits_slow(&self, bit_offset: usize, bits: u8) -> Result<u32> {
    let byte_offset = bit_offset / 8;
    let bit_in_byte = bit_offset % 8;
    if byte_offset >= self.data.len() || bits > 32 {
        return Err(ZiporaError::invalid_data("Invalid bit read parameters"));
    }
    let mut result = 0u32;
    let mut bits_read = 0;
    let mut current_byte_offset = byte_offset;
    while bits_read < bits && current_byte_offset < self.data.len() {
        let byte_value = self.data[current_byte_offset] as u32;
        // Only the first byte may start mid-byte; later bytes start at bit 0.
        let bits_available_in_byte = 8 - if bits_read == 0 { bit_in_byte } else { 0 };
        let bits_to_read = cmp::min(bits - bits_read, bits_available_in_byte as u8) as usize;
        let shift_amount = if bits_read == 0 { bit_in_byte } else { 0 };
        let mask = ((1u32 << bits_to_read) - 1) << shift_amount;
        let extracted_bits = (byte_value & mask) >> shift_amount;
        // Accumulate LSB-first: earlier bytes fill lower result bits.
        result |= extracted_bits << bits_read;
        bits_read += bits_to_read as u8;
        current_byte_offset += 1;
    }
    Ok(result)
}
/// Decodes element `index` from the run-length layout by scanning the
/// 8-byte (value, length) records until the run covering `index` is found.
fn get_run_length(&self, index: usize) -> Option<u32> {
    let mut covered = 0usize;
    // Each record is exactly 8 bytes; a trailing partial record (if any)
    // is ignored, matching the original `byte_offset + 8 <= len` stride.
    for record in self.data.chunks_exact(8) {
        let mut value_bytes = [0u8; 4];
        let mut length_bytes = [0u8; 4];
        value_bytes.copy_from_slice(&record[..4]);
        length_bytes.copy_from_slice(&record[4..]);
        let value = u32::from_le_bytes(value_bytes);
        let length = u32::from_le_bytes(length_bytes) as usize;
        if index < covered + length {
            return Some(value);
        }
        covered += length;
    }
    None
}
/// Bookkeeping after an O(1) append. The value itself is not stored here
/// (hence `_value`): `push` has already placed it in `temp_values` and
/// `get` serves it from there; only the size statistics are refreshed.
fn quick_append(&mut self, _value: u32) -> Result<()> {
    // Logical uncompressed size: 4 bytes per element.
    self.stats.original_size = self.len * 4;
    // Current footprint = compressed bytes + pending buffered values.
    self.stats.compressed_size = self.data.len() + self.temp_values.len() * 4;
    Ok(())
}
/// Decodes everything currently stored (compressed region + pending
/// buffer), re-analyzes the optimal strategy, and re-encodes in one pass.
///
/// # Errors
/// Propagates any encoding failure; on error the pending buffer is kept
/// so no values are lost.
fn recompress_all(&mut self) -> Result<()> {
    if self.temp_values.is_empty() {
        return Ok(());
    }
    let compressed_len = self.len - self.temp_values.len();
    let mut all_values = Vec::with_capacity(self.len);
    // Indices below `compressed_len` are always served from the
    // compressed region, so `get` performs exactly the per-strategy
    // decode the previous three duplicated match arms did.
    for i in 0..compressed_len {
        if let Some(val) = self.get(i) {
            all_values.push(val);
        }
    }
    all_values.extend_from_slice(&self.temp_values);
    let new_strategy = Self::analyze_optimal_strategy(&all_values);
    // Bug fix: only count a switch when the strategy actually changes;
    // previously every recompression pass bumped the counter.
    if new_strategy != self.strategy {
        self.stats.strategy_switches += 1;
    }
    self.strategy = new_strategy;
    self.compress_with_strategy(&all_values, new_strategy)?;
    self.temp_values.clear();
    self.stats.original_size = self.len * 4;
    self.stats.compressed_size = self.data.len();
    Ok(())
}
}
impl Default for UintVector {
    /// Equivalent to [`UintVector::new`]: an empty, raw-strategy vector.
    fn default() -> Self {
        Self::new()
    }
}
// Unit tests: construction paths (push vs. build_from), every compression
// strategy, edge cases (empty / single / max values), and memory accounting.
#[cfg(test)]
mod tests {
    use super::*;

    // Smoke test for push/get/len/is_empty on a fresh vector.
    #[test]
    fn test_basic_operations() {
        let mut vec = UintVector::new();
        assert_eq!(vec.len(), 0);
        assert!(vec.is_empty());
        vec.push(42).unwrap();
        assert_eq!(vec.len(), 1);
        assert_eq!(vec.get(0), Some(42));
        assert_eq!(vec.get(1), None);
    }

    // Narrow-range data should bit-pack well (offsets fit in ~10 bits).
    #[test]
    fn test_min_max_compression() {
        let values: Vec<u32> = (1000..2000).collect();
        let compressed = UintVector::build_from(&values).unwrap();
        for (i, &expected) in values.iter().enumerate() {
            assert_eq!(compressed.get(i), Some(expected), "Mismatch at index {}", i);
        }
        let ratio = compressed.compression_ratio();
        assert!(ratio < 0.4, "Compression ratio {} should be < 0.4", ratio);
        println!(
            "Min-max compression test: {:.1}% space savings",
            (1.0 - ratio) * 100.0
        );
    }

    // All-identical values should compress extremely well.
    #[test]
    fn test_small_range_compression() {
        let values = vec![42u32; 1000];
        let compressed = UintVector::build_from(&values).unwrap();
        for i in 0..1000 {
            assert_eq!(compressed.get(i), Some(42));
        }
        let ratio = compressed.compression_ratio();
        assert!(
            ratio < 0.1,
            "Compression ratio {} should be < 0.1 for identical values",
            ratio
        );
        println!(
            "Small range compression test: {:.1}% space savings",
            (1.0 - ratio) * 100.0
        );
    }

    // Repeating 0..999 pattern over 100k elements, as used in benchmarks.
    #[test]
    fn test_benchmark_pattern() {
        let size = 100000;
        let test_data: Vec<u32> = (0..size).map(|i| (i % 1000) as u32).collect();
        let compressed = UintVector::build_from(&test_data).unwrap();
        println!("Strategy: {:?}", compressed.strategy);
        println!("Data len: {} bytes", compressed.data.len());
        println!("Original size: {} bytes", size * 4);
        println!(
            "Compressed size: {} bytes",
            compressed.stats.compressed_size
        );
        let ratio = compressed.compression_ratio();
        println!("Compression ratio: {:.3}", ratio);
        for i in 0..100 {
            assert_eq!(compressed.get(i), Some((i % 1000) as u32));
        }
        assert!(ratio < 0.5, "Compression ratio {} should be < 0.5", ratio);
        println!(
            "Benchmark pattern compression: {:.1}% space savings",
            (1.0 - ratio) * 100.0
        );
        println!("Original size: {} bytes", size * 4);
        println!("Compressed size: {} bytes", compressed.memory_usage());
    }

    // Incremental pushes must read back identically to batch construction;
    // batch may compress at least as well (it sees all data up front).
    #[test]
    fn test_incremental_vs_batch() {
        let values: Vec<u32> = (500..600).collect();
        let mut incremental = UintVector::new();
        for &val in &values {
            incremental.push(val).unwrap();
        }
        let batch = UintVector::build_from(&values).unwrap();
        assert_eq!(incremental.len(), batch.len());
        for i in 0..values.len() {
            assert_eq!(incremental.get(i), batch.get(i));
        }
        assert!(batch.compression_ratio() <= incremental.compression_ratio());
    }

    // Values near u32::MAX must round-trip (too few elements to compress).
    #[test]
    fn test_large_values() {
        let values = vec![u32::MAX - 10, u32::MAX - 5, u32::MAX];
        let compressed = UintVector::build_from(&values).unwrap();
        for (i, &expected) in values.iter().enumerate() {
            assert_eq!(compressed.get(i), Some(expected));
        }
        let ratio = compressed.compression_ratio();
        assert!(
            ratio <= 1.0,
            "Should use raw storage or achieve compression for large values"
        );
        println!("Large values compression test: ratio = {:.3}", ratio);
    }

    // 10 runs of 100 identical values each — run-length territory.
    #[test]
    fn test_run_length_compression() {
        let mut values = Vec::new();
        for i in 0..10 {
            for _ in 0..100 {
                values.push(i);
            }
        }
        let compressed = UintVector::build_from(&values).unwrap();
        let mut expected_idx = 0;
        for i in 0..10 {
            for _ in 0..100 {
                assert_eq!(compressed.get(expected_idx), Some(i));
                expected_idx += 1;
            }
        }
        let ratio = compressed.compression_ratio();
        println!(
            "Run-length compression: {:.1}% space savings",
            (1.0 - ratio) * 100.0
        );
    }

    // Empty vector: no elements, ratio defined as 1.0.
    #[test]
    fn test_empty_vector() {
        let vec = UintVector::new();
        assert_eq!(vec.len(), 0);
        assert!(vec.is_empty());
        assert_eq!(vec.get(0), None);
        assert_eq!(vec.compression_ratio(), 1.0);
    }

    // A single value is below the compression threshold → raw storage.
    #[test]
    fn test_single_value() {
        let compressed = UintVector::build_from(&[42]).unwrap();
        assert_eq!(compressed.len(), 1);
        assert_eq!(compressed.get(0), Some(42));
        let ratio = compressed.compression_ratio();
        assert!(
            ratio <= 1.0,
            "Single value should use raw storage to avoid expansion"
        );
        println!("Single value compression test: ratio = {:.3}", ratio);
    }

    // 100 pushes crosses the 64-element recompression boundary once.
    #[test]
    fn test_incremental_push_debug() {
        let mut vec = UintVector::new();
        for i in 0..100 {
            let value = (i % 1000) as u32;
            vec.push(value).unwrap();
        }
        for i in 0..100 {
            assert_eq!(
                vec.get(i),
                Some((i % 1000) as u32),
                "Mismatch at index {}",
                i
            );
        }
        println!("Incremental push test passed for 100 values");
    }

    // The compressed container should occupy less memory than raw data.
    #[test]
    fn test_memory_usage() {
        let values: Vec<u32> = (0..1000).collect();
        let compressed = UintVector::build_from(&values).unwrap();
        let memory_usage = compressed.memory_usage();
        let original_size = values.len() * 4;
        println!(
            "Memory usage: {} bytes vs {} bytes original",
            memory_usage, original_size
        );
        assert!(memory_usage < original_size);
    }
}