pub mod blob_store;
pub mod builder;
pub mod compression_types;
pub mod compressor;
pub mod dfa_cache;
pub mod dictionary;
pub mod local_matcher;
pub mod matcher;
pub mod reference_encoding;
pub use blob_store::{
DictZipBlobStore, DictZipBlobStoreBuilder, DictZipBlobStoreStats, DictZipConfig,
EntropyAlgorithm
};
pub use builder::{BuildPhase, BuildProgress, BuildStrategy, DictionaryBuilder, DictionaryBuilderConfig, SampleSortPolicy};
pub use compression_types::{
CompressionType, Match, BitReader, BitWriter, encode_match, decode_match,
encode_matches, decode_matches, calculate_encoding_cost, calculate_encoding_overhead,
calculate_compression_efficiency, choose_best_compression_type, calculate_theoretical_compression_ratio
};
pub use compressor::{
CompressionStats, CompressionStrategy, CostAnalysis, PaZipCompressor, PaZipCompressorConfig
};
pub use dfa_cache::{CacheMatch, CacheStats, DfaCache, DfaCacheConfig};
pub use dictionary::{
ConcurrentSuffixArrayDictionary, MatchStats, SuffixArrayDictionary, SuffixArrayDictionaryConfig,
};
pub use local_matcher::{LocalMatch, LocalMatcher, LocalMatcherConfig, LocalMatcherStats};
pub use matcher::{Match as PatternMatch, MatcherConfig, MatcherStats, PatternMatcher, PatternMatcherBuilder};
pub use reference_encoding::{
DzType, DzEncodingMeta, ReferenceEncoder, get_back_ref_encoding_meta,
write_uint_bytes, write_var_size_t, compress_record_reference
};
use crate::error::{Result, ZiporaError};
/// Version string of this PA-Zip module.
pub const PA_ZIP_VERSION: &str = "1.0.0";
/// Default minimum pattern length.
///
/// NOTE(review): presumably measured in bytes and used as a fallback by the
/// configuration types — neither is visible in this file; confirm.
pub const DEFAULT_MIN_PATTERN_LENGTH: usize = 4;
/// Default maximum pattern length. Lies within the `<= 1024` limit enforced by
/// [`validate_parameters`].
pub const DEFAULT_MAX_PATTERN_LENGTH: usize = 256;
/// Default minimum frequency (presumably the occurrence count a pattern needs
/// before it is kept — TODO confirm against the dictionary builder).
pub const DEFAULT_MIN_FREQUENCY: u32 = 4;
/// Default BFS depth. Lies within the `<= 20` limit enforced by
/// [`validate_parameters`].
pub const DEFAULT_BFS_DEPTH: u32 = 6;
/// Checks a set of dictionary-building parameters for internal consistency.
///
/// The rules are evaluated in a fixed order and the first one that is violated
/// determines the error that is reported:
///
/// 1. `min_pattern_length` must be greater than zero.
/// 2. `max_pattern_length` must not be smaller than `min_pattern_length`.
/// 3. `max_pattern_length` must not exceed 1024.
/// 4. `min_frequency` must be greater than zero.
/// 5. `max_bfs_depth` must not exceed 20.
///
/// # Errors
///
/// Returns the value produced by `ZiporaError::invalid_data` carrying the
/// message of the first violated rule.
pub fn validate_parameters(
    min_pattern_length: usize,
    max_pattern_length: usize,
    min_frequency: u32,
    max_bfs_depth: u32,
) -> Result<()> {
    const PATTERN_LENGTH_CEILING: usize = 1024;
    const BFS_DEPTH_CEILING: u32 = 20;

    // Each entry pairs "is this rule violated?" with the message to report.
    // Order matters: the first violated rule wins.
    let rules = [
        (
            min_pattern_length == 0,
            "Minimum pattern length must be > 0",
        ),
        (
            max_pattern_length < min_pattern_length,
            "Maximum pattern length must be >= minimum pattern length",
        ),
        (
            max_pattern_length > PATTERN_LENGTH_CEILING,
            "Maximum pattern length must be <= 1024",
        ),
        (min_frequency == 0, "Minimum frequency must be > 0"),
        (max_bfs_depth > BFS_DEPTH_CEILING, "BFS depth must be <= 20"),
    ];

    match rules.iter().find(|(violated, _)| *violated) {
        Some((_, message)) => Err(ZiporaError::invalid_data(*message)),
        None => Ok(()),
    }
}
/// Suggests a dictionary size for an input of `input_size`, optionally capped
/// by a memory budget.
///
/// The candidate size is one sixth of the input, but never less than
/// `max(input_size / 20, 256)`, and never more than the input itself. When
/// `max_memory` is non-zero the candidate is additionally capped at half of
/// `max_memory`; a `max_memory` of `0` means "no memory limit".
///
/// Returns `0` for an empty input.
///
/// Note: the lower bound (`max(input_size / 20, 256)` clipped to the input
/// size) can never exceed the candidate computed here, so it does not need a
/// separate clamping step — the memory cap is the only thing that can pull the
/// result below it.
pub fn calculate_optimal_dict_size(input_size: usize, max_memory: usize) -> usize {
    if input_size == 0 {
        return 0;
    }

    // Floor of 256 keeps tiny inputs from producing a degenerate candidate
    // before the clip to `input_size` below.
    let floor = (input_size / 20).max(256);
    let candidate = (input_size / 6).max(floor).min(input_size);

    if max_memory == 0 {
        candidate
    } else {
        // Only half of the budget is granted to the dictionary.
        candidate.min(max_memory / 2)
    }
}
/// Heuristically estimates the compression ratio from three summary figures.
///
/// The estimate is
/// `(data_entropy / 8) * (1 - 0.7 * repetitiveness) * (0.7 + 0.2 * repetitiveness)
/// + 0.1 * dict_size_ratio`, clamped to the range `[0.1, 1.0]`.
///
/// NOTE(review): the division by 8 suggests `data_entropy` is expressed in
/// bits per byte and `repetitiveness` is a fraction in `[0, 1]`; neither is
/// enforced here — confirm with callers. A NaN input yields a NaN result.
pub fn estimate_compression_ratio(
    data_entropy: f64,
    repetitiveness: f64,
    dict_size_ratio: f64,
) -> f64 {
    const ENTROPY_SCALE: f64 = 8.0;
    const LOWEST_ESTIMATE: f64 = 0.1;
    const HIGHEST_ESTIMATE: f64 = 1.0;

    let entropy_fraction = data_entropy / ENTROPY_SCALE;
    let repetition_discount = 1.0 - repetitiveness * 0.7;
    let matcher_factor = 0.7 + repetitiveness * 0.2;
    let dictionary_cost = dict_size_ratio * 0.1;

    // Multiplication order is kept as (entropy * discount) * factor so the
    // floating-point result is bit-for-bit stable.
    let unclamped = entropy_fraction * repetition_discount * matcher_factor + dictionary_cost;
    unclamped.clamp(LOWEST_ESTIMATE, HIGHEST_ESTIMATE)
}
/// Namespace for ready-made [`SuffixArrayDictionaryConfig`] presets.
///
/// Every preset overrides the same six fields and takes all remaining fields
/// from `SuffixArrayDictionaryConfig::default()`.
pub struct QuickConfig;

impl QuickConfig {
    /// Preset named for text input: patterns of 4..=128, minimum frequency 3,
    /// BFS depth 6, sample ratio 0.8, dictionary limit 32 Mi.
    pub fn text_compression() -> SuffixArrayDictionaryConfig {
        SuffixArrayDictionaryConfig {
            min_pattern_length: 4,
            max_pattern_length: 128,
            min_frequency: 3,
            max_bfs_depth: 6,
            sample_ratio: 0.8,
            // 32 MiB, assuming the field is measured in bytes.
            max_dict_size: 32 * 1024 * 1024,
            ..Default::default()
        }
    }

    /// Preset named for binary input: patterns of 8..=64, minimum frequency 8,
    /// BFS depth 4, sample ratio 0.5, dictionary limit 16 Mi.
    pub fn binary_compression() -> SuffixArrayDictionaryConfig {
        SuffixArrayDictionaryConfig {
            min_pattern_length: 8,
            max_pattern_length: 64,
            min_frequency: 8,
            max_bfs_depth: 4,
            sample_ratio: 0.5,
            // 16 MiB, assuming the field is measured in bytes.
            max_dict_size: 16 * 1024 * 1024,
            ..Default::default()
        }
    }

    /// Preset named for log input: patterns of 10..=256, minimum frequency 2,
    /// BFS depth 8, sample ratio 0.3, dictionary limit 64 Mi (the largest of
    /// the four presets).
    pub fn log_compression() -> SuffixArrayDictionaryConfig {
        SuffixArrayDictionaryConfig {
            min_pattern_length: 10,
            max_pattern_length: 256,
            min_frequency: 2,
            max_bfs_depth: 8,
            sample_ratio: 0.3,
            // 64 MiB, assuming the field is measured in bytes.
            max_dict_size: 64 * 1024 * 1024,
            ..Default::default()
        }
    }

    /// Preset named for real-time use: patterns of 6..=32, minimum frequency
    /// 10, BFS depth 3, sample ratio 0.2, dictionary limit 8 Mi (the smallest
    /// of the four presets).
    pub fn realtime_compression() -> SuffixArrayDictionaryConfig {
        SuffixArrayDictionaryConfig {
            min_pattern_length: 6,
            max_pattern_length: 32,
            min_frequency: 10,
            max_bfs_depth: 3,
            sample_ratio: 0.2,
            // 8 MiB, assuming the field is measured in bytes.
            max_dict_size: 8 * 1024 * 1024,
            ..Default::default()
        }
    }
}
#[cfg(test)]
pub mod bench_utils {
use super::*;
use std::time::{Duration, Instant};
/// Measures the wall-clock time needed to build a dictionary from `data`.
///
/// The timed region covers both constructing the `DictionaryBuilder` from
/// `config` and the `build` call itself. The builder and the resulting
/// dictionary stay alive until after the elapsed time has been read, so
/// dropping them is not part of the measurement.
///
/// # Panics
///
/// Panics if `build` returns an error.
pub fn benchmark_build_time(data: &[u8], config: DictionaryBuilderConfig) -> Duration {
    let stopwatch = Instant::now();
    let dict_builder = DictionaryBuilder::with_config(config);
    // Bound to a named `_`-prefixed local (not `_`) so the value lives to the
    // end of the function.
    let _built = dict_builder.build(data).unwrap();
    stopwatch.elapsed()
}
/// Measures matching throughput of `dictionary` over `test_data`.
///
/// Performs `num_matches` calls to `find_longest_match`, probing position
/// `(i * 97) % test_data.len()` on the `i`-th call, and sums the `length` of
/// every match that is found. Calls that return an error or no match
/// contribute nothing. The result is the summed length divided by the elapsed
/// wall-clock time in seconds.
///
/// Returns `0.0` without touching the dictionary when `test_data` is empty or
/// `num_matches` is zero; previously an empty `test_data` panicked on the
/// modulo by zero.
///
/// NOTE(review): the literal `100` is passed as the third argument of
/// `find_longest_match` — presumably a maximum match length; confirm against
/// that method's signature. If the clock reports zero elapsed time the result
/// is not finite.
pub fn benchmark_matching_throughput(
    dictionary: &mut SuffixArrayDictionary,
    test_data: &[u8],
    num_matches: usize,
) -> f64 {
    // Guard the `% test_data.len()` below and avoid a 0/0 division when there
    // is nothing to measure.
    if test_data.is_empty() || num_matches == 0 {
        return 0.0;
    }

    let start = Instant::now();
    let mut total_bytes = 0;
    for i in 0..num_matches {
        // Stride of 97 spreads the probe positions across the buffer.
        let pos = (i * 97) % test_data.len();
        if let Ok(Some(m)) = dictionary.find_longest_match(test_data, pos, 100) {
            total_bytes += m.length;
        }
    }
    let elapsed = start.elapsed().as_secs_f64();
    total_bytes as f64 / elapsed
}
/// Produces `size` bytes of deterministic test data.
///
/// Index `i` receives a byte of a fixed text pattern (cycled by
/// `i % pattern.len()`) while `i / size < repetitiveness`, and the byte
/// `i % 256` otherwise. In effect the leading `repetitiveness` fraction of the
/// buffer is the repeated pattern and the remainder is a repeating 0..=255
/// ramp. A `repetitiveness` of NaN selects the ramp for every index.
///
/// The ramp byte is computed directly instead of being looked up in a
/// pre-built `size`-long scratch buffer, so only the returned vector is
/// allocated.
pub fn generate_test_data(size: usize, repetitiveness: f64) -> Vec<u8> {
    let pattern = b"test pattern with some repetitive content";
    (0..size)
        .map(|i| {
            if (i as f64 / size as f64) < repetitiveness {
                pattern[i % pattern.len()]
            } else {
                // `i % 256` always fits in a byte, so the cast is lossless.
                (i % 256) as u8
            }
        })
        .collect()
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Every rule of `validate_parameters` rejects its violation, and the
    /// boundary values on the accepted side pass.
    #[test]
    fn test_parameter_validation() {
        // Accepted: typical values, the crate defaults, and the boundaries.
        assert!(validate_parameters(4, 256, 4, 6).is_ok());
        assert!(validate_parameters(
            DEFAULT_MIN_PATTERN_LENGTH,
            DEFAULT_MAX_PATTERN_LENGTH,
            DEFAULT_MIN_FREQUENCY,
            DEFAULT_BFS_DEPTH,
        )
        .is_ok());
        assert!(validate_parameters(1, 1, 1, 0).is_ok());
        assert!(validate_parameters(4, 1024, 4, 20).is_ok());

        // Rejected: one assertion per rule.
        assert!(validate_parameters(0, 256, 4, 6).is_err());
        assert!(validate_parameters(10, 5, 4, 6).is_err());
        assert!(validate_parameters(4, 1025, 4, 6).is_err());
        assert!(validate_parameters(4, 256, 0, 6).is_err());
        assert!(validate_parameters(4, 256, 4, 21).is_err());
        assert!(validate_parameters(4, 256, 4, 25).is_err());
    }

    /// The suggested dictionary size respects the memory budget.
    #[test]
    fn test_optimal_dict_size_calculation() {
        let dict_size = calculate_optimal_dict_size(10000, 100000);
        assert!(dict_size >= 1024);
        assert!(dict_size <= 50000);

        // Large input: the result must not exceed half of the memory budget.
        let dict_size = calculate_optimal_dict_size(1000000, 100000);
        assert!(dict_size <= 50000);

        // Empty input never needs a dictionary.
        assert_eq!(calculate_optimal_dict_size(0, 100000), 0);
    }

    /// Low-entropy, repetitive data is estimated to compress better than
    /// high-entropy, non-repetitive data, and estimates stay in `[0.1, 1.0]`.
    #[test]
    fn test_compression_ratio_estimation() {
        let ratio = estimate_compression_ratio(3.0, 0.8, 0.1);
        assert!(ratio < 0.5);
        assert!((0.1..=1.0).contains(&ratio));

        let ratio = estimate_compression_ratio(7.5, 0.1, 0.2);
        assert!(ratio > 0.6);
        assert!((0.1..=1.0).contains(&ratio));
    }

    /// The presets carry their documented values and relate to each other as
    /// expected.
    #[test]
    fn test_quick_configs() {
        let text_config = QuickConfig::text_compression();
        assert_eq!(text_config.min_pattern_length, 4);
        assert_eq!(text_config.max_pattern_length, 128);

        let binary_config = QuickConfig::binary_compression();
        assert_eq!(binary_config.min_pattern_length, 8);
        assert!(binary_config.min_frequency > text_config.min_frequency);

        let log_config = QuickConfig::log_compression();
        assert!(log_config.max_dict_size > text_config.max_dict_size);
        assert!(log_config.sample_ratio < text_config.sample_ratio);

        let realtime_config = QuickConfig::realtime_compression();
        assert!(realtime_config.max_dict_size < text_config.max_dict_size);
        assert!(realtime_config.max_bfs_depth < text_config.max_bfs_depth);
    }

    /// The version string is non-empty and dotted.
    #[test]
    fn test_version_constant() {
        assert!(!PA_ZIP_VERSION.is_empty());
        assert!(PA_ZIP_VERSION.contains('.'));
    }

    /// Pins the default constants so accidental changes are noticed.
    #[test]
    fn test_default_constants() {
        assert_eq!(DEFAULT_MIN_PATTERN_LENGTH, 4);
        assert_eq!(DEFAULT_MAX_PATTERN_LENGTH, 256);
        assert_eq!(DEFAULT_MIN_FREQUENCY, 4);
        assert_eq!(DEFAULT_BFS_DEPTH, 6);
    }

    /// `generate_test_data` returns the requested number of bytes: the text
    /// pattern in the leading fraction, the `i % 256` ramp afterwards.
    #[test]
    fn test_bench_utils() {
        let test_data = bench_utils::generate_test_data(1000, 0.5);
        assert_eq!(test_data.len(), 1000);

        // First half comes from the text pattern, second half from the ramp.
        assert_eq!(test_data[0], b't');
        assert_eq!(test_data[500], (500 % 256) as u8);
        assert_eq!(test_data[999], (999 % 256) as u8);

        // The ramp guarantees more than one distinct byte value.
        let unique_bytes: std::collections::HashSet<u8> = test_data.iter().copied().collect();
        assert!(unique_bytes.len() > 1);
    }

    /// End-to-end: build a dictionary, look up a prefix of the training data,
    /// and check the bookkeeping afterwards.
    #[test]
    fn test_integration_workflow() {
        let training_data = b"the quick brown fox jumps over the lazy dog. the quick brown fox.";
        let config = DictionaryBuilderConfig {
            target_dict_size: 2048,
            max_dict_size: 4096,
            validate_result: true,
            ..Default::default()
        };
        let builder = DictionaryBuilder::with_config(config);
        let mut dictionary = builder.build(training_data).unwrap();

        let input = b"the quick brown";
        let match_info = dictionary
            .find_longest_match(input, 0, 50)
            .unwrap()
            .expect("a prefix of the training data should match the dictionary");
        assert!(match_info.length > 0);
        assert!(match_info.quality > 0.0);

        let stats = dictionary.match_stats();
        assert_eq!(stats.total_searches, 1);
        assert!(dictionary.validate().is_ok());
    }
}