use alloc::collections::BTreeMap;
use alloc::string::{String, ToString};
use alloc::vec::Vec;
use core::fmt;
/// Four-byte magic placed at the start of a compressed header; the bytes are
/// the ASCII characters `L`, `C`, `D`, `C`.
pub const DICT_MAGIC: [u8; 4] = [0x4C, 0x43, 0x44, 0x43];
/// Dictionary size (32 KiB) used by `TrainingOptions::default()`.
pub const DEFAULT_DICT_SIZE: usize = 32 * 1024;
/// Lower clamp bound (4 KiB) applied by `TrainingOptions::with_size`.
pub const MIN_DICT_SIZE: usize = 4 * 1024;
/// Upper clamp bound (256 KiB) applied by `TrainingOptions::with_size`.
pub const MAX_DICT_SIZE: usize = 256 * 1024;
/// Default for `TrainingOptions::min_substring_len` and the floor enforced by
/// `TrainingOptions::min_len`.
pub const MIN_MATCH_LEN: usize = 4;
/// Largest match length; numerically equal to `u16::MAX`, the widest value the
/// `length: u16` field of `CompressOp::DictRef` can hold.
/// NOTE(review): not referenced elsewhere in this part of the file.
pub const MAX_MATCH_LEN: usize = 65535;
/// Tag byte written first by `CompressOp::encode` for a dictionary reference.
pub const OP_DICT_REF: u8 = 0x00;
/// Tag byte written first by `CompressOp::encode` for a literal run.
pub const OP_LITERAL: u8 = 0x01;
/// A compression dictionary together with the metadata describing which paths
/// it applies to and how it was trained.
#[derive(Debug, Clone)]
pub struct CompressionDict {
    /// Identifier of this dictionary.
    pub id: u64,
    /// Name given to the dictionary by its creator.
    pub name: String,
    /// Raw dictionary bytes.
    pub data: Vec<u8>,
    /// Path pattern evaluated by [`CompressionDict::matches_pattern`].
    pub pattern: String,
    /// Dataset label supplied by the caller.
    pub dataset: String,
    /// Creation time as supplied by the caller (unit/epoch not defined here).
    pub created: u64,
    /// Sample count recorded by `with_training_info`; `0` after `new`.
    pub sample_count: u32,
    /// Average ratio recorded by `with_training_info`; `0.0` after `new`.
    pub avg_ratio: f32,
}

impl CompressionDict {
    /// Builds a dictionary with zeroed training statistics.
    ///
    /// The string arguments are copied into owned `String`s; `data` is taken
    /// by value.
    pub fn new(
        id: u64,
        name: &str,
        data: Vec<u8>,
        pattern: &str,
        dataset: &str,
        created: u64,
    ) -> Self {
        Self {
            id,
            name: name.to_string(),
            data,
            pattern: pattern.to_string(),
            dataset: dataset.to_string(),
            created,
            sample_count: 0,
            avg_ratio: 0.0,
        }
    }

    /// Builder-style setter for the training statistics.
    pub fn with_training_info(mut self, sample_count: u32, avg_ratio: f32) -> Self {
        self.sample_count = sample_count;
        self.avg_ratio = avg_ratio;
        self
    }

    /// Number of bytes of dictionary data.
    pub fn size(&self) -> usize {
        self.data.len()
    }

    /// Reports whether `path` is covered by this dictionary's pattern.
    ///
    /// Supported pattern forms, checked in this order:
    /// * empty or `"*"` — matches every path;
    /// * `"*.ext"` — matches paths ending in `".ext"` (the dot is required);
    /// * `"prefix*"` — matches paths starting with `prefix`;
    /// * anything else — must equal `path` exactly.
    pub fn matches_pattern(&self, path: &str) -> bool {
        let pattern = self.pattern.as_str();
        if pattern.is_empty() || pattern == "*" {
            return true;
        }
        // Only the leading `*` is dropped so the suffix keeps its dot;
        // otherwise "*.json" would also accept names such as "notjson".
        if let Some(suffix) = pattern.strip_prefix('*').filter(|s| s.starts_with('.')) {
            return path.ends_with(suffix);
        }
        if let Some(prefix) = pattern.strip_suffix('*') {
            return path.starts_with(prefix);
        }
        path == pattern
    }
}
/// A candidate substring tracked with its occurrence count and an estimate of
/// the bytes saved by including it.
#[derive(Debug, Clone)]
pub struct SubstringEntry {
    /// The substring bytes.
    pub data: Vec<u8>,
    /// Number of recorded occurrences; starts at `1`.
    pub count: u32,
    /// Current savings estimate, refreshed by [`SubstringEntry::increment`].
    pub savings: u64,
}

impl SubstringEntry {
    /// Creates an entry for a first occurrence: `count == 1`, `savings == 0`.
    pub fn new(data: Vec<u8>) -> Self {
        Self {
            data,
            count: 1,
            savings: 0,
        }
    }

    /// Records one more occurrence and recomputes the savings estimate.
    ///
    /// For substrings longer than 4 bytes the estimate is
    /// `(count - 1) * (len - 4)`; shorter substrings leave `savings` untouched.
    /// NOTE(review): the constant 4 is presumably the per-reference overhead —
    /// confirm against the encoder.
    ///
    /// The counter and the product both saturate, so an entry already at
    /// `u32::MAX` neither panics (debug builds) nor wraps (release builds).
    pub fn increment(&mut self) {
        self.count = self.count.saturating_add(1);
        let len = self.data.len();
        if len > 4 {
            // `count` is at least 1 after the saturating add, so `- 1` cannot underflow.
            let repeats = u64::from(self.count) - 1;
            self.savings = repeats.saturating_mul((len - 4) as u64);
        }
    }

    /// Ranking score for this entry; currently just the savings estimate.
    pub fn score(&self) -> u64 {
        self.savings
    }
}
#[derive(Debug, Clone, Copy)]
pub struct CompressedHeader {
pub magic: [u8; 4],
pub dict_id: u64,
pub original_size: u32,
pub compressed_size: u32,
pub checksum: u32,
}
impl CompressedHeader {
pub const SIZE: usize = 24;
pub fn new(dict_id: u64, original_size: u32, compressed_size: u32, checksum: u32) -> Self {
Self {
magic: DICT_MAGIC,
dict_id,
original_size,
compressed_size,
checksum,
}
}
pub fn to_bytes(&self) -> [u8; 24] {
let mut buf = [0u8; 24];
buf[0..4].copy_from_slice(&self.magic);
buf[4..12].copy_from_slice(&self.dict_id.to_le_bytes());
buf[12..16].copy_from_slice(&self.original_size.to_le_bytes());
buf[16..20].copy_from_slice(&self.compressed_size.to_le_bytes());
buf[20..24].copy_from_slice(&self.checksum.to_le_bytes());
buf
}
pub fn from_bytes(data: &[u8]) -> Option<Self> {
if data.len() < 24 {
return None;
}
let mut magic = [0u8; 4];
magic.copy_from_slice(&data[0..4]);
if magic != DICT_MAGIC {
return None;
}
let dict_id = u64::from_le_bytes(data[4..12].try_into().ok()?);
let original_size = u32::from_le_bytes(data[12..16].try_into().ok()?);
let compressed_size = u32::from_le_bytes(data[16..20].try_into().ok()?);
let checksum = u32::from_le_bytes(data[20..24].try_into().ok()?);
Some(Self {
magic,
dict_id,
original_size,
compressed_size,
checksum,
})
}
pub fn is_dict_compressed(data: &[u8]) -> bool {
data.len() >= 4 && data[0..4] == DICT_MAGIC
}
}
/// One instruction of the compressed operation stream.
///
/// Wire format (integers little-endian):
/// * dictionary reference — `[OP_DICT_REF, offset: u16, length: u16]` (5 bytes);
/// * literal record — `[OP_LITERAL, len: u16, payload…]` (3 + `len` bytes).
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum CompressOp {
    /// Reference to `length` bytes at `offset`; how the reference is resolved
    /// is up to the decompressor and not defined in this block.
    DictRef {
        /// Start offset of the referenced range.
        offset: u16,
        /// Number of bytes the reference expands to.
        length: u16,
    },
    /// Bytes emitted verbatim.
    Literal {
        /// The literal payload.
        data: Vec<u8>,
    },
}

impl CompressOp {
    /// Largest payload one literal record can carry: its length field is a `u16`.
    const MAX_LITERAL_CHUNK: usize = u16::MAX as usize;

    /// Shorthand constructor for [`CompressOp::DictRef`].
    pub fn dict_ref(offset: u16, length: u16) -> Self {
        Self::DictRef { offset, length }
    }

    /// Shorthand constructor for [`CompressOp::Literal`].
    pub fn literal(data: Vec<u8>) -> Self {
        Self::Literal { data }
    }

    /// Number of literal records `encode` emits for a payload of `len` bytes.
    /// An empty payload still produces one (zero-length) record.
    fn literal_record_count(len: usize) -> usize {
        let full = len / Self::MAX_LITERAL_CHUNK;
        let partial = usize::from(len % Self::MAX_LITERAL_CHUNK != 0);
        (full + partial).max(1)
    }

    /// Exact number of bytes [`CompressOp::encode`] appends for this op.
    pub fn encoded_size(&self) -> usize {
        match self {
            Self::DictRef { .. } => 5,
            // Each record has a 3-byte header (tag + u16 length).
            Self::Literal { data } => 3 * Self::literal_record_count(data.len()) + data.len(),
        }
    }

    /// Number of bytes this op contributes to the decompressed output.
    pub fn output_size(&self) -> usize {
        match self {
            Self::DictRef { length, .. } => *length as usize,
            Self::Literal { data } => data.len(),
        }
    }

    /// Appends the wire encoding of this op to `buf`.
    ///
    /// A literal longer than `u16::MAX` bytes does not fit one record's length
    /// field, so it is written as several consecutive literal records. Decoding
    /// them in order reproduces the same bytes. Payloads of up to `u16::MAX`
    /// bytes, including empty ones, are written as a single record.
    pub fn encode(&self, buf: &mut Vec<u8>) {
        match self {
            Self::DictRef { offset, length } => {
                buf.push(OP_DICT_REF);
                buf.extend_from_slice(&offset.to_le_bytes());
                buf.extend_from_slice(&length.to_le_bytes());
            }
            Self::Literal { data } => {
                if data.is_empty() {
                    // `chunks` yields nothing for an empty slice; still emit
                    // the zero-length record so the op survives a round trip.
                    buf.push(OP_LITERAL);
                    buf.extend_from_slice(&0u16.to_le_bytes());
                    return;
                }
                for chunk in data.chunks(Self::MAX_LITERAL_CHUNK) {
                    buf.push(OP_LITERAL);
                    // Lossless: every chunk is at most u16::MAX bytes long.
                    buf.extend_from_slice(&(chunk.len() as u16).to_le_bytes());
                    buf.extend_from_slice(chunk);
                }
            }
        }
    }

    /// Decodes the op at the start of `data`.
    ///
    /// Returns the op and the number of bytes consumed, or `None` when `data`
    /// is empty, truncated, or starts with an unknown tag byte.
    pub fn decode(data: &[u8]) -> Option<(Self, usize)> {
        match *data.first()? {
            OP_DICT_REF => {
                let fields = data.get(1..5)?;
                let offset = u16::from_le_bytes([fields[0], fields[1]]);
                let length = u16::from_le_bytes([fields[2], fields[3]]);
                Some((Self::DictRef { offset, length }, 5))
            }
            OP_LITERAL => {
                let len_bytes = data.get(1..3)?;
                let length = usize::from(u16::from_le_bytes([len_bytes[0], len_bytes[1]]));
                let end = 3 + length;
                let payload = data.get(3..end)?;
                Some((
                    Self::Literal {
                        data: payload.to_vec(),
                    },
                    end,
                ))
            }
            _ => None,
        }
    }
}
/// Tunable parameters for dictionary training.
#[derive(Debug, Clone)]
pub struct TrainingOptions {
    /// Target dictionary size in bytes.
    pub dict_size: usize,
    /// Shortest substring length to consider.
    pub min_substring_len: usize,
    /// Longest substring length to consider.
    pub max_substring_len: usize,
    /// Minimum occurrence count for a substring to be considered.
    pub min_occurrences: u32,
}

impl Default for TrainingOptions {
    /// Defaults: `DEFAULT_DICT_SIZE`, substrings of `MIN_MATCH_LEN..=256`
    /// bytes, at least two occurrences.
    fn default() -> Self {
        TrainingOptions {
            min_occurrences: 2,
            max_substring_len: 256,
            min_substring_len: MIN_MATCH_LEN,
            dict_size: DEFAULT_DICT_SIZE,
        }
    }
}

impl TrainingOptions {
    /// Sets the dictionary size, clamped to `MIN_DICT_SIZE..=MAX_DICT_SIZE`.
    pub fn with_size(self, size: usize) -> Self {
        TrainingOptions {
            dict_size: size.clamp(MIN_DICT_SIZE, MAX_DICT_SIZE),
            ..self
        }
    }

    /// Sets the minimum substring length, never going below `MIN_MATCH_LEN`.
    pub fn min_len(self, len: usize) -> Self {
        let floored = if len < MIN_MATCH_LEN {
            MIN_MATCH_LEN
        } else {
            len
        };
        TrainingOptions {
            min_substring_len: floored,
            ..self
        }
    }

    /// Sets the maximum substring length. The value is stored as given; it is
    /// not checked against the minimum length.
    pub fn max_len(self, len: usize) -> Self {
        TrainingOptions {
            max_substring_len: len,
            ..self
        }
    }

    /// Sets the minimum occurrence count, never going below 2.
    pub fn min_count(self, count: u32) -> Self {
        let floored = if count < 2 { 2 } else { count };
        TrainingOptions {
            min_occurrences: floored,
            ..self
        }
    }
}
/// Running counters for dictionary compression activity.
#[derive(Debug, Clone, Default)]
pub struct DictStats {
    /// Number of `record_compression` calls.
    pub compressions: u64,
    /// Number of `record_decompression` calls.
    pub decompressions: u64,
    /// Sum of the recorded input sizes.
    pub bytes_in: u64,
    /// Sum of the recorded output sizes.
    pub bytes_out: u64,
    /// Sum of the recorded hit counts.
    pub dict_hits: u64,
    /// Sum of the recorded miss counts.
    pub dict_misses: u64,
}

impl DictStats {
    /// Output bytes divided by input bytes; `1.0` while no input was recorded.
    pub fn compression_ratio(&self) -> f64 {
        match self.bytes_in {
            0 => 1.0,
            total_in => self.bytes_out as f64 / total_in as f64,
        }
    }

    /// Share of hits among all hits and misses; `0.0` while both are zero.
    pub fn hit_rate(&self) -> f64 {
        match self.dict_hits + self.dict_misses {
            0 => 0.0,
            lookups => self.dict_hits as f64 / lookups as f64,
        }
    }

    /// Adds the figures of one compression run to the counters.
    pub fn record_compression(
        &mut self,
        input_size: u64,
        output_size: u64,
        hits: u64,
        misses: u64,
    ) {
        self.dict_misses += misses;
        self.dict_hits += hits;
        self.bytes_out += output_size;
        self.bytes_in += input_size;
        self.compressions += 1;
    }

    /// Counts one decompression.
    pub fn record_decompression(&mut self) {
        self.decompressions += 1;
    }
}
/// Errors reported by the dictionary compression code.
#[derive(Debug, Clone)]
pub enum DictError {
    /// The given dictionary size is not acceptable.
    InvalidDictSize(usize),
    /// No dictionary with the given id.
    DictNotFound(u64),
    /// Too few samples were supplied; carries the number given.
    InsufficientSamples(usize),
    /// Compressed data could not be interpreted; carries a description.
    InvalidData(String),
    /// The checksum values differ.
    ChecksumMismatch {
        /// Checksum that was expected.
        expected: u32,
        /// Checksum that was actually obtained.
        actual: u32,
    },
    /// The dictionary exceeds the allowed size.
    DictTooLarge(usize),
    /// No dictionary matches the given path.
    NoMatchingDict(String),
}

impl fmt::Display for DictError {
    /// Writes a one-line, human-readable description of the error.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match *self {
            DictError::InvalidDictSize(size) => {
                f.write_fmt(format_args!("invalid dictionary size: {}", size))
            }
            DictError::DictNotFound(id) => {
                f.write_fmt(format_args!("dictionary not found: {}", id))
            }
            DictError::InsufficientSamples(n) => f.write_fmt(format_args!(
                "insufficient samples: {} (need at least 2)",
                n
            )),
            DictError::InvalidData(ref msg) => {
                f.write_fmt(format_args!("invalid compressed data: {}", msg))
            }
            DictError::ChecksumMismatch { expected, actual } => f.write_fmt(format_args!(
                "checksum mismatch: expected {}, got {}",
                expected, actual
            )),
            DictError::DictTooLarge(size) => {
                f.write_fmt(format_args!("dictionary too large: {}", size))
            }
            DictError::NoMatchingDict(ref path) => {
                f.write_fmt(format_args!("no matching dictionary for: {}", path))
            }
        }
    }
}

/// Result alias with [`DictError`] as the error type.
pub type DictResult<T> = Result<T, DictError>;
#[cfg(test)]
mod tests {
use super::*;
use alloc::format;
use alloc::vec;
#[test]
fn test_compression_dict() {
let dict = CompressionDict::new(
1,
"json_dict",
vec![1, 2, 3, 4],
"*.json",
"pool/data",
12345,
);
assert_eq!(dict.id, 1);
assert_eq!(dict.size(), 4);
assert!(dict.matches_pattern("file.json"));
assert!(!dict.matches_pattern("file.xml"));
}
#[test]
fn test_pattern_matching() {
let dict = CompressionDict::new(1, "test", vec![], "*.log", "ds", 0);
assert!(dict.matches_pattern("app.log"));
assert!(dict.matches_pattern("/var/log/system.log"));
assert!(!dict.matches_pattern("app.txt"));
let dict2 = CompressionDict::new(1, "test", vec![], "/var/*", "ds", 0);
assert!(dict2.matches_pattern("/var/log"));
assert!(!dict2.matches_pattern("/home/log"));
let dict3 = CompressionDict::new(1, "test", vec![], "*", "ds", 0);
assert!(dict3.matches_pattern("anything"));
}
#[test]
fn test_substring_entry() {
let mut entry = SubstringEntry::new(vec![1, 2, 3, 4, 5, 6, 7, 8]);
assert_eq!(entry.count, 1);
assert_eq!(entry.savings, 0);
entry.increment();
assert_eq!(entry.count, 2);
assert_eq!(entry.savings, 4);
entry.increment();
assert_eq!(entry.count, 3);
assert_eq!(entry.savings, 8); }
#[test]
fn test_compressed_header() {
let header = CompressedHeader::new(12345, 1000, 500, 0xDEADBEEF);
let bytes = header.to_bytes();
assert_eq!(bytes.len(), 24);
assert_eq!(&bytes[0..4], &DICT_MAGIC);
let parsed = CompressedHeader::from_bytes(&bytes).unwrap();
assert_eq!(parsed.dict_id, 12345);
assert_eq!(parsed.original_size, 1000);
assert_eq!(parsed.compressed_size, 500);
assert_eq!(parsed.checksum, 0xDEADBEEF);
}
#[test]
fn test_header_magic_check() {
assert!(CompressedHeader::is_dict_compressed(&DICT_MAGIC));
assert!(CompressedHeader::is_dict_compressed(&[
0x4C, 0x43, 0x44, 0x43, 0x00
]));
assert!(!CompressedHeader::is_dict_compressed(&[
0x00, 0x00, 0x00, 0x00
]));
assert!(!CompressedHeader::is_dict_compressed(&[0x4C]));
}
#[test]
fn test_compress_op_dict_ref() {
let op = CompressOp::dict_ref(100, 50);
assert_eq!(op.encoded_size(), 5);
assert_eq!(op.output_size(), 50);
let mut buf = Vec::new();
op.encode(&mut buf);
assert_eq!(buf.len(), 5);
let (decoded, consumed) = CompressOp::decode(&buf).unwrap();
assert_eq!(consumed, 5);
assert_eq!(decoded, op);
}
#[test]
fn test_compress_op_literal() {
let op = CompressOp::literal(vec![1, 2, 3, 4, 5]);
assert_eq!(op.encoded_size(), 8); assert_eq!(op.output_size(), 5);
let mut buf = Vec::new();
op.encode(&mut buf);
assert_eq!(buf.len(), 8);
let (decoded, consumed) = CompressOp::decode(&buf).unwrap();
assert_eq!(consumed, 8);
assert_eq!(decoded, op);
}
#[test]
fn test_training_options() {
let opts = TrainingOptions::default()
.with_size(16 * 1024)
.min_len(8)
.min_count(3);
assert_eq!(opts.dict_size, 16 * 1024);
assert_eq!(opts.min_substring_len, 8);
assert_eq!(opts.min_occurrences, 3);
}
#[test]
fn test_dict_stats() {
let mut stats = DictStats::default();
stats.record_compression(1000, 500, 100, 20);
assert_eq!(stats.compressions, 1);
assert_eq!(stats.bytes_in, 1000);
assert_eq!(stats.bytes_out, 500);
assert_eq!(stats.compression_ratio(), 0.5);
assert!((stats.hit_rate() - 0.833).abs() < 0.01);
}
#[test]
fn test_error_display() {
let e = DictError::InvalidDictSize(100);
assert!(format!("{}", e).contains("100"));
let e = DictError::DictNotFound(42);
assert!(format!("{}", e).contains("42"));
let e = DictError::ChecksumMismatch {
expected: 1,
actual: 2,
};
assert!(format!("{}", e).contains("mismatch"));
}
}