use crate::varint::{decode_varint, encode_varint, varint_size};
pub const MIN_DICT_SIZE: usize = 16;
pub const MAX_DICT_SIZE: usize = 65536;
pub const MAX_DICT_SRC_OFFSET: usize = 65535;
pub struct Dict {
dict: Vec<u8>,
repeat: usize,
#[allow(dead_code)]
fast_table: Option<Box<[u16; 1 << 14]>>,
#[allow(dead_code)]
better_table_short: Option<Box<[u16; 1 << 14]>>,
#[allow(dead_code)]
better_table_long: Option<Box<[u16; 1 << 17]>>,
#[allow(dead_code)]
best_table_short: Option<Box<[u32; 1 << 16]>>,
#[allow(dead_code)]
best_table_long: Option<Box<[u32; 1 << 19]>>,
}
impl Dict {
pub fn new(data: &[u8]) -> Option<Self> {
if data.is_empty() {
return None;
}
let (repeat, n) = decode_varint(data).ok()?;
let dict_data = &data[n..];
if dict_data.len() < MIN_DICT_SIZE || dict_data.len() > MAX_DICT_SIZE {
return None;
}
if repeat as usize > dict_data.len() {
return None;
}
let mut dict = Vec::with_capacity(dict_data.len() + 16);
dict.extend_from_slice(dict_data);
Some(Dict {
dict,
repeat: repeat as usize,
fast_table: None,
better_table_short: None,
better_table_long: None,
best_table_short: None,
best_table_long: None,
})
}
pub fn to_bytes(&self) -> Vec<u8> {
let size = varint_size(self.repeat as u64);
let mut result = vec![0u8; size + self.dict.len()];
let n = encode_varint(&mut result, self.repeat as u64);
result[n..].copy_from_slice(&self.dict);
result
}
pub fn data(&self) -> &[u8] {
&self.dict
}
pub fn repeat(&self) -> usize {
self.repeat
}
#[allow(dead_code)]
pub(crate) fn get_fast_table(&mut self) -> &[u16; 1 << 14] {
if self.fast_table.is_none() {
self.init_fast();
}
self.fast_table.as_ref().unwrap()
}
#[allow(dead_code)]
pub(crate) fn get_better_tables(&mut self) -> (&[u16; 1 << 14], &[u16; 1 << 17]) {
if self.better_table_short.is_none() || self.better_table_long.is_none() {
self.init_better();
}
(
self.better_table_short.as_ref().unwrap(),
self.better_table_long.as_ref().unwrap(),
)
}
/// Get best hash tables, initializing if needed
#[allow(dead_code)]
pub(crate) fn get_best_tables(&mut self) -> (&[u32; 1 << 16], &[u32; 1 << 19]) {
if self.best_table_short.is_none() || self.best_table_long.is_none() {
self.init_best();
}
(
self.best_table_short.as_ref().unwrap(),
self.best_table_long.as_ref().unwrap(),
)
}
/// Initialize fast hash table
fn init_fast(&mut self) {
const TABLE_BITS: u32 = 14;
let mut table = Box::new([0u16; 1 << TABLE_BITS]);
// Hash every 3rd byte to populate table
let mut i = 0;
while i < self.dict.len().saturating_sub(10) {
let x = load64(&self.dict, i);
let h0 = hash6(x, TABLE_BITS);
let h1 = hash6(x >> 8, TABLE_BITS);
let h2 = hash6(x >> 16, TABLE_BITS);
table[h0 as usize] = i as u16;
table[h1 as usize] = (i + 1) as u16;
table[h2 as usize] = (i + 2) as u16;
i += 3;
}
self.fast_table = Some(table);
}
/// Initialize better hash tables
fn init_better(&mut self) {
const L_TABLE_BITS: u32 = 17;
const S_TABLE_BITS: u32 = 14;
let mut l_table = Box::new([0u16; 1 << L_TABLE_BITS]);
let mut s_table = Box::new([0u16; 1 << S_TABLE_BITS]);
// Hash every byte
for i in 0..self.dict.len().saturating_sub(8) {
let cv = load64(&self.dict, i);
l_table[hash7(cv, L_TABLE_BITS) as usize] = i as u16;
s_table[hash4(cv, S_TABLE_BITS) as usize] = i as u16;
}
self.better_table_long = Some(l_table);
self.better_table_short = Some(s_table);
}
/// Initialize best hash tables
fn init_best(&mut self) {
const L_TABLE_BITS: u32 = 19;
const S_TABLE_BITS: u32 = 16;
let mut l_table = Box::new([0u32; 1 << L_TABLE_BITS]);
let mut s_table = Box::new([0u32; 1 << S_TABLE_BITS]);
// Hash every byte with chaining
for i in 0..self.dict.len().saturating_sub(8) {
let cv = load64(&self.dict, i);
let hash_l = hash8(cv, L_TABLE_BITS) as usize;
let hash_s = hash4(cv, S_TABLE_BITS) as usize;
let candidate_l = l_table[hash_l];
let candidate_s = s_table[hash_s];
l_table[hash_l] = (i as u32) | (candidate_l << 16);
s_table[hash_s] = (i as u32) | (candidate_s << 16);
}
self.best_table_long = Some(l_table);
self.best_table_short = Some(s_table);
}
}
/// Create a dictionary from data
///
/// If `data` is longer than MAX_DICT_SIZE, only the last MAX_DICT_SIZE bytes are used.
/// If `search_start` is provided, the repeat offset will be set to the last occurrence
/// of that pattern in the dictionary (or a shorter prefix if exact match not found).
/// If no match >= 4 bytes is found, repeat is set to 0.
pub fn make_dict(data: &[u8], search_start: Option<&[u8]>) -> Option<Dict> {
if data.is_empty() {
return None;
}
// Trim to max size
let dict_data = if data.len() > MAX_DICT_SIZE {
&data[data.len() - MAX_DICT_SIZE..]
} else {
data
};
if dict_data.len() < MIN_DICT_SIZE {
return None;
}
// Find repeat offset
let mut repeat = 0;
if let Some(search) = search_start {
// Find longest match, trying progressively shorter lengths
for len in (4..=search.len()).rev() {
if let Some(pos) = find_last_occurrence(dict_data, &search[..len]) {
// Make sure we have at least 8 bytes remaining
if pos <= dict_data.len().saturating_sub(8) {
repeat = pos;
break;
}
}
}
}
// Ensure capacity for extra bytes
let mut dict = Vec::with_capacity(dict_data.len() + 16);
dict.extend_from_slice(dict_data);
Some(Dict {
dict,
repeat,
fast_table: None,
better_table_short: None,
better_table_long: None,
best_table_short: None,
best_table_long: None,
})
}
/// Create a dictionary with manual repeat offset
///
/// The `first_idx` must be less than `data.len() - 8`.
pub fn make_dict_manual(data: &[u8], first_idx: u16) -> Option<Dict> {
if data.len() < MIN_DICT_SIZE
|| data.len() > MAX_DICT_SIZE
|| first_idx as usize >= data.len().saturating_sub(8)
{
return None;
}
// Ensure capacity for extra bytes
let mut dict = Vec::with_capacity(data.len() + 16);
dict.extend_from_slice(data);
Some(Dict {
dict,
repeat: first_idx as usize,
fast_table: None,
better_table_short: None,
better_table_long: None,
best_table_short: None,
best_table_long: None,
})
}
/// Find last occurrence of needle in haystack
fn find_last_occurrence(haystack: &[u8], needle: &[u8]) -> Option<usize> {
if needle.is_empty() || needle.len() > haystack.len() {
return None;
}
// Manual implementation is clearer than iterator chaining for this case
#[allow(clippy::manual_find)]
for i in (0..=haystack.len() - needle.len()).rev() {
if &haystack[i..i + needle.len()] == needle {
return Some(i);
}
}
None
}
// Helper functions for loading and hashing (matching encode.rs patterns)
// These will be used when full dictionary encoding is implemented
#[allow(dead_code)]
#[inline(always)]
fn load64(data: &[u8], offset: usize) -> u64 {
u64::from_le_bytes(data[offset..offset + 8].try_into().unwrap())
}
#[allow(dead_code)]
#[inline(always)]
fn hash4(v: u64, bits: u32) -> u32 {
((v.wrapping_mul(0x1e35a7bd)) >> (32 - bits)) as u32
}
#[allow(dead_code)]
#[inline(always)]
fn hash6(v: u64, bits: u32) -> u32 {
((v.wrapping_mul(0xcf1bbcdcb7a56463)) >> (64 - bits)) as u32
}
#[allow(dead_code)]
#[inline(always)]
fn hash7(v: u64, bits: u32) -> u32 {
((v.wrapping_mul(0xcf1bbcdcb7a56463)) >> (64 - bits)) as u32
}
#[allow(dead_code)]
#[inline(always)]
fn hash8(v: u64, bits: u32) -> u32 {
((v.wrapping_mul(0x1e35a7bd1e35a7bd)) >> (64 - bits)) as u32
}
#[cfg(test)]
mod tests {
use super::*;
use crate::decode::decode_with_dict;
use crate::encode::encode;
#[test]
fn test_dict_roundtrip_simple() {
// Create a dictionary
let dict_data = b"Hello, World! This is common text that appears frequently.";
let dict = make_dict(dict_data, Some(b"Hello")).unwrap();
// Encode some data (currently falls back to standard encoding)
let data = b"Hello, World! Testing dictionary compression.";
let compressed = encode(data);
// Decode with dictionary should still work
let decompressed = decode_with_dict(&compressed, &dict).unwrap();
assert_eq!(data, &decompressed[..]);
}
#[test]
fn test_dict_api_compatibility() {
// Verify all encoding APIs work (even if they fall back for now)
let dict_data = b"Common prefix data that repeats often in our dataset.";
let dict = make_dict(dict_data, None).unwrap();
let data = b"Test data for compression.";
// All encode functions should work
let _c1 = crate::encode::encode_with_dict(data, &dict);
let _c2 = crate::encode::encode_better_with_dict(data, &dict);
let _c3 = crate::encode::encode_best_with_dict(data, &dict);
}
#[test]
fn test_dict_new() {
// Create a simple dictionary
let data = b"Hello, World! This is a test.";
let dict = make_dict(data, None).unwrap();
assert_eq!(dict.data(), data);
assert_eq!(dict.repeat(), 0);
}
#[test]
fn test_dict_serialization() {
let data = b"Test dictionary data for serialization";
let dict = make_dict(data, Some(b"dict")).unwrap();
// Serialize and deserialize
let bytes = dict.to_bytes();
let dict2 = Dict::new(&bytes).unwrap();
assert_eq!(dict.data(), dict2.data());
assert_eq!(dict.repeat(), dict2.repeat());
}
#[test]
fn test_dict_size_limits() {
// Too small
let small = b"tiny";
assert!(make_dict(small, None).is_none());
// Minimum size
let min_size = vec![b'X'; MIN_DICT_SIZE];
assert!(make_dict(&min_size, None).is_some());
// Maximum size
let max_size = vec![b'X'; MAX_DICT_SIZE];
assert!(make_dict(&max_size, None).is_some());
// Over maximum - should trim
let over_max = vec![b'X'; MAX_DICT_SIZE + 1000];
let dict = make_dict(&over_max, None).unwrap();
assert_eq!(dict.data().len(), MAX_DICT_SIZE);
}
#[test]
fn test_make_dict_with_search() {
let data = b"The quick brown fox jumps over the lazy dog. The quick brown fox.";
// Search for "The quick"
let dict = make_dict(data, Some(b"The quick")).unwrap();
// Should find the last occurrence
assert!(dict.repeat() > 0);
// Verify it's actually at that position
assert!(dict.data()[dict.repeat()..].starts_with(b"The quick"));
}
#[test]
fn test_make_dict_manual() {
let data = vec![b'A'; 100];
let dict = make_dict_manual(&data, 10).unwrap();
assert_eq!(dict.repeat(), 10);
assert_eq!(dict.data().len(), 100);
}
#[test]
fn test_find_last_occurrence() {
let haystack = b"Hello world, hello Rust, hello!";
assert_eq!(find_last_occurrence(haystack, b"hello"), Some(25));
assert_eq!(find_last_occurrence(haystack, b"Hello"), Some(0));
assert_eq!(find_last_occurrence(haystack, b"xyz"), None);
}
}