use std::collections::HashMap;
use std::sync::Arc;
#[derive(Debug, Clone)]
pub struct DictionaryEncoding {
dictionary: Arc<[Arc<str>]>,
codes: Vec<u32>,
null_bitmap: Option<Vec<u64>>,
}
impl DictionaryEncoding {
pub fn new(dictionary: Arc<[Arc<str>]>, codes: Vec<u32>) -> Self {
Self {
dictionary,
codes,
null_bitmap: None,
}
}
pub fn with_nulls(mut self, null_bitmap: Vec<u64>) -> Self {
self.null_bitmap = Some(null_bitmap);
self
}
pub fn len(&self) -> usize {
self.codes.len()
}
pub fn is_empty(&self) -> bool {
self.codes.is_empty()
}
pub fn dictionary_size(&self) -> usize {
self.dictionary.len()
}
pub fn dictionary(&self) -> &Arc<[Arc<str>]> {
&self.dictionary
}
pub fn codes(&self) -> &[u32] {
&self.codes
}
pub fn is_null(&self, index: usize) -> bool {
if let Some(bitmap) = &self.null_bitmap {
let word_idx = index / 64;
let bit_idx = index % 64;
if word_idx < bitmap.len() {
return (bitmap[word_idx] & (1 << bit_idx)) != 0;
}
}
false
}
pub fn get(&self, index: usize) -> Option<&str> {
if self.is_null(index) {
return None;
}
let code = self.codes.get(index)? as &u32;
self.dictionary.get(*code as usize).map(|s| s.as_ref())
}
pub fn get_code(&self, index: usize) -> Option<u32> {
if self.is_null(index) {
return None;
}
self.codes.get(index).copied()
}
pub fn iter(&self) -> impl Iterator<Item = Option<&str>> {
(0..self.len()).map(move |i| self.get(i))
}
pub fn compression_ratio(&self) -> f64 {
if self.codes.is_empty() {
return 1.0;
}
let original_size: usize = self
.codes
.iter()
.map(|&code| {
if (code as usize) < self.dictionary.len() {
self.dictionary[code as usize].len()
} else {
0
}
})
.sum();
let dict_size: usize = self.dictionary.iter().map(|s| s.len()).sum();
let codes_size = self.codes.len() * std::mem::size_of::<u32>();
let compressed_size = dict_size + codes_size;
if compressed_size == 0 {
return 1.0;
}
original_size as f64 / compressed_size as f64
}
pub fn encode(&self, value: &str) -> Option<u32> {
self.dictionary
.iter()
.position(|s| s.as_ref() == value)
.and_then(|i| u32::try_from(i).ok())
}
pub fn filter_by_code(&self, predicate: impl Fn(u32) -> bool) -> Vec<usize> {
self.codes
.iter()
.enumerate()
.filter_map(|(i, &code)| {
if !self.is_null(i) && predicate(code) {
Some(i)
} else {
None
}
})
.collect()
}
}
#[derive(Debug)]
pub struct DictionaryBuilder {
string_to_code: HashMap<Arc<str>, u32>,
dictionary: Vec<Arc<str>>,
codes: Vec<u32>,
null_positions: Vec<usize>,
}
impl DictionaryBuilder {
pub fn new() -> Self {
Self {
string_to_code: HashMap::new(),
dictionary: Vec::new(),
codes: Vec::new(),
null_positions: Vec::new(),
}
}
pub fn with_capacity(value_capacity: usize, dictionary_capacity: usize) -> Self {
Self {
string_to_code: HashMap::with_capacity(dictionary_capacity),
dictionary: Vec::with_capacity(dictionary_capacity),
codes: Vec::with_capacity(value_capacity),
null_positions: Vec::new(),
}
}
pub fn add(&mut self, value: &str) -> u32 {
if let Some(&code) = self.string_to_code.get(value) {
self.codes.push(code);
code
} else {
#[allow(clippy::cast_possible_truncation)]
let code = self.dictionary.len() as u32;
let arc_value: Arc<str> = value.into();
self.string_to_code.insert(arc_value.clone(), code);
self.dictionary.push(arc_value);
self.codes.push(code);
code
}
}
pub fn add_null(&mut self) {
let idx = self.codes.len();
self.null_positions.push(idx);
self.codes.push(0); }
pub fn add_optional(&mut self, value: Option<&str>) -> Option<u32> {
match value {
Some(v) => Some(self.add(v)),
None => {
self.add_null();
None
}
}
}
pub fn len(&self) -> usize {
self.codes.len()
}
pub fn is_empty(&self) -> bool {
self.codes.is_empty()
}
pub fn dictionary_size(&self) -> usize {
self.dictionary.len()
}
pub fn build(self) -> DictionaryEncoding {
let null_bitmap = if self.null_positions.is_empty() {
None
} else {
let num_words = (self.codes.len() + 63) / 64;
let mut bitmap = vec![0u64; num_words];
for &pos in &self.null_positions {
let word_idx = pos / 64;
let bit_idx = pos % 64;
bitmap[word_idx] |= 1 << bit_idx;
}
Some(bitmap)
};
let dict: Arc<[Arc<str>]> = self.dictionary.into();
let mut encoding = DictionaryEncoding::new(dict, self.codes);
if let Some(bitmap) = null_bitmap {
encoding = encoding.with_nulls(bitmap);
}
encoding
}
pub fn clear(&mut self) {
self.string_to_code.clear();
self.dictionary.clear();
self.codes.clear();
self.null_positions.clear();
}
}
impl Default for DictionaryBuilder {
fn default() -> Self {
Self::new()
}
}
pub trait IntoDictionaryEncoding {
fn into_dictionary_encoding(self) -> DictionaryEncoding;
}
impl<'a, I> IntoDictionaryEncoding for I
where
I: IntoIterator<Item = &'a str>,
{
fn into_dictionary_encoding(self) -> DictionaryEncoding {
let mut builder = DictionaryBuilder::new();
for s in self {
builder.add(s);
}
builder.build()
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_dictionary_builder_basic() {
let mut builder = DictionaryBuilder::new();
builder.add("apple");
builder.add("banana");
builder.add("apple");
builder.add("cherry");
builder.add("apple");
let dict = builder.build();
assert_eq!(dict.len(), 5);
assert_eq!(dict.dictionary_size(), 3);
assert_eq!(dict.get(0), Some("apple"));
assert_eq!(dict.get(1), Some("banana"));
assert_eq!(dict.get(2), Some("apple"));
assert_eq!(dict.get(3), Some("cherry"));
assert_eq!(dict.get(4), Some("apple"));
}
#[test]
fn test_dictionary_codes() {
let mut builder = DictionaryBuilder::new();
let code_apple = builder.add("apple");
let code_banana = builder.add("banana");
let code_apple2 = builder.add("apple");
assert_eq!(code_apple, code_apple2);
assert_ne!(code_apple, code_banana);
let dict = builder.build();
assert_eq!(dict.codes(), &[0, 1, 0]);
}
#[test]
fn test_dictionary_with_nulls() {
let mut builder = DictionaryBuilder::new();
builder.add("apple");
builder.add_null();
builder.add("banana");
builder.add_null();
let dict = builder.build();
assert_eq!(dict.len(), 4);
assert_eq!(dict.get(0), Some("apple"));
assert_eq!(dict.get(1), None);
assert!(dict.is_null(1));
assert_eq!(dict.get(2), Some("banana"));
assert_eq!(dict.get(3), None);
assert!(dict.is_null(3));
}
#[test]
fn test_dictionary_encode_lookup() {
let mut builder = DictionaryBuilder::new();
builder.add("apple");
builder.add("banana");
builder.add("cherry");
let dict = builder.build();
assert_eq!(dict.encode("apple"), Some(0));
assert_eq!(dict.encode("banana"), Some(1));
assert_eq!(dict.encode("cherry"), Some(2));
assert_eq!(dict.encode("date"), None);
}
#[test]
fn test_dictionary_filter_by_code() {
let mut builder = DictionaryBuilder::new();
builder.add("apple");
builder.add("banana");
builder.add("apple");
builder.add("cherry");
builder.add("apple");
let dict = builder.build();
let apple_code = dict.encode("apple").unwrap();
let indices = dict.filter_by_code(|code| code == apple_code);
assert_eq!(indices, vec![0, 2, 4]);
}
#[test]
fn test_compression_ratio() {
let mut builder = DictionaryBuilder::new();
for _ in 0..100 {
builder.add("this_is_a_very_long_string_that_repeats_many_times");
}
let dict = builder.build();
let ratio = dict.compression_ratio();
assert!(ratio > 1.0, "Expected compression ratio > 1, got {}", ratio);
}
#[test]
fn test_into_dictionary_encoding() {
let strings = vec!["apple", "banana", "apple", "cherry"];
let dict: DictionaryEncoding = strings.into_iter().into_dictionary_encoding();
assert_eq!(dict.len(), 4);
assert_eq!(dict.dictionary_size(), 3);
}
#[test]
fn test_empty_dictionary() {
let builder = DictionaryBuilder::new();
let dict = builder.build();
assert!(dict.is_empty());
assert_eq!(dict.dictionary_size(), 0);
assert_eq!(dict.get(0), None);
}
#[test]
fn test_single_value() {
let mut builder = DictionaryBuilder::new();
builder.add("only_value");
let dict = builder.build();
assert_eq!(dict.len(), 1);
assert_eq!(dict.dictionary_size(), 1);
assert_eq!(dict.get(0), Some("only_value"));
}
#[test]
fn test_all_unique() {
let mut builder = DictionaryBuilder::new();
builder.add("a");
builder.add("b");
builder.add("c");
builder.add("d");
let dict = builder.build();
assert_eq!(dict.len(), 4);
assert_eq!(dict.dictionary_size(), 4);
assert_eq!(dict.codes(), &[0, 1, 2, 3]);
}
#[test]
fn test_all_same() {
let mut builder = DictionaryBuilder::new();
for _ in 0..10 {
builder.add("same");
}
let dict = builder.build();
assert_eq!(dict.len(), 10);
assert_eq!(dict.dictionary_size(), 1);
assert!(dict.codes().iter().all(|&c| c == 0));
}
}