use crate::prelude::*;
use std::collections::HashMap;
use std::hash::Hash;
use std::vec::IntoIter;
pub struct DictionaryIterator<T> {
indexes: IntoIter<u64>,
values: IntoIter<T>,
cache: HashMap<u64, T>,
}
impl<T: Clone + Default> Iterator for DictionaryIterator<T> {
type Item = T;
fn next(&mut self) -> Option<T> {
match self.indexes.next() {
None => None,
Some(index) => {
while self.cache.len() <= index as usize {
let next = self.values.next().unwrap_or_default();
self.cache.insert(self.cache.len() as u64, next);
}
Some(self.cache.get(&index).unwrap().clone())
}
}
}
}
impl<T: Send + Clone> DictionaryIterator<T> {
pub fn new(
indexes: Box<DynArrayBranch<'_>>,
values: Box<DynArrayBranch<'_>>,
options: &impl DecodeOptions,
f: impl Send + FnOnce(DynArrayBranch<'_>) -> DecodeResult<IntoIter<T>>,
) -> DecodeResult<Self> {
let (indexes, values) = parallel(|| <u64 as Decodable>::DecoderArray::new(*indexes, options), || f(*values), options);
let indexes = indexes?;
let values = values?;
Ok(Self {
indexes,
values,
cache: HashMap::new(),
})
}
}
pub(crate) struct Dictionary<S> {
sub_compressors: S,
}
impl<S> Dictionary<S> {
pub fn new(sub_compressors: S) -> Self {
Self { sub_compressors }
}
}
fn get_lookup_table<T: PartialEq + Copy + std::fmt::Debug + Hash + Eq>(data: &[T]) -> Result<(Vec<u64>, Vec<T>), ()> {
if data.len() < 2 {
return Err(());
}
profile_fn!(get_lookup_table);
let mut indices = Vec::<u64>::new();
let mut values = Vec::new();
let mut lookup = HashMap::new();
for value in data.iter() {
let index = if let Some(i) = lookup.get(value) {
*i
} else {
let i = lookup.len();
lookup.insert(value, i);
values.push(*value);
i
};
indices.push(index as u64);
}
debug_assert!(lookup.len() == values.len());
debug_assert!(indices.len() == data.len());
if indices.len() == values.len() {
return Err(());
}
Ok((indices, values))
}
impl<T: PartialEq + Copy + std::fmt::Debug + Hash + Eq, S: CompressorSet<T>> Compressor<T> for Dictionary<S> {
fn compress<O: EncodeOptions>(&self, data: &[T], stream: &mut EncoderStream<'_, O>) -> Result<ArrayTypeId, ()> {
profile_method!(compress);
let (indices, values) = get_lookup_table(data)?;
stream.encode_with_id(|stream| compress(&values[..], stream, &self.sub_compressors));
stream.encode_with_id(|stream| indices.flush(stream));
Ok(ArrayTypeId::Dictionary)
}
fn fast_size_for<O: EncodeOptions>(&self, data: &[T], options: &O) -> Result<usize, ()> {
profile_method!(fast_size_for);
let (indices, values) = get_lookup_table(data)?;
let from_values = fast_size_for(&values[..], &self.sub_compressors, options);
let from_indices = Vec::<u64>::fast_size_for_all(&indices[..], options);
let from_ids = 2;
Ok(from_ids + from_indices + from_values)
}
}