use crate::builder::{ArrayBuilder, PrimitiveBuilder};
use crate::types::ArrowDictionaryKeyType;
use crate::{Array, ArrayRef, ArrowPrimitiveType, DictionaryArray};
use arrow_buffer::{ArrowNativeType, ToByteSlice};
use arrow_schema::{ArrowError, DataType};
use std::any::Any;
use std::collections::hash_map::Entry;
use std::collections::HashMap;
use std::sync::Arc;
/// Wraps a value implementing `ToByteSlice` so it can be used as a `HashMap` key.
///
/// Hashing and equality are both defined over the raw byte representation of
/// the wrapped value, so `T` itself does not need to implement `Hash` or `Eq`.
#[derive(Debug)]
struct Value<T>(T);

impl<T: ToByteSlice> std::hash::Hash for Value<T> {
    /// Feeds the byte representation of the wrapped value into `state`.
    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
        let bytes: &[u8] = self.0.to_byte_slice();
        std::hash::Hash::hash(bytes, state)
    }
}

impl<T: ToByteSlice> PartialEq for Value<T> {
    /// Two wrappers are equal when their byte representations are identical.
    fn eq(&self, other: &Self) -> bool {
        let lhs: &[u8] = self.0.to_byte_slice();
        let rhs: &[u8] = other.0.to_byte_slice();
        lhs == rhs
    }
}

// Byte-wise equality is reflexive, so the full `Eq` contract holds.
impl<T: ToByteSlice> Eq for Value<T> {}
/// Builder for dictionary-encoded arrays of primitive values.
///
/// `K` is the primitive type of the dictionary keys and `V` the primitive type
/// of the dictionary values. Each distinct appended value is stored once in the
/// values builder; every appended element contributes one key.
#[derive(Debug)]
pub struct PrimitiveDictionaryBuilder<K, V>
where
K: ArrowPrimitiveType,
V: ArrowPrimitiveType,
{
// One key per appended element; null elements are recorded here as null keys.
keys_builder: PrimitiveBuilder<K>,
// The dictionary values, in order of first appearance when filled via `append`.
values_builder: PrimitiveBuilder<V>,
// Lookup from a value (compared by its bytes) to the `usize` used as its key;
// `append` stores the value's position in `values_builder` here.
map: HashMap<Value<V::Native>, usize>,
}
impl<K, V> Default for PrimitiveDictionaryBuilder<K, V>
where
K: ArrowPrimitiveType,
V: ArrowPrimitiveType,
{
fn default() -> Self {
Self::new()
}
}
impl<K, V> PrimitiveDictionaryBuilder<K, V>
where
    K: ArrowPrimitiveType,
    V: ArrowPrimitiveType,
{
    /// Creates a new `PrimitiveDictionaryBuilder` with empty key and value
    /// builders and an empty value lookup map.
    pub fn new() -> Self {
        Self {
            keys_builder: PrimitiveBuilder::new(),
            values_builder: PrimitiveBuilder::new(),
            map: HashMap::new(),
        }
    }

    /// Creates a new `PrimitiveDictionaryBuilder` from caller-supplied, empty
    /// key and value builders.
    ///
    /// This allows the builders to be pre-configured (for example with a
    /// specific data type) before being wrapped.
    ///
    /// # Panics
    ///
    /// Panics if `keys_builder` or `values_builder` is not empty.
    pub fn new_from_empty_builders(
        keys_builder: PrimitiveBuilder<K>,
        values_builder: PrimitiveBuilder<V>,
    ) -> Self {
        assert!(
            keys_builder.is_empty() && values_builder.is_empty(),
            "keys and values builders must be empty"
        );
        Self {
            keys_builder,
            values_builder,
            map: HashMap::new(),
        }
    }

    /// Creates a new `PrimitiveDictionaryBuilder` from existing, possibly
    /// non-empty, key and value builders.
    ///
    /// The value lookup map is rebuilt from `values_builder` so that values
    /// already present are reused by subsequent appends instead of being
    /// stored a second time.
    ///
    /// # Safety
    ///
    /// The caller must ensure that the supplied builders are valid for a
    /// dictionary array, i.e. that every non-null key already present in
    /// `keys_builder` is a valid index into the values of `values_builder`.
    /// The finished array is assembled without validation.
    pub unsafe fn new_from_builders(
        keys_builder: PrimitiveBuilder<K>,
        values_builder: PrimitiveBuilder<V>,
    ) -> Self {
        let values = values_builder.values_slice();
        let mut map = HashMap::with_capacity(values.len());

        // A dictionary key is the position of its value in the values buffer,
        // so the lookup map must be derived from the values alone. Pairing the
        // i-th row key with the i-th value would produce wrong indices and
        // would miss values whenever there are fewer rows than values.
        for (index, value) in values.iter().enumerate() {
            map.insert(Value(*value), index);
        }

        Self {
            keys_builder,
            values_builder,
            map,
        }
    }

    /// Creates a new `PrimitiveDictionaryBuilder` with pre-allocated space.
    ///
    /// `keys_capacity` is the number of elements (keys) to reserve room for;
    /// `values_capacity` is the number of distinct values to reserve room for,
    /// both in the values builder and in the value lookup map.
    pub fn with_capacity(keys_capacity: usize, values_capacity: usize) -> Self {
        Self {
            keys_builder: PrimitiveBuilder::with_capacity(keys_capacity),
            values_builder: PrimitiveBuilder::with_capacity(values_capacity),
            map: HashMap::with_capacity(values_capacity),
        }
    }
}
impl<K, V> ArrayBuilder for PrimitiveDictionaryBuilder<K, V>
where
K: ArrowDictionaryKeyType,
V: ArrowPrimitiveType,
{
fn as_any(&self) -> &dyn Any {
self
}
fn as_any_mut(&mut self) -> &mut dyn Any {
self
}
fn into_box_any(self: Box<Self>) -> Box<dyn Any> {
self
}
fn len(&self) -> usize {
self.keys_builder.len()
}
fn finish(&mut self) -> ArrayRef {
Arc::new(self.finish())
}
fn finish_cloned(&self) -> ArrayRef {
Arc::new(self.finish_cloned())
}
}
impl<K, V> PrimitiveDictionaryBuilder<K, V>
where
    K: ArrowDictionaryKeyType,
    V: ArrowPrimitiveType,
{
    /// Appends a primitive value to the array and returns the dictionary key
    /// that was stored for it.
    ///
    /// A value seen before reuses its existing key; a new value is added to
    /// the dictionary values first.
    ///
    /// # Errors
    ///
    /// Returns [`ArrowError::DictionaryKeyOverflowError`] if the index of the
    /// value cannot be represented by the key type `K`. In that case the
    /// builder is left unchanged.
    #[inline]
    pub fn append(&mut self, value: V::Native) -> Result<K::Native, ArrowError> {
        let key = match self.map.entry(Value(value)) {
            Entry::Vacant(vacant) => {
                let index = self.values_builder.len();
                // Convert before touching any state: if the index does not fit
                // the key type, neither the values nor the lookup map may keep
                // an entry that no key can ever refer to.
                let key = K::Native::from_usize(index)
                    .ok_or(ArrowError::DictionaryKeyOverflowError)?;
                self.values_builder.append_value(value);
                vacant.insert(index);
                key
            }
            // Checked conversion: a stored index that does not fit the key type
            // must surface as an error rather than be silently truncated.
            Entry::Occupied(occupied) => K::Native::from_usize(*occupied.get())
                .ok_or(ArrowError::DictionaryKeyOverflowError)?,
        };
        self.keys_builder.append_value(key);
        Ok(key)
    }

    /// Infallibly appends a value to this builder.
    ///
    /// # Panics
    ///
    /// Panics if the resulting dictionary key would overflow the key type `K`.
    #[inline]
    pub fn append_value(&mut self, value: V::Native) {
        self.append(value).expect("dictionary key overflow");
    }

    /// Appends a null element; only the keys are affected, the dictionary
    /// values are left untouched.
    #[inline]
    pub fn append_null(&mut self) {
        self.keys_builder.append_null()
    }

    /// Appends `Some(value)` as a value and `None` as a null.
    ///
    /// # Panics
    ///
    /// Panics if appending a value would overflow the key type `K`.
    #[inline]
    pub fn append_option(&mut self, value: Option<V::Native>) {
        match value {
            None => self.append_null(),
            Some(v) => self.append_value(v),
        };
    }

    /// Builds the `DictionaryArray`, clearing the value lookup map and
    /// finishing both inner builders.
    pub fn finish(&mut self) -> DictionaryArray<K> {
        self.map.clear();
        let values = self.values_builder.finish();
        let keys = self.keys_builder.finish();

        // Take the value type from the built array rather than `V::DATA_TYPE`
        // so that a type configured on the values builder is preserved.
        let data_type = DataType::Dictionary(
            Box::new(K::DATA_TYPE),
            Box::new(values.data_type().clone()),
        );

        let builder = keys
            .into_data()
            .into_builder()
            .data_type(data_type)
            .child_data(vec![values.into_data()]);

        // SAFETY: keys produced by `append` are indices into `values`; builders
        // supplied through `new_from_builders` are the caller's responsibility.
        DictionaryArray::from(unsafe { builder.build_unchecked() })
    }

    /// Builds the `DictionaryArray` from the current contents without
    /// modifying the builder.
    pub fn finish_cloned(&self) -> DictionaryArray<K> {
        let values = self.values_builder.finish_cloned();
        let keys = self.keys_builder.finish_cloned();

        // Must agree with `finish`: use the actual value type of the built
        // array, which may differ from `V::DATA_TYPE` when the values builder
        // was configured with a specific data type.
        let data_type = DataType::Dictionary(
            Box::new(K::DATA_TYPE),
            Box::new(values.data_type().clone()),
        );

        let builder = keys
            .into_data()
            .into_builder()
            .data_type(data_type)
            .child_data(vec![values.into_data()]);

        // SAFETY: keys produced by `append` are indices into `values`; builders
        // supplied through `new_from_builders` are the caller's responsibility.
        DictionaryArray::from(unsafe { builder.build_unchecked() })
    }

    /// Returns the current dictionary values buffer as a slice.
    pub fn values_slice(&self) -> &[V::Native] {
        self.values_builder.values_slice()
    }

    /// Returns the current dictionary values buffer as a mutable slice.
    ///
    /// The value lookup map is not updated when values are changed through
    /// this slice, so later appends still resolve against the values that were
    /// originally inserted.
    pub fn values_slice_mut(&mut self) -> &mut [V::Native] {
        self.values_builder.values_slice_mut()
    }
}
impl<K: ArrowDictionaryKeyType, P: ArrowPrimitiveType> Extend<Option<P::Native>>
    for PrimitiveDictionaryBuilder<K, P>
{
    /// Appends every item of `iter` in order: `Some` items as values, `None`
    /// items as nulls.
    ///
    /// Panics under the same condition as `append_option`, i.e. when a new
    /// value would overflow the dictionary key type.
    #[inline]
    fn extend<T: IntoIterator<Item = Option<P::Native>>>(&mut self, iter: T) {
        iter.into_iter()
            .for_each(|item| self.append_option(item));
    }
}
// Unit tests for `PrimitiveDictionaryBuilder`.
#[cfg(test)]
mod tests {
use super::*;
use crate::array::Array;
use crate::array::UInt32Array;
use crate::array::UInt8Array;
use crate::builder::Decimal128Builder;
use crate::types::{Decimal128Type, Int32Type, UInt32Type, UInt8Type};
// Two distinct values separated by a null: keys must be 0, null, 1 and the
// dictionary must hold both values in insertion order.
#[test]
fn test_primitive_dictionary_builder() {
let mut builder =
PrimitiveDictionaryBuilder::<UInt8Type, UInt32Type>::with_capacity(3, 2);
builder.append(12345678).unwrap();
builder.append_null();
builder.append(22345678).unwrap();
let array = builder.finish();
assert_eq!(
array.keys(),
&UInt8Array::from(vec![Some(0), None, Some(1)])
);
// Downcast the type-erased values array to inspect the raw values.
let av = array.values();
let ava: &UInt32Array = av.as_any().downcast_ref::<UInt32Array>().unwrap();
let avs: &[u32] = ava.values();
assert!(!array.is_null(0));
assert!(array.is_null(1));
assert!(!array.is_null(2));
assert_eq!(avs, &[12345678, 22345678]);
}
// `Extend` must deduplicate across separate `extend` calls: repeated values
// reuse their existing keys and only five distinct values are stored.
#[test]
fn test_extend() {
let mut builder = PrimitiveDictionaryBuilder::<Int32Type, Int32Type>::new();
builder.extend([1, 2, 3, 1, 2, 3, 1, 2, 3].into_iter().map(Some));
builder.extend([4, 5, 1, 3, 1].into_iter().map(Some));
let dict = builder.finish();
assert_eq!(
dict.keys().values(),
&[0, 1, 2, 0, 1, 2, 0, 1, 2, 3, 4, 0, 2, 0]
);
assert_eq!(dict.values().len(), 5);
}
// With `u8` keys, 256 distinct values use keys 0..=255; the 257th distinct
// value has no representable key, so `append` errors and `unwrap` panics.
#[test]
#[should_panic(expected = "DictionaryKeyOverflowError")]
fn test_primitive_dictionary_overflow() {
let mut builder =
PrimitiveDictionaryBuilder::<UInt8Type, UInt32Type>::with_capacity(257, 257);
for i in 0..256 {
builder.append(i + 1000).unwrap();
}
builder.append(1257).unwrap();
}
// A data type configured on the values builder must be carried through to
// the finished dictionary array by `finish`.
#[test]
fn test_primitive_dictionary_with_builders() {
let keys_builder = PrimitiveBuilder::<Int32Type>::new();
let values_builder =
Decimal128Builder::new().with_data_type(DataType::Decimal128(1, 2));
let mut builder =
PrimitiveDictionaryBuilder::<Int32Type, Decimal128Type>::new_from_empty_builders(
keys_builder,
values_builder,
);
let dict_array = builder.finish();
assert_eq!(dict_array.value_type(), DataType::Decimal128(1, 2));
assert_eq!(
dict_array.data_type(),
&DataType::Dictionary(
Box::new(DataType::Int32),
Box::new(DataType::Decimal128(1, 2)),
)
);
}
}