use crate::builder::{ArrayBuilder, FixedSizeBinaryBuilder, PrimitiveBuilder};
use crate::types::ArrowDictionaryKeyType;
use crate::{Array, ArrayRef, DictionaryArray};
use arrow_buffer::ArrowNativeType;
use arrow_schema::DataType::FixedSizeBinary;
use arrow_schema::{ArrowError, DataType};
use hashbrown::HashTable;
use std::any::Any;
use std::sync::Arc;
/// Builder for a [`DictionaryArray`] whose dictionary values are fixed-size
/// binary entries of `byte_width` bytes and whose keys have type `K`.
///
/// Each distinct value is stored once in `values_builder`; every appended
/// element is recorded in `keys_builder` as an index into those values.
#[derive(Debug)]
pub struct FixedSizeBinaryDictionaryBuilder<K: ArrowDictionaryKeyType> {
    /// Hasher state used to hash value bytes for lookups in `dedup`.
    state: ahash::RandomState,
    /// Deduplication table; each entry is an index into `values_builder`.
    dedup: HashTable<usize>,
    /// Builder for the dictionary keys, including null slots.
    keys_builder: PrimitiveBuilder<K>,
    /// Builder holding the distinct dictionary values.
    values_builder: FixedSizeBinaryBuilder,
    /// Width, in bytes, of every dictionary value.
    byte_width: i32,
}
impl<K> FixedSizeBinaryDictionaryBuilder<K>
where
    K: ArrowDictionaryKeyType,
{
    /// Creates an empty builder whose dictionary values are each `byte_width`
    /// bytes long.
    ///
    /// The deduplication table is pre-sized from the capacity reported by the
    /// freshly created keys builder.
    pub fn new(byte_width: i32) -> Self {
        let keys_builder = PrimitiveBuilder::new();
        let values_builder = FixedSizeBinaryBuilder::new(byte_width);
        Self {
            state: Default::default(),
            dedup: HashTable::with_capacity(keys_builder.capacity()),
            keys_builder,
            values_builder,
            byte_width,
        }
    }

    /// Creates an empty builder with pre-allocated space.
    ///
    /// * `keys_capacity` - capacity handed to the keys builder.
    /// * `value_capacity` - capacity handed to the values builder; also used
    ///   to pre-size the deduplication table.
    /// * `byte_width` - width, in bytes, of every dictionary value.
    pub fn with_capacity(keys_capacity: usize, value_capacity: usize, byte_width: i32) -> Self {
        Self {
            state: Default::default(),
            // The table holds one entry per distinct value, so size it like the
            // values builder instead of letting it rehash as the dictionary grows.
            dedup: HashTable::with_capacity(value_capacity),
            keys_builder: PrimitiveBuilder::with_capacity(keys_capacity),
            values_builder: FixedSizeBinaryBuilder::with_capacity(value_capacity, byte_width),
            byte_width,
        }
    }
}
impl<K> ArrayBuilder for FixedSizeBinaryDictionaryBuilder<K>
where
K: ArrowDictionaryKeyType,
{
fn as_any(&self) -> &dyn Any {
self
}
fn as_any_mut(&mut self) -> &mut dyn Any {
self
}
fn into_box_any(self: Box<Self>) -> Box<dyn Any> {
self
}
fn len(&self) -> usize {
self.keys_builder.len()
}
fn finish(&mut self) -> ArrayRef {
Arc::new(self.finish())
}
fn finish_cloned(&self) -> ArrayRef {
Arc::new(self.finish_cloned())
}
}
impl<K> FixedSizeBinaryDictionaryBuilder<K>
where
K: ArrowDictionaryKeyType,
{
fn get_or_insert_key(&mut self, value: impl AsRef<[u8]>) -> Result<K::Native, ArrowError> {
let value_bytes: &[u8] = value.as_ref();
let state = &self.state;
let storage = &mut self.values_builder;
let hash = state.hash_one(value_bytes);
let idx = *self
.dedup
.entry(
hash,
|idx| value_bytes == get_bytes(storage, self.byte_width, *idx),
|idx| state.hash_one(get_bytes(storage, self.byte_width, *idx)),
)
.or_insert_with(|| {
let idx = storage.len();
let _ = storage.append_value(value);
idx
})
.get();
let key = K::Native::from_usize(idx).ok_or(ArrowError::DictionaryKeyOverflowError)?;
Ok(key)
}
/// Appends `value` to the array and returns the dictionary key it maps to.
///
/// The key is obtained from `get_or_insert_key` and then appended to the keys
/// builder.
///
/// # Errors
///
/// * [`ArrowError::InvalidArgumentError`] if `value` is not exactly
///   `byte_width` bytes long; nothing is appended in that case.
/// * Any error returned by `get_or_insert_key`.
pub fn append(&mut self, value: impl AsRef<[u8]>) -> Result<K::Native, ArrowError> {
    let len = value.as_ref().len();
    // Checked conversion on purpose: `len as i32` wraps for very long inputs, which
    // could make an oversized slice compare equal to `byte_width`.
    if i32::try_from(len).ok() != Some(self.byte_width) {
        return Err(ArrowError::InvalidArgumentError(format!(
            "Invalid input length passed to FixedSizeBinaryBuilder. Expected {} got {}",
            self.byte_width, len
        )));
    }

    let key = self.get_or_insert_key(value)?;
    self.keys_builder.append_value(key);
    Ok(key)
}
/// Appends a single null slot to the keys builder; the dictionary values are
/// not touched.
#[inline]
pub fn append_null(&mut self) {
    let keys = &mut self.keys_builder;
    keys.append_null();
}
/// Appends `n` null slots to the keys builder; the dictionary values are not
/// touched.
#[inline]
pub fn append_nulls(&mut self, n: usize) {
    let keys = &mut self.keys_builder;
    keys.append_nulls(n);
}
/// Appends `value` to the array, discarding the key returned by `append`.
///
/// # Panics
///
/// Panics with the message `dictionary key overflow` if `append` returns an
/// error.
pub fn append_value(&mut self, value: impl AsRef<[u8]>) {
    let appended = self.append(value);
    let _key = appended.expect("dictionary key overflow");
}
/// Builds the [`DictionaryArray`] from the accumulated keys and values.
///
/// Clears the deduplication table and finishes both underlying builders.
pub fn finish(&mut self) -> DictionaryArray<K> {
    self.dedup.clear();

    let dictionary_values = self.values_builder.finish().into_data();
    let dictionary_type = DataType::Dictionary(
        Box::new(K::DATA_TYPE),
        Box::new(FixedSizeBinary(self.byte_width)),
    );

    // Re-tag the keys' array data as a dictionary and attach the values as its child.
    let array_data = self
        .keys_builder
        .finish()
        .into_data()
        .into_builder()
        .data_type(dictionary_type)
        .child_data(vec![dictionary_values]);

    // SAFETY: non-null keys are only appended by `append`, which takes them from
    // `get_or_insert_key` as the index of a value held in `values_builder`, and the
    // data type is assembled from `K::DATA_TYPE` and this builder's `byte_width`.
    let array_data = unsafe { array_data.build_unchecked() };
    DictionaryArray::from(array_data)
}
/// Builds a [`DictionaryArray`] from the current contents of the builder.
///
/// Takes `&self`, so the builder (including its deduplication table) is left
/// as it was and can keep accepting values afterwards.
pub fn finish_cloned(&self) -> DictionaryArray<K> {
    let dictionary_values = self.values_builder.finish_cloned().into_data();
    let dictionary_type = DataType::Dictionary(
        Box::new(K::DATA_TYPE),
        Box::new(FixedSizeBinary(self.byte_width)),
    );

    // Re-tag the keys' array data as a dictionary and attach the values as its child.
    let array_data = self
        .keys_builder
        .finish_cloned()
        .into_data()
        .into_builder()
        .data_type(dictionary_type)
        .child_data(vec![dictionary_values]);

    // SAFETY: non-null keys are only appended by `append`, which takes them from
    // `get_or_insert_key` as the index of a value held in `values_builder`, and the
    // data type is assembled from `K::DATA_TYPE` and this builder's `byte_width`.
    let array_data = unsafe { array_data.build_unchecked() };
    DictionaryArray::from(array_data)
}
}
/// Returns the `idx`-th fixed-width value stored in `values` as a byte slice.
///
/// The value occupies `byte_width` bytes starting at byte offset
/// `idx * byte_width` of the builder's values slice.
///
/// # Panics
///
/// Panics if that byte range lies outside the values slice.
fn get_bytes(values: &FixedSizeBinaryBuilder, byte_width: i32, idx: usize) -> &[u8] {
    let width = byte_width.as_usize();
    let offset = idx * width;
    &values.values_slice()[offset..offset + width]
}
#[cfg(test)]
mod tests {
use super::*;
use crate::types::Int8Type;
use crate::{FixedSizeBinaryArray, Int8Array};
/// Appending values and nulls yields deduplicated keys and the distinct values
/// in first-seen order.
#[test]
fn test_fixed_size_dictionary_builder() {
    let (abc, def) = ("abc", "def");
    let mut builder = FixedSizeBinaryDictionaryBuilder::<Int8Type>::new(3);

    // A value gets the next key on first sight; repeats reuse that key.
    assert_eq!(builder.append(abc).unwrap(), 0);
    builder.append_null();
    assert_eq!(builder.append(def).unwrap(), 1);
    assert_eq!(builder.append(def).unwrap(), 1);
    assert_eq!(builder.append(abc).unwrap(), 0);
    builder.append_nulls(2);
    assert_eq!(builder.append(abc).unwrap(), 0);

    let array = builder.finish();

    let expected_keys = Int8Array::from(vec![
        Some(0),
        None,
        Some(1),
        Some(1),
        Some(0),
        None,
        None,
        Some(0),
    ]);
    assert_eq!(array.keys(), &expected_keys);

    let dictionary = array
        .values()
        .as_any()
        .downcast_ref::<FixedSizeBinaryArray>()
        .unwrap();
    assert_eq!(dictionary.value(0), abc.as_bytes());
    assert_eq!(dictionary.value(1), def.as_bytes());
}
/// Inputs whose length differs from the configured width are rejected with an
/// invalid-argument error.
#[test]
fn test_fixed_size_dictionary_builder_wrong_size() {
    let mut builder = FixedSizeBinaryDictionaryBuilder::<Int8Type>::new(3);

    // (input, expected error text) for a too-long and an empty input.
    let cases: [(&[u8], &str); 2] = [
        (
            b"too long",
            "Invalid argument error: Invalid input length passed to FixedSizeBinaryBuilder. Expected 3 got 8",
        ),
        (
            b"",
            "Invalid argument error: Invalid input length passed to FixedSizeBinaryBuilder. Expected 3 got 0",
        ),
    ];

    for (input, expected) in cases {
        let message = builder.append(input).unwrap_err().to_string();
        assert_eq!(message, expected);
    }
}
/// `finish_cloned` snapshots the builder without disturbing it: later appends
/// still deduplicate against values stored before the snapshot.
#[test]
fn test_fixed_size_dictionary_builder_finish_cloned() {
    let values = ["abc", "def", "ghi"];
    let mut builder = FixedSizeBinaryDictionaryBuilder::<Int8Type>::new(3);

    builder.append(values[0]).unwrap();
    builder.append_null();
    builder.append(values[1]).unwrap();
    builder.append(values[1]).unwrap();
    builder.append(values[0]).unwrap();

    // Snapshot taken part-way through.
    let snapshot = builder.finish_cloned();
    let snapshot_keys = Int8Array::from(vec![Some(0), None, Some(1), Some(1), Some(0)]);
    assert_eq!(snapshot.keys(), &snapshot_keys);

    let snapshot_values = snapshot
        .values()
        .as_any()
        .downcast_ref::<FixedSizeBinaryArray>()
        .unwrap();
    for (i, expected) in values.iter().take(2).enumerate() {
        assert_eq!(snapshot_values.value(i), expected.as_bytes());
    }

    // Keep appending to the same builder, then finish it for real.
    builder.append(values[0]).unwrap();
    builder.append(values[2]).unwrap();
    builder.append(values[1]).unwrap();

    let finished = builder.finish();
    let finished_keys = Int8Array::from(vec![
        Some(0),
        None,
        Some(1),
        Some(1),
        Some(0),
        Some(0),
        Some(2),
        Some(1),
    ]);
    assert_eq!(finished.keys(), &finished_keys);

    let finished_values = finished
        .values()
        .as_any()
        .downcast_ref::<FixedSizeBinaryArray>()
        .unwrap();
    for (i, expected) in values.iter().enumerate() {
        assert_eq!(finished_values.value(i), expected.as_bytes());
    }
}
}