1use crate::builder::{ArrayBuilder, FixedSizeBinaryBuilder, PrimitiveBuilder};
19use crate::types::ArrowDictionaryKeyType;
20use crate::{Array, ArrayRef, DictionaryArray};
21use arrow_buffer::ArrowNativeType;
22use arrow_schema::DataType::FixedSizeBinary;
23use arrow_schema::{ArrowError, DataType};
24use hashbrown::HashTable;
25use std::any::Any;
26use std::sync::Arc;
27
28#[derive(Debug)]
60pub struct FixedSizeBinaryDictionaryBuilder<K>
61where
62 K: ArrowDictionaryKeyType,
63{
64 state: ahash::RandomState,
65 dedup: HashTable<usize>,
66
67 keys_builder: PrimitiveBuilder<K>,
68 values_builder: FixedSizeBinaryBuilder,
69 byte_width: i32,
70}
71
72impl<K> FixedSizeBinaryDictionaryBuilder<K>
73where
74 K: ArrowDictionaryKeyType,
75{
76 pub fn new(byte_width: i32) -> Self {
78 let keys_builder = PrimitiveBuilder::new();
79 let values_builder = FixedSizeBinaryBuilder::new(byte_width);
80 Self {
81 state: Default::default(),
82 dedup: HashTable::with_capacity(keys_builder.capacity()),
83 keys_builder,
84 values_builder,
85 byte_width,
86 }
87 }
88
89 pub fn with_capacity(keys_capacity: usize, value_capacity: usize, byte_width: i32) -> Self {
95 Self {
96 state: Default::default(),
97 dedup: Default::default(),
98 keys_builder: PrimitiveBuilder::with_capacity(keys_capacity),
99 values_builder: FixedSizeBinaryBuilder::with_capacity(value_capacity, byte_width),
100 byte_width,
101 }
102 }
103}
104
105impl<K> ArrayBuilder for FixedSizeBinaryDictionaryBuilder<K>
106where
107 K: ArrowDictionaryKeyType,
108{
109 fn as_any(&self) -> &dyn Any {
111 self
112 }
113
114 fn as_any_mut(&mut self) -> &mut dyn Any {
116 self
117 }
118
119 fn into_box_any(self: Box<Self>) -> Box<dyn Any> {
121 self
122 }
123
124 fn len(&self) -> usize {
126 self.keys_builder.len()
127 }
128
129 fn finish(&mut self) -> ArrayRef {
131 Arc::new(self.finish())
132 }
133
134 fn finish_cloned(&self) -> ArrayRef {
136 Arc::new(self.finish_cloned())
137 }
138}
139
140impl<K> FixedSizeBinaryDictionaryBuilder<K>
141where
142 K: ArrowDictionaryKeyType,
143{
144 fn get_or_insert_key(&mut self, value: impl AsRef<[u8]>) -> Result<K::Native, ArrowError> {
145 let value_bytes: &[u8] = value.as_ref();
146
147 let state = &self.state;
148 let storage = &mut self.values_builder;
149 let hash = state.hash_one(value_bytes);
150
151 let idx = *self
152 .dedup
153 .entry(
154 hash,
155 |idx| value_bytes == get_bytes(storage, self.byte_width, *idx),
156 |idx| state.hash_one(get_bytes(storage, self.byte_width, *idx)),
157 )
158 .or_insert_with(|| {
159 let idx = storage.len();
160 let _ = storage.append_value(value);
161 idx
162 })
163 .get();
164
165 let key = K::Native::from_usize(idx).ok_or(ArrowError::DictionaryKeyOverflowError)?;
166
167 Ok(key)
168 }
169
170 pub fn append(&mut self, value: impl AsRef<[u8]>) -> Result<K::Native, ArrowError> {
176 if self.byte_width != value.as_ref().len() as i32 {
177 Err(ArrowError::InvalidArgumentError(format!(
178 "Invalid input length passed to FixedSizeBinaryBuilder. Expected {} got {}",
179 self.byte_width,
180 value.as_ref().len()
181 )))
182 } else {
183 let key = self.get_or_insert_key(value)?;
184 self.keys_builder.append_value(key);
185 Ok(key)
186 }
187 }
188
189 #[inline]
191 pub fn append_null(&mut self) {
192 self.keys_builder.append_null()
193 }
194
195 #[inline]
197 pub fn append_nulls(&mut self, n: usize) {
198 self.keys_builder.append_nulls(n);
199 }
200
201 pub fn append_value(&mut self, value: impl AsRef<[u8]>) {
207 self.append(value).expect("dictionary key overflow");
208 }
209
210 pub fn finish(&mut self) -> DictionaryArray<K> {
212 self.dedup.clear();
213 let values = self.values_builder.finish();
214 let keys = self.keys_builder.finish();
215
216 let data_type = DataType::Dictionary(
217 Box::new(K::DATA_TYPE),
218 Box::new(FixedSizeBinary(self.byte_width)),
219 );
220
221 let builder = keys
222 .into_data()
223 .into_builder()
224 .data_type(data_type)
225 .child_data(vec![values.into_data()]);
226
227 DictionaryArray::from(unsafe { builder.build_unchecked() })
228 }
229
230 pub fn finish_cloned(&self) -> DictionaryArray<K> {
232 let values = self.values_builder.finish_cloned();
233 let keys = self.keys_builder.finish_cloned();
234
235 let data_type = DataType::Dictionary(
236 Box::new(K::DATA_TYPE),
237 Box::new(FixedSizeBinary(self.byte_width)),
238 );
239
240 let builder = keys
241 .into_data()
242 .into_builder()
243 .data_type(data_type)
244 .child_data(vec![values.into_data()]);
245
246 DictionaryArray::from(unsafe { builder.build_unchecked() })
247 }
248}
249
250fn get_bytes(values: &FixedSizeBinaryBuilder, byte_width: i32, idx: usize) -> &[u8] {
251 let values = values.values_slice();
252 let start = idx * byte_width.as_usize();
253 let end = idx * byte_width.as_usize() + byte_width.as_usize();
254 &values[start..end]
255}
256
257#[cfg(test)]
258mod tests {
259 use super::*;
260
261 use crate::types::Int8Type;
262 use crate::{FixedSizeBinaryArray, Int8Array};
263
264 #[test]
265 fn test_fixed_size_dictionary_builder() {
266 let values = ["abc", "def"];
267
268 let mut b = FixedSizeBinaryDictionaryBuilder::<Int8Type>::new(3);
269 assert_eq!(b.append(values[0]).unwrap(), 0);
270 b.append_null();
271 assert_eq!(b.append(values[1]).unwrap(), 1);
272 assert_eq!(b.append(values[1]).unwrap(), 1);
273 assert_eq!(b.append(values[0]).unwrap(), 0);
274 b.append_nulls(2);
275 assert_eq!(b.append(values[0]).unwrap(), 0);
276 let array = b.finish();
277
278 assert_eq!(
279 array.keys(),
280 &Int8Array::from(vec![
281 Some(0),
282 None,
283 Some(1),
284 Some(1),
285 Some(0),
286 None,
287 None,
288 Some(0)
289 ]),
290 );
291
292 let ava = array
294 .values()
295 .as_any()
296 .downcast_ref::<FixedSizeBinaryArray>()
297 .unwrap();
298
299 assert_eq!(ava.value(0), values[0].as_bytes());
300 assert_eq!(ava.value(1), values[1].as_bytes());
301 }
302
303 #[test]
304 fn test_fixed_size_dictionary_builder_wrong_size() {
305 let mut b = FixedSizeBinaryDictionaryBuilder::<Int8Type>::new(3);
306 let err = b.append(b"too long").unwrap_err().to_string();
307 assert_eq!(err, "Invalid argument error: Invalid input length passed to FixedSizeBinaryBuilder. Expected 3 got 8");
308 let err = b.append("").unwrap_err().to_string();
309 assert_eq!(err, "Invalid argument error: Invalid input length passed to FixedSizeBinaryBuilder. Expected 3 got 0");
310 }
311
312 #[test]
313 fn test_fixed_size_dictionary_builder_finish_cloned() {
314 let values = ["abc", "def", "ghi"];
315
316 let mut builder = FixedSizeBinaryDictionaryBuilder::<Int8Type>::new(3);
317
318 builder.append(values[0]).unwrap();
319 builder.append_null();
320 builder.append(values[1]).unwrap();
321 builder.append(values[1]).unwrap();
322 builder.append(values[0]).unwrap();
323 let mut array = builder.finish_cloned();
324
325 assert_eq!(
326 array.keys(),
327 &Int8Array::from(vec![Some(0), None, Some(1), Some(1), Some(0)])
328 );
329
330 let ava = array
332 .values()
333 .as_any()
334 .downcast_ref::<FixedSizeBinaryArray>()
335 .unwrap();
336
337 assert_eq!(ava.value(0), values[0].as_bytes());
338 assert_eq!(ava.value(1), values[1].as_bytes());
339
340 builder.append(values[0]).unwrap();
341 builder.append(values[2]).unwrap();
342 builder.append(values[1]).unwrap();
343
344 array = builder.finish();
345
346 assert_eq!(
347 array.keys(),
348 &Int8Array::from(vec![
349 Some(0),
350 None,
351 Some(1),
352 Some(1),
353 Some(0),
354 Some(0),
355 Some(2),
356 Some(1)
357 ])
358 );
359
360 let ava2 = array
362 .values()
363 .as_any()
364 .downcast_ref::<FixedSizeBinaryArray>()
365 .unwrap();
366
367 assert_eq!(ava2.value(0), values[0].as_bytes());
368 assert_eq!(ava2.value(1), values[1].as_bytes());
369 assert_eq!(ava2.value(2), values[2].as_bytes());
370 }
371}