1use crate::builder::{ArrayBuilder, FixedSizeBinaryBuilder, PrimitiveBuilder};
19use crate::types::ArrowDictionaryKeyType;
20use crate::{Array, ArrayRef, DictionaryArray, PrimitiveArray};
21use arrow_buffer::ArrowNativeType;
22use arrow_schema::DataType::FixedSizeBinary;
23use arrow_schema::{ArrowError, DataType};
24use hashbrown::HashTable;
25use num_traits::NumCast;
26use std::any::Any;
27use std::sync::Arc;
28
29#[derive(Debug)]
61pub struct FixedSizeBinaryDictionaryBuilder<K>
62where
63 K: ArrowDictionaryKeyType,
64{
65 state: ahash::RandomState,
66 dedup: HashTable<usize>,
67
68 keys_builder: PrimitiveBuilder<K>,
69 values_builder: FixedSizeBinaryBuilder,
70 byte_width: i32,
71}
72
73impl<K> FixedSizeBinaryDictionaryBuilder<K>
74where
75 K: ArrowDictionaryKeyType,
76{
77 pub fn new(byte_width: i32) -> Self {
79 let keys_builder = PrimitiveBuilder::new();
80 let values_builder = FixedSizeBinaryBuilder::new(byte_width);
81 Self {
82 state: Default::default(),
83 dedup: HashTable::with_capacity(keys_builder.capacity()),
84 keys_builder,
85 values_builder,
86 byte_width,
87 }
88 }
89
90 pub fn with_capacity(keys_capacity: usize, value_capacity: usize, byte_width: i32) -> Self {
96 Self {
97 state: Default::default(),
98 dedup: Default::default(),
99 keys_builder: PrimitiveBuilder::with_capacity(keys_capacity),
100 values_builder: FixedSizeBinaryBuilder::with_capacity(value_capacity, byte_width),
101 byte_width,
102 }
103 }
104
105 pub fn try_new_from_builder<K2>(
131 mut source: FixedSizeBinaryDictionaryBuilder<K2>,
132 ) -> Result<Self, ArrowError>
133 where
134 K::Native: NumCast,
135 K2: ArrowDictionaryKeyType,
136 K2::Native: NumCast,
137 {
138 let state = source.state;
139 let dedup = source.dedup;
140 let values_builder = source.values_builder;
141 let byte_width = source.byte_width;
142
143 let source_keys = source.keys_builder.finish();
144 let new_keys: PrimitiveArray<K> = source_keys.try_unary(|value| {
145 num_traits::cast::cast::<K2::Native, K::Native>(value).ok_or_else(|| {
146 ArrowError::CastError(format!(
147 "Can't cast dictionary keys from source type {:?} to type {:?}",
148 K2::DATA_TYPE,
149 K::DATA_TYPE
150 ))
151 })
152 })?;
153
154 drop(source_keys);
158
159 Ok(Self {
160 state,
161 dedup,
162 keys_builder: new_keys
163 .into_builder()
164 .expect("underlying buffer has no references"),
165 values_builder,
166 byte_width,
167 })
168 }
169}
170
171impl<K> ArrayBuilder for FixedSizeBinaryDictionaryBuilder<K>
172where
173 K: ArrowDictionaryKeyType,
174{
175 fn as_any(&self) -> &dyn Any {
177 self
178 }
179
180 fn as_any_mut(&mut self) -> &mut dyn Any {
182 self
183 }
184
185 fn into_box_any(self: Box<Self>) -> Box<dyn Any> {
187 self
188 }
189
190 fn len(&self) -> usize {
192 self.keys_builder.len()
193 }
194
195 fn finish(&mut self) -> ArrayRef {
197 Arc::new(self.finish())
198 }
199
200 fn finish_cloned(&self) -> ArrayRef {
202 Arc::new(self.finish_cloned())
203 }
204
205 fn finish_preserve_values(&mut self) -> ArrayRef {
206 Arc::new(self.finish_preserve_values())
207 }
208}
209
210impl<K> FixedSizeBinaryDictionaryBuilder<K>
211where
212 K: ArrowDictionaryKeyType,
213{
214 fn get_or_insert_key(&mut self, value: impl AsRef<[u8]>) -> Result<K::Native, ArrowError> {
215 let value_bytes: &[u8] = value.as_ref();
216
217 let state = &self.state;
218 let storage = &mut self.values_builder;
219 let hash = state.hash_one(value_bytes);
220
221 let idx = *self
222 .dedup
223 .entry(
224 hash,
225 |idx| value_bytes == get_bytes(storage, self.byte_width, *idx),
226 |idx| state.hash_one(get_bytes(storage, self.byte_width, *idx)),
227 )
228 .or_insert_with(|| {
229 let idx = storage.len();
230 let _ = storage.append_value(value);
231 idx
232 })
233 .get();
234
235 let key = K::Native::from_usize(idx).ok_or(ArrowError::DictionaryKeyOverflowError)?;
236
237 Ok(key)
238 }
239
240 pub fn append(&mut self, value: impl AsRef<[u8]>) -> Result<K::Native, ArrowError> {
246 if self.byte_width != value.as_ref().len() as i32 {
247 Err(ArrowError::InvalidArgumentError(format!(
248 "Invalid input length passed to FixedSizeBinaryBuilder. Expected {} got {}",
249 self.byte_width,
250 value.as_ref().len()
251 )))
252 } else {
253 let key = self.get_or_insert_key(value)?;
254 self.keys_builder.append_value(key);
255 Ok(key)
256 }
257 }
258
259 pub fn append_n(
264 &mut self,
265 value: impl AsRef<[u8]>,
266 count: usize,
267 ) -> Result<K::Native, ArrowError> {
268 if self.byte_width != value.as_ref().len() as i32 {
269 Err(ArrowError::InvalidArgumentError(format!(
270 "Invalid input length passed to FixedSizeBinaryBuilder. Expected {} got {}",
271 self.byte_width,
272 value.as_ref().len()
273 )))
274 } else {
275 let key = self.get_or_insert_key(value)?;
276 self.keys_builder.append_value_n(key, count);
277 Ok(key)
278 }
279 }
280
281 #[inline]
283 pub fn append_null(&mut self) {
284 self.keys_builder.append_null()
285 }
286
287 #[inline]
289 pub fn append_nulls(&mut self, n: usize) {
290 self.keys_builder.append_nulls(n);
291 }
292
293 pub fn append_value(&mut self, value: impl AsRef<[u8]>) {
299 self.append(value).expect("dictionary key overflow");
300 }
301
302 pub fn finish(&mut self) -> DictionaryArray<K> {
304 self.dedup.clear();
305 let values = self.values_builder.finish();
306 let keys = self.keys_builder.finish();
307
308 let data_type = DataType::Dictionary(
309 Box::new(K::DATA_TYPE),
310 Box::new(FixedSizeBinary(self.byte_width)),
311 );
312
313 let builder = keys
314 .into_data()
315 .into_builder()
316 .data_type(data_type)
317 .child_data(vec![values.into_data()]);
318
319 DictionaryArray::from(unsafe { builder.build_unchecked() })
320 }
321
322 pub fn finish_cloned(&self) -> DictionaryArray<K> {
324 let values = self.values_builder.finish_cloned();
325 let keys = self.keys_builder.finish_cloned();
326
327 let data_type = DataType::Dictionary(
328 Box::new(K::DATA_TYPE),
329 Box::new(FixedSizeBinary(self.byte_width)),
330 );
331
332 let builder = keys
333 .into_data()
334 .into_builder()
335 .data_type(data_type)
336 .child_data(vec![values.into_data()]);
337
338 DictionaryArray::from(unsafe { builder.build_unchecked() })
339 }
340
341 pub fn finish_preserve_values(&mut self) -> DictionaryArray<K> {
359 let values = self.values_builder.finish_cloned();
360 let keys = self.keys_builder.finish();
361
362 let data_type = DataType::Dictionary(
363 Box::new(K::DATA_TYPE),
364 Box::new(FixedSizeBinary(self.byte_width)),
365 );
366
367 let builder = keys
368 .into_data()
369 .into_builder()
370 .data_type(data_type)
371 .child_data(vec![values.into_data()]);
372
373 DictionaryArray::from(unsafe { builder.build_unchecked() })
374 }
375}
376
377fn get_bytes(values: &FixedSizeBinaryBuilder, byte_width: i32, idx: usize) -> &[u8] {
378 let values = values.values_slice();
379 let start = idx * byte_width.as_usize();
380 let end = idx * byte_width.as_usize() + byte_width.as_usize();
381 &values[start..end]
382}
383
#[cfg(test)]
mod tests {
    use super::*;

    use crate::types::{Int8Type, Int16Type, Int32Type, UInt8Type, UInt16Type};
    use crate::{ArrowPrimitiveType, FixedSizeBinaryArray, Int8Array};

    /// Returns the dictionary values of `dict` downcast to a [`FixedSizeBinaryArray`].
    fn dictionary_values<K>(dict: &DictionaryArray<K>) -> &FixedSizeBinaryArray
    where
        K: ArrowDictionaryKeyType,
    {
        dict.values()
            .as_any()
            .downcast_ref::<FixedSizeBinaryArray>()
            .unwrap()
    }

    #[test]
    fn test_fixed_size_dictionary_builder() {
        let (abc, def) = ("abc", "def");

        let mut builder = FixedSizeBinaryDictionaryBuilder::<Int8Type>::new(3);
        assert_eq!(builder.append(abc).unwrap(), 0);
        builder.append_null();
        assert_eq!(builder.append(def).unwrap(), 1);
        assert_eq!(builder.append(def).unwrap(), 1);
        assert_eq!(builder.append(abc).unwrap(), 0);
        builder.append_nulls(2);
        assert_eq!(builder.append(abc).unwrap(), 0);
        let dict = builder.finish();

        let expected_keys = Int8Array::from(vec![
            Some(0),
            None,
            Some(1),
            Some(1),
            Some(0),
            None,
            None,
            Some(0),
        ]);
        assert_eq!(dict.keys(), &expected_keys);

        // Each distinct value is stored once, in first-seen order.
        let stored = dictionary_values(&dict);
        assert_eq!(stored.value(0), abc.as_bytes());
        assert_eq!(stored.value(1), def.as_bytes());
    }

    #[test]
    fn test_fixed_size_dictionary_builder_append_n() {
        let (abc, def) = ("abc", "def");

        let mut builder = FixedSizeBinaryDictionaryBuilder::<Int8Type>::new(3);
        assert_eq!(builder.append_n(abc, 2).unwrap(), 0);
        assert_eq!(builder.append_n(def, 3).unwrap(), 1);
        assert_eq!(builder.append_n(abc, 2).unwrap(), 0);
        let dict = builder.finish();

        let expected_keys = Int8Array::from(vec![
            Some(0),
            Some(0),
            Some(1),
            Some(1),
            Some(1),
            Some(0),
            Some(0),
        ]);
        assert_eq!(dict.keys(), &expected_keys);

        let stored = dictionary_values(&dict);
        assert_eq!(stored.value(0), abc.as_bytes());
        assert_eq!(stored.value(1), def.as_bytes());
    }

    #[test]
    fn test_fixed_size_dictionary_builder_wrong_size() {
        let mut builder = FixedSizeBinaryDictionaryBuilder::<Int8Type>::new(3);

        let too_long = builder.append(b"too long").unwrap_err();
        assert_eq!(
            too_long.to_string(),
            "Invalid argument error: Invalid input length passed to FixedSizeBinaryBuilder. Expected 3 got 8"
        );

        let empty = builder.append("").unwrap_err();
        assert_eq!(
            empty.to_string(),
            "Invalid argument error: Invalid input length passed to FixedSizeBinaryBuilder. Expected 3 got 0"
        );

        let too_short = builder.append_n("a", 3).unwrap_err();
        assert_eq!(
            too_short.to_string(),
            "Invalid argument error: Invalid input length passed to FixedSizeBinaryBuilder. Expected 3 got 1"
        );
    }

    #[test]
    fn test_fixed_size_dictionary_builder_finish_cloned() {
        let (abc, def, ghi) = ("abc", "def", "ghi");

        let mut builder = FixedSizeBinaryDictionaryBuilder::<Int8Type>::new(3);

        builder.append(abc).unwrap();
        builder.append_null();
        builder.append(def).unwrap();
        builder.append(def).unwrap();
        builder.append(abc).unwrap();

        // A cloned snapshot must not disturb the builder's state.
        let snapshot = builder.finish_cloned();
        assert_eq!(
            snapshot.keys(),
            &Int8Array::from(vec![Some(0), None, Some(1), Some(1), Some(0)])
        );

        let snapshot_values = dictionary_values(&snapshot);
        assert_eq!(snapshot_values.value(0), abc.as_bytes());
        assert_eq!(snapshot_values.value(1), def.as_bytes());

        builder.append(abc).unwrap();
        builder.append(ghi).unwrap();
        builder.append(def).unwrap();

        let finished = builder.finish();
        let expected_keys = Int8Array::from(vec![
            Some(0),
            None,
            Some(1),
            Some(1),
            Some(0),
            Some(0),
            Some(2),
            Some(1),
        ]);
        assert_eq!(finished.keys(), &expected_keys);

        let finished_values = dictionary_values(&finished);
        assert_eq!(finished_values.value(0), abc.as_bytes());
        assert_eq!(finished_values.value(1), def.as_bytes());
        assert_eq!(finished_values.value(2), ghi.as_bytes());
    }

    /// Fills a builder keyed by `K1`, converts it to a builder keyed by `K2` and checks
    /// that keys and values survive the conversion.
    fn _test_try_new_from_builder_generic_for_key_types<K1, K2>(values: Vec<[u8; 3]>)
    where
        K1: ArrowDictionaryKeyType,
        K1::Native: NumCast,
        K2: ArrowDictionaryKeyType,
        K2::Native: NumCast + From<u8>,
    {
        let mut source = FixedSizeBinaryDictionaryBuilder::<K1>::new(3);
        source.append_value(values[0]);
        source.append_null();
        source.append_value(values[1]);
        source.append_value(values[2]);

        let mut converted =
            FixedSizeBinaryDictionaryBuilder::<K2>::try_new_from_builder(source).unwrap();
        let dict = converted.finish();

        let mut expected_keys_builder = PrimitiveBuilder::<K2>::new();
        for key in [Some(0u8), None, Some(1u8), Some(2u8)] {
            expected_keys_builder
                .append_option(key.map(<<K2 as ArrowPrimitiveType>::Native as From<u8>>::from));
        }
        let expected_keys = expected_keys_builder.finish();
        assert_eq!(dict.keys(), &expected_keys);

        let stored = dictionary_values(&dict);
        assert_eq!(stored.value(0), values[0]);
        assert_eq!(stored.value(1), values[1]);
        assert_eq!(stored.value(2), values[2]);
    }

    #[test]
    fn test_try_new_from_builder() {
        let values = vec![[1, 2, 3], [5, 6, 7], [6, 7, 8]];
        // Unsigned, widening.
        _test_try_new_from_builder_generic_for_key_types::<UInt8Type, UInt16Type>(values.clone());
        // Unsigned, narrowing.
        _test_try_new_from_builder_generic_for_key_types::<UInt16Type, UInt8Type>(values.clone());
        // Signed, widening.
        _test_try_new_from_builder_generic_for_key_types::<Int8Type, Int16Type>(values.clone());
        // Signed, narrowing.
        _test_try_new_from_builder_generic_for_key_types::<Int32Type, Int16Type>(values.clone());
        // Mixed signedness and widths.
        _test_try_new_from_builder_generic_for_key_types::<UInt8Type, Int16Type>(values.clone());
        _test_try_new_from_builder_generic_for_key_types::<Int8Type, UInt8Type>(values.clone());
        _test_try_new_from_builder_generic_for_key_types::<Int8Type, UInt16Type>(values.clone());
        _test_try_new_from_builder_generic_for_key_types::<Int32Type, Int16Type>(values.clone());
    }

    #[test]
    fn test_try_new_from_builder_cast_fails() {
        // 257 distinct two-byte values produce keys 0..=256; key 256 does not fit in `u8`.
        let mut source = FixedSizeBinaryDictionaryBuilder::<UInt16Type>::new(2);
        for i in 0u16..257u16 {
            source.append_value(i.to_be_bytes());
        }

        match FixedSizeBinaryDictionaryBuilder::<UInt8Type>::try_new_from_builder(source) {
            Ok(_) => panic!("expected the key cast to fail"),
            Err(e) => {
                assert!(matches!(e, ArrowError::CastError(_)));
                assert_eq!(
                    e.to_string(),
                    "Cast error: Can't cast dictionary keys from source type UInt16 to type UInt8"
                );
            }
        }
    }

    #[test]
    fn test_finish_preserve_values() {
        let mut builder = FixedSizeBinaryDictionaryBuilder::<Int32Type>::new(3);
        builder.append_value("aaa");
        builder.append_value("bbb");
        builder.append_value("ccc");

        let first = builder.finish_preserve_values();
        assert_eq!(first.keys().values(), &[0, 1, 2]);
        let first_elements = first
            .downcast_dict::<FixedSizeBinaryArray>()
            .unwrap()
            .into_iter()
            .collect::<Vec<_>>();
        assert_eq!(
            first_elements,
            vec![
                Some("aaa".as_bytes()),
                Some("bbb".as_bytes()),
                Some("ccc".as_bytes())
            ]
        );

        builder.append_value("ddd");
        builder.append_value("eee");
        let second = builder.finish_preserve_values();

        // Keys continue after the values kept from the first batch.
        assert_eq!(second.keys().values(), &[3, 4]);
        let second_elements = second
            .downcast_dict::<FixedSizeBinaryArray>()
            .unwrap()
            .into_iter()
            .collect::<Vec<_>>();
        assert_eq!(
            second_elements,
            [Some("ddd".as_bytes()), Some("eee".as_bytes())]
        );

        // The dictionary of the second array still holds every value appended so far.
        let all_values = dictionary_values(&second).into_iter().collect::<Vec<_>>();
        assert_eq!(
            all_values,
            [
                Some("aaa".as_bytes()),
                Some("bbb".as_bytes()),
                Some("ccc".as_bytes()),
                Some("ddd".as_bytes()),
                Some("eee".as_bytes())
            ]
        );
    }
}