Skip to main content

arrow_array/builder/
generic_bytes_dictionary_builder.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use crate::builder::{ArrayBuilder, GenericByteBuilder, PrimitiveBuilder};
19use crate::types::{ArrowDictionaryKeyType, ByteArrayType, GenericBinaryType, GenericStringType};
20use crate::{
21    Array, ArrayRef, DictionaryArray, GenericByteArray, PrimitiveArray, TypedDictionaryArray,
22};
23use arrow_buffer::ArrowNativeType;
24use arrow_schema::{ArrowError, DataType};
25use hashbrown::HashTable;
26use num_traits::NumCast;
27use std::any::Any;
28use std::sync::Arc;
29
30/// Builder for [`DictionaryArray`] of [`GenericByteArray`]
31///
32/// For example to map a set of byte indices to String values. Note that
33/// the use of a `HashMap` here will not scale to very large arrays or
34/// result in an ordered dictionary.
35#[derive(Debug)]
36pub struct GenericByteDictionaryBuilder<K, T>
37where
38    K: ArrowDictionaryKeyType,
39    T: ByteArrayType,
40{
41    state: ahash::RandomState,
42    dedup: HashTable<usize>,
43
44    keys_builder: PrimitiveBuilder<K>,
45    values_builder: GenericByteBuilder<T>,
46}
47
48impl<K, T> Default for GenericByteDictionaryBuilder<K, T>
49where
50    K: ArrowDictionaryKeyType,
51    T: ByteArrayType,
52{
53    fn default() -> Self {
54        Self::new()
55    }
56}
57
58impl<K, T> GenericByteDictionaryBuilder<K, T>
59where
60    K: ArrowDictionaryKeyType,
61    T: ByteArrayType,
62{
63    /// Creates a new `GenericByteDictionaryBuilder`
64    pub fn new() -> Self {
65        let keys_builder = PrimitiveBuilder::new();
66        let values_builder = GenericByteBuilder::<T>::new();
67        Self {
68            state: Default::default(),
69            dedup: HashTable::with_capacity(keys_builder.capacity()),
70            keys_builder,
71            values_builder,
72        }
73    }
74
75    /// Creates a new `GenericByteDictionaryBuilder` with the provided capacities
76    ///
77    /// `keys_capacity`: the number of keys, i.e. length of array to build
78    /// `value_capacity`: the number of distinct dictionary values, i.e. size of dictionary
79    /// `data_capacity`: the total number of bytes of all distinct bytes in the dictionary
80    pub fn with_capacity(
81        keys_capacity: usize,
82        value_capacity: usize,
83        data_capacity: usize,
84    ) -> Self {
85        Self {
86            state: Default::default(),
87            dedup: Default::default(),
88            keys_builder: PrimitiveBuilder::with_capacity(keys_capacity),
89            values_builder: GenericByteBuilder::<T>::with_capacity(value_capacity, data_capacity),
90        }
91    }
92
93    /// Creates a new `GenericByteDictionaryBuilder` from a keys capacity and a dictionary
94    /// which is initialized with the given values.
95    /// The indices of those dictionary values are used as keys.
96    ///
97    /// # Example
98    ///
99    /// ```
100    /// # use arrow_array::builder::StringDictionaryBuilder;
101    /// # use arrow_array::{Int16Array, StringArray};
102    ///
103    /// let dictionary_values = StringArray::from(vec![None, Some("abc"), Some("def")]);
104    ///
105    /// let mut builder = StringDictionaryBuilder::new_with_dictionary(3, &dictionary_values).unwrap();
106    /// builder.append("def").unwrap();
107    /// builder.append_null();
108    /// builder.append("abc").unwrap();
109    ///
110    /// let dictionary_array = builder.finish();
111    ///
112    /// let keys = dictionary_array.keys();
113    ///
114    /// assert_eq!(keys, &Int16Array::from(vec![Some(2), None, Some(1)]));
115    /// ```
116    pub fn new_with_dictionary(
117        keys_capacity: usize,
118        dictionary_values: &GenericByteArray<T>,
119    ) -> Result<Self, ArrowError> {
120        let state = ahash::RandomState::default();
121        let dict_len = dictionary_values.len();
122
123        let mut dedup = HashTable::with_capacity(dict_len);
124
125        let values_len = dictionary_values.value_data().len();
126        let mut values_builder = GenericByteBuilder::<T>::with_capacity(dict_len, values_len);
127
128        K::Native::from_usize(dictionary_values.len())
129            .ok_or(ArrowError::DictionaryKeyOverflowError)?;
130
131        for (idx, maybe_value) in dictionary_values.iter().enumerate() {
132            match maybe_value {
133                Some(value) => {
134                    let value_bytes: &[u8] = value.as_ref();
135                    let hash = state.hash_one(value_bytes);
136
137                    dedup
138                        .entry(
139                            hash,
140                            |idx: &usize| value_bytes == get_bytes(&values_builder, *idx),
141                            |idx: &usize| state.hash_one(get_bytes(&values_builder, *idx)),
142                        )
143                        .or_insert(idx);
144
145                    values_builder.append_value(value);
146                }
147                None => values_builder.append_null(),
148            }
149        }
150
151        Ok(Self {
152            state,
153            dedup,
154            keys_builder: PrimitiveBuilder::with_capacity(keys_capacity),
155            values_builder,
156        })
157    }
158
159    /// Creates a new `GenericByteDictionaryBuilder` from the existing builder with the same
160    /// keys and values, but with a new data type for the keys.
161    ///
162    /// # Example
163    /// ```
164    /// #
165    /// # use arrow_array::builder::StringDictionaryBuilder;
166    /// # use arrow_array::types::{UInt8Type, UInt16Type};
167    /// # use arrow_array::UInt16Array;
168    /// # use arrow_schema::ArrowError;
169    ///
170    /// let mut u8_keyed_builder = StringDictionaryBuilder::<UInt8Type>::new();
171    ///
172    /// // appending too many values causes the dictionary to overflow
173    /// for i in 0..256 {
174    ///     u8_keyed_builder.append_value(format!("{}", i));
175    /// }
176    /// let result = u8_keyed_builder.append("256");
177    /// assert!(matches!(result, Err(ArrowError::DictionaryKeyOverflowError{})));
178    ///
179    /// // we need to upgrade to a larger key type
180    /// let mut u16_keyed_builder = StringDictionaryBuilder::<UInt16Type>::try_new_from_builder(u8_keyed_builder).unwrap();
181    /// let dictionary_array = u16_keyed_builder.finish();
182    /// let keys = dictionary_array.keys();
183    ///
184    /// assert_eq!(keys, &UInt16Array::from_iter(0..256));
185    /// ```
186    pub fn try_new_from_builder<K2>(
187        mut source: GenericByteDictionaryBuilder<K2, T>,
188    ) -> Result<Self, ArrowError>
189    where
190        K::Native: NumCast,
191        K2: ArrowDictionaryKeyType,
192        K2::Native: NumCast,
193    {
194        let state = source.state;
195        let dedup = source.dedup;
196        let values_builder = source.values_builder;
197
198        let source_keys = source.keys_builder.finish();
199        let new_keys: PrimitiveArray<K> = source_keys.try_unary(|value| {
200            num_traits::cast::cast::<K2::Native, K::Native>(value).ok_or_else(|| {
201                ArrowError::CastError(format!(
202                    "Can't cast dictionary keys from source type {:?} to type {:?}",
203                    K2::DATA_TYPE,
204                    K::DATA_TYPE
205                ))
206            })
207        })?;
208
209        // drop source key here because currently source_keys and new_keys are holding reference to
210        // the same underlying null_buffer. Below we want to call new_keys.into_builder() it must
211        // be the only reference holder.
212        drop(source_keys);
213
214        Ok(Self {
215            state,
216            dedup,
217            keys_builder: new_keys
218                .into_builder()
219                .expect("underlying buffer has no references"),
220            values_builder,
221        })
222    }
223}
224
225impl<K, T> ArrayBuilder for GenericByteDictionaryBuilder<K, T>
226where
227    K: ArrowDictionaryKeyType,
228    T: ByteArrayType,
229{
230    /// Returns the builder as an non-mutable `Any` reference.
231    fn as_any(&self) -> &dyn Any {
232        self
233    }
234
235    /// Returns the builder as an mutable `Any` reference.
236    fn as_any_mut(&mut self) -> &mut dyn Any {
237        self
238    }
239
240    /// Returns the boxed builder as a box of `Any`.
241    fn into_box_any(self: Box<Self>) -> Box<dyn Any> {
242        self
243    }
244
245    /// Returns the number of array slots in the builder
246    fn len(&self) -> usize {
247        self.keys_builder.len()
248    }
249
250    /// Builds the array and reset this builder.
251    fn finish(&mut self) -> ArrayRef {
252        Arc::new(self.finish())
253    }
254
255    /// Builds the array without resetting the builder.
256    fn finish_cloned(&self) -> ArrayRef {
257        Arc::new(self.finish_cloned())
258    }
259
260    fn finish_preserve_values(&mut self) -> ArrayRef {
261        Arc::new(self.finish_preserve_values())
262    }
263}
264
265impl<K, T> GenericByteDictionaryBuilder<K, T>
266where
267    K: ArrowDictionaryKeyType,
268    T: ByteArrayType,
269{
270    fn get_or_insert_key(&mut self, value: impl AsRef<T::Native>) -> Result<K::Native, ArrowError> {
271        let value_native: &T::Native = value.as_ref();
272        let value_bytes: &[u8] = value_native.as_ref();
273
274        let state = &self.state;
275        let storage = &mut self.values_builder;
276        let hash = state.hash_one(value_bytes);
277
278        let idx = *self
279            .dedup
280            .entry(
281                hash,
282                |idx| value_bytes == get_bytes(storage, *idx),
283                |idx| state.hash_one(get_bytes(storage, *idx)),
284            )
285            .or_insert_with(|| {
286                let idx = storage.len();
287                storage.append_value(value);
288                idx
289            })
290            .get();
291
292        let key = K::Native::from_usize(idx).ok_or(ArrowError::DictionaryKeyOverflowError)?;
293
294        Ok(key)
295    }
296
297    /// Append a value to the array. Return an existing index
298    /// if already present in the values array or a new index if the
299    /// value is appended to the values array.
300    ///
301    /// Returns an error if the new index would overflow the key type.
302    pub fn append(&mut self, value: impl AsRef<T::Native>) -> Result<K::Native, ArrowError> {
303        let key = self.get_or_insert_key(value)?;
304        self.keys_builder.append_value(key);
305        Ok(key)
306    }
307
308    /// Append a value multiple times to the array.
309    /// This is the same as `append` but allows to append the same value multiple times without doing multiple lookups.
310    ///
311    /// Returns an error if the new index would overflow the key type.
312    pub fn append_n(
313        &mut self,
314        value: impl AsRef<T::Native>,
315        count: usize,
316    ) -> Result<K::Native, ArrowError> {
317        let key = self.get_or_insert_key(value)?;
318        self.keys_builder.append_value_n(key, count);
319        Ok(key)
320    }
321
322    /// Infallibly append a value to this builder
323    ///
324    /// # Panics
325    ///
326    /// Panics if the resulting length of the dictionary values array would exceed `T::Native::MAX`
327    pub fn append_value(&mut self, value: impl AsRef<T::Native>) {
328        self.append(value).expect("dictionary key overflow");
329    }
330
331    /// Infallibly append a value to this builder repeatedly `count` times.
332    /// This is the same as `append_value` but allows to append the same value multiple times without doing multiple lookups.
333    ///
334    /// # Panics
335    ///
336    /// Panics if the resulting length of the dictionary values array would exceed `T::Native::MAX`
337    pub fn append_values(&mut self, value: impl AsRef<T::Native>, count: usize) {
338        self.append_n(value, count)
339            .expect("dictionary key overflow");
340    }
341
342    /// Appends a null slot into the builder
343    #[inline]
344    pub fn append_null(&mut self) {
345        self.keys_builder.append_null()
346    }
347
348    /// Infallibly append `n` null slots into the builder
349    #[inline]
350    pub fn append_nulls(&mut self, n: usize) {
351        self.keys_builder.append_nulls(n)
352    }
353
354    /// Append an `Option` value into the builder
355    ///
356    /// # Panics
357    ///
358    /// Panics if the resulting length of the dictionary values array would exceed `T::Native::MAX`
359    #[inline]
360    pub fn append_option(&mut self, value: Option<impl AsRef<T::Native>>) {
361        match value {
362            None => self.append_null(),
363            Some(v) => self.append_value(v),
364        };
365    }
366
367    /// Append an `Option` value into the builder repeatedly `count` times.
368    /// This is the same as `append_option` but allows to append the same value multiple times without doing multiple lookups.
369    ///
370    /// # Panics
371    ///
372    /// Panics if the resulting length of the dictionary values array would exceed `T::Native::MAX`
373    pub fn append_options(&mut self, value: Option<impl AsRef<T::Native>>, count: usize) {
374        match value {
375            None => self.keys_builder.append_nulls(count),
376            Some(v) => self.append_values(v, count),
377        };
378    }
379
380    /// Extends builder with an existing dictionary array.
381    ///
382    /// This is the same as [`Self::extend`] but is faster as it translates
383    /// the dictionary values once rather than doing a lookup for each item in the iterator
384    ///
385    /// when dictionary values are null (the actual mapped values) the keys are null
386    ///
387    pub fn extend_dictionary(
388        &mut self,
389        dictionary: &TypedDictionaryArray<K, GenericByteArray<T>>,
390    ) -> Result<(), ArrowError> {
391        let values = dictionary.values();
392
393        let v_len = values.len();
394        let k_len = dictionary.keys().len();
395        if v_len == 0 && k_len == 0 {
396            return Ok(());
397        }
398
399        // All nulls
400        if v_len == 0 {
401            self.append_nulls(k_len);
402            return Ok(());
403        }
404
405        if k_len == 0 {
406            return Err(ArrowError::InvalidArgumentError(
407                "Dictionary keys should not be empty when values are not empty".to_string(),
408            ));
409        }
410
411        // Orphan values will be carried over to the new dictionary
412        let mapped_values = values
413            .iter()
414            // Dictionary values can technically be null, so we need to handle that
415            .map(|dict_value| {
416                dict_value
417                    .map(|dict_value| self.get_or_insert_key(dict_value))
418                    .transpose()
419            })
420            .collect::<Result<Vec<_>, _>>()?;
421
422        // Just insert the keys without additional lookups
423        dictionary.keys().iter().for_each(|key| match key {
424            None => self.append_null(),
425            Some(original_dict_index) => {
426                let index = original_dict_index.as_usize().min(v_len - 1);
427                match mapped_values[index] {
428                    None => self.append_null(),
429                    Some(mapped_value) => self.keys_builder.append_value(mapped_value),
430                }
431            }
432        });
433
434        Ok(())
435    }
436
437    /// Builds the `DictionaryArray` and reset this builder.
438    pub fn finish(&mut self) -> DictionaryArray<K> {
439        self.dedup.clear();
440        let values = self.values_builder.finish();
441        let keys = self.keys_builder.finish();
442
443        let data_type = DataType::Dictionary(Box::new(K::DATA_TYPE), Box::new(T::DATA_TYPE));
444
445        let builder = keys
446            .into_data()
447            .into_builder()
448            .data_type(data_type)
449            .child_data(vec![values.into_data()]);
450
451        DictionaryArray::from(unsafe { builder.build_unchecked() })
452    }
453
454    /// Builds the `DictionaryArray` without resetting the builder.
455    pub fn finish_cloned(&self) -> DictionaryArray<K> {
456        let values = self.values_builder.finish_cloned();
457        let keys = self.keys_builder.finish_cloned();
458
459        let data_type = DataType::Dictionary(Box::new(K::DATA_TYPE), Box::new(T::DATA_TYPE));
460
461        let builder = keys
462            .into_data()
463            .into_builder()
464            .data_type(data_type)
465            .child_data(vec![values.into_data()]);
466
467        DictionaryArray::from(unsafe { builder.build_unchecked() })
468    }
469
470    /// Builds the `DictionaryArray` without resetting the values builder or
471    /// the internal de-duplication map.
472    ///
473    /// The advantage of doing this is that the values will represent the entire
474    /// set of what has been built so-far by this builder and ensures
475    /// consistency in the assignment of keys to values across multiple calls
476    /// to `finish_preserve_values`. This enables ipc writers to efficiently
477    /// emit delta dictionaries.
478    ///
479    /// The downside to this is that building the record requires creating a
480    /// copy of the values, which can become slowly more expensive if the
481    /// dictionary grows.
482    ///
483    /// Additionally, if record batches from multiple different dictionary
484    /// builders for the same column are fed into a single ipc writer, beware
485    /// that entire dictionaries are likely to be re-sent frequently even when
486    /// the majority of the values are not used by the current record batch.
487    pub fn finish_preserve_values(&mut self) -> DictionaryArray<K> {
488        let values = self.values_builder.finish_cloned();
489        let keys = self.keys_builder.finish();
490
491        let data_type = DataType::Dictionary(Box::new(K::DATA_TYPE), Box::new(T::DATA_TYPE));
492
493        let builder = keys
494            .into_data()
495            .into_builder()
496            .data_type(data_type)
497            .child_data(vec![values.into_data()]);
498
499        DictionaryArray::from(unsafe { builder.build_unchecked() })
500    }
501
502    /// Returns the current null buffer as a slice
503    pub fn validity_slice(&self) -> Option<&[u8]> {
504        self.keys_builder.validity_slice()
505    }
506}
507
508impl<K: ArrowDictionaryKeyType, T: ByteArrayType, V: AsRef<T::Native>> Extend<Option<V>>
509    for GenericByteDictionaryBuilder<K, T>
510{
511    #[inline]
512    fn extend<I: IntoIterator<Item = Option<V>>>(&mut self, iter: I) {
513        for v in iter {
514            self.append_option(v)
515        }
516    }
517}
518
519fn get_bytes<T: ByteArrayType>(values: &GenericByteBuilder<T>, idx: usize) -> &[u8] {
520    let offsets = values.offsets_slice();
521    let values = values.values_slice();
522
523    let end_offset = offsets[idx + 1].as_usize();
524    let start_offset = offsets[idx].as_usize();
525
526    &values[start_offset..end_offset]
527}
528
529/// Builder for [`DictionaryArray`] of [`StringArray`](crate::array::StringArray)
530///
531/// ```
532/// // Create a dictionary array indexed by bytes whose values are Strings.
533/// // It can thus hold up to 256 distinct string values.
534///
535/// # use arrow_array::builder::StringDictionaryBuilder;
536/// # use arrow_array::{Int8Array, StringArray};
537/// # use arrow_array::types::Int8Type;
538///
539/// let mut builder = StringDictionaryBuilder::<Int8Type>::new();
540///
541/// // The builder builds the dictionary value by value
542/// builder.append("abc").unwrap();
543/// builder.append_null();
544/// builder.append_n("def", 2).unwrap();  // appends "def" twice with a single lookup
545/// builder.append("abc").unwrap();
546/// let array = builder.finish();
547///
548/// assert_eq!(
549///   array.keys(),
550///   &Int8Array::from(vec![Some(0), None, Some(1), Some(1), Some(0)])
551/// );
552///
553/// // Values are polymorphic and so require a downcast.
554/// let av = array.values();
555/// let ava: &StringArray = av.as_any().downcast_ref::<StringArray>().unwrap();
556///
557/// assert_eq!(ava.value(0), "abc");
558/// assert_eq!(ava.value(1), "def");
559///
560/// ```
561pub type StringDictionaryBuilder<K> = GenericByteDictionaryBuilder<K, GenericStringType<i32>>;
562
563/// Builder for [`DictionaryArray`] of [`LargeStringArray`](crate::array::LargeStringArray)
564pub type LargeStringDictionaryBuilder<K> = GenericByteDictionaryBuilder<K, GenericStringType<i64>>;
565
566/// Builder for [`DictionaryArray`] of [`BinaryArray`](crate::array::BinaryArray)
567///
568/// ```
569/// // Create a dictionary array indexed by bytes whose values are binary.
570/// // It can thus hold up to 256 distinct binary values.
571///
572/// # use arrow_array::builder::BinaryDictionaryBuilder;
573/// # use arrow_array::{BinaryArray, Int8Array};
574/// # use arrow_array::types::Int8Type;
575///
576/// let mut builder = BinaryDictionaryBuilder::<Int8Type>::new();
577///
578/// // The builder builds the dictionary value by value
579/// builder.append(b"abc").unwrap();
580/// builder.append_null();
581/// builder.append(b"def").unwrap();
582/// builder.append(b"def").unwrap();
583/// builder.append(b"abc").unwrap();
584/// let array = builder.finish();
585///
586/// assert_eq!(
587///   array.keys(),
588///   &Int8Array::from(vec![Some(0), None, Some(1), Some(1), Some(0)])
589/// );
590///
591/// // Values are polymorphic and so require a downcast.
592/// let av = array.values();
593/// let ava: &BinaryArray = av.as_any().downcast_ref::<BinaryArray>().unwrap();
594///
595/// assert_eq!(ava.value(0), b"abc");
596/// assert_eq!(ava.value(1), b"def");
597///
598/// ```
599pub type BinaryDictionaryBuilder<K> = GenericByteDictionaryBuilder<K, GenericBinaryType<i32>>;
600
601/// Builder for [`DictionaryArray`] of [`LargeBinaryArray`](crate::array::LargeBinaryArray)
602pub type LargeBinaryDictionaryBuilder<K> = GenericByteDictionaryBuilder<K, GenericBinaryType<i64>>;
603
604#[cfg(test)]
605mod tests {
606    use super::*;
607
608    use crate::array::Int8Array;
609    use crate::cast::AsArray;
610    use crate::types::{Int8Type, Int16Type, Int32Type, UInt8Type, UInt16Type, Utf8Type};
611    use crate::{ArrowPrimitiveType, BinaryArray, StringArray};
612
613    fn test_bytes_dictionary_builder<T>(values: Vec<&T::Native>)
614    where
615        T: ByteArrayType,
616        <T as ByteArrayType>::Native: PartialEq,
617        <T as ByteArrayType>::Native: AsRef<<T as ByteArrayType>::Native>,
618    {
619        let mut builder = GenericByteDictionaryBuilder::<Int8Type, T>::new();
620        builder.append(values[0]).unwrap();
621        builder.append_null();
622        builder.append(values[1]).unwrap();
623        builder.append(values[1]).unwrap();
624        builder.append(values[0]).unwrap();
625        let array = builder.finish();
626
627        assert_eq!(
628            array.keys(),
629            &Int8Array::from(vec![Some(0), None, Some(1), Some(1), Some(0)])
630        );
631
632        // Values are polymorphic and so require a downcast.
633        let av = array.values();
634        let ava: &GenericByteArray<T> = av.as_any().downcast_ref::<GenericByteArray<T>>().unwrap();
635
636        assert_eq!(*ava.value(0), *values[0]);
637        assert_eq!(*ava.value(1), *values[1]);
638    }
639
640    #[test]
641    fn test_string_dictionary_builder() {
642        test_bytes_dictionary_builder::<GenericStringType<i32>>(vec!["abc", "def"]);
643    }
644
645    #[test]
646    fn test_binary_dictionary_builder() {
647        test_bytes_dictionary_builder::<GenericBinaryType<i32>>(vec![b"abc", b"def"]);
648    }
649
650    fn test_bytes_dictionary_builder_finish_cloned<T>(values: Vec<&T::Native>)
651    where
652        T: ByteArrayType,
653        <T as ByteArrayType>::Native: PartialEq,
654        <T as ByteArrayType>::Native: AsRef<<T as ByteArrayType>::Native>,
655    {
656        let mut builder = GenericByteDictionaryBuilder::<Int8Type, T>::new();
657
658        builder.append(values[0]).unwrap();
659        builder.append_null();
660        builder.append(values[1]).unwrap();
661        builder.append(values[1]).unwrap();
662        builder.append(values[0]).unwrap();
663        let mut array = builder.finish_cloned();
664
665        assert_eq!(
666            array.keys(),
667            &Int8Array::from(vec![Some(0), None, Some(1), Some(1), Some(0)])
668        );
669
670        // Values are polymorphic and so require a downcast.
671        let av = array.values();
672        let ava: &GenericByteArray<T> = av.as_any().downcast_ref::<GenericByteArray<T>>().unwrap();
673
674        assert_eq!(ava.value(0), values[0]);
675        assert_eq!(ava.value(1), values[1]);
676
677        builder.append(values[0]).unwrap();
678        builder.append(values[2]).unwrap();
679        builder.append(values[1]).unwrap();
680
681        array = builder.finish();
682
683        assert_eq!(
684            array.keys(),
685            &Int8Array::from(vec![
686                Some(0),
687                None,
688                Some(1),
689                Some(1),
690                Some(0),
691                Some(0),
692                Some(2),
693                Some(1)
694            ])
695        );
696
697        // Values are polymorphic and so require a downcast.
698        let av2 = array.values();
699        let ava2: &GenericByteArray<T> =
700            av2.as_any().downcast_ref::<GenericByteArray<T>>().unwrap();
701
702        assert_eq!(ava2.value(0), values[0]);
703        assert_eq!(ava2.value(1), values[1]);
704        assert_eq!(ava2.value(2), values[2]);
705    }
706
707    #[test]
708    fn test_string_dictionary_builder_finish_cloned() {
709        test_bytes_dictionary_builder_finish_cloned::<GenericStringType<i32>>(vec![
710            "abc", "def", "ghi",
711        ]);
712    }
713
714    #[test]
715    fn test_binary_dictionary_builder_finish_cloned() {
716        test_bytes_dictionary_builder_finish_cloned::<GenericBinaryType<i32>>(vec![
717            b"abc", b"def", b"ghi",
718        ]);
719    }
720
721    fn _test_try_new_from_builder_generic_for_key_types<K1, K2, T>(values: Vec<&T::Native>)
722    where
723        K1: ArrowDictionaryKeyType,
724        K1::Native: NumCast,
725        K2: ArrowDictionaryKeyType,
726        K2::Native: NumCast + From<u8>,
727        T: ByteArrayType,
728        <T as ByteArrayType>::Native: PartialEq + AsRef<<T as ByteArrayType>::Native>,
729    {
730        let mut source = GenericByteDictionaryBuilder::<K1, T>::new();
731        source.append(values[0]).unwrap();
732        source.append(values[1]).unwrap();
733        source.append_null();
734        source.append(values[2]).unwrap();
735
736        let mut result =
737            GenericByteDictionaryBuilder::<K2, T>::try_new_from_builder(source).unwrap();
738        let array = result.finish();
739
740        let mut expected_keys_builder = PrimitiveBuilder::<K2>::new();
741        expected_keys_builder
742            .append_value(<<K2 as ArrowPrimitiveType>::Native as From<u8>>::from(0u8));
743        expected_keys_builder
744            .append_value(<<K2 as ArrowPrimitiveType>::Native as From<u8>>::from(1u8));
745        expected_keys_builder.append_null();
746        expected_keys_builder
747            .append_value(<<K2 as ArrowPrimitiveType>::Native as From<u8>>::from(2u8));
748        let expected_keys = expected_keys_builder.finish();
749        assert_eq!(array.keys(), &expected_keys);
750
751        let av = array.values();
752        let ava: &GenericByteArray<T> = av.as_any().downcast_ref::<GenericByteArray<T>>().unwrap();
753        assert_eq!(ava.value(0), values[0]);
754        assert_eq!(ava.value(1), values[1]);
755        assert_eq!(ava.value(2), values[2]);
756    }
757
758    fn test_try_new_from_builder<T>(values: Vec<&T::Native>)
759    where
760        T: ByteArrayType,
761        <T as ByteArrayType>::Native: PartialEq + AsRef<<T as ByteArrayType>::Native>,
762    {
763        // test cast to bigger size unsigned
764        _test_try_new_from_builder_generic_for_key_types::<UInt8Type, UInt16Type, T>(
765            values.clone(),
766        );
767        // test cast going to smaller size unsigned
768        _test_try_new_from_builder_generic_for_key_types::<UInt16Type, UInt8Type, T>(
769            values.clone(),
770        );
771        // test cast going to bigger size signed
772        _test_try_new_from_builder_generic_for_key_types::<Int8Type, Int16Type, T>(values.clone());
773        // test cast going to smaller size signed
774        _test_try_new_from_builder_generic_for_key_types::<Int32Type, Int16Type, T>(values.clone());
775        // test going from signed to signed for different size changes
776        _test_try_new_from_builder_generic_for_key_types::<UInt8Type, Int16Type, T>(values.clone());
777        _test_try_new_from_builder_generic_for_key_types::<Int8Type, UInt8Type, T>(values.clone());
778        _test_try_new_from_builder_generic_for_key_types::<Int8Type, UInt16Type, T>(values.clone());
779        _test_try_new_from_builder_generic_for_key_types::<Int32Type, Int16Type, T>(values.clone());
780    }
781
782    #[test]
783    fn test_string_dictionary_builder_try_new_from_builder() {
784        test_try_new_from_builder::<GenericStringType<i32>>(vec!["abc", "def", "ghi"]);
785    }
786
787    #[test]
788    fn test_binary_dictionary_builder_try_new_from_builder() {
789        test_try_new_from_builder::<GenericBinaryType<i32>>(vec![b"abc", b"def", b"ghi"]);
790    }
791
792    #[test]
793    fn test_try_new_from_builder_cast_fails() {
794        let mut source_builder = StringDictionaryBuilder::<UInt16Type>::new();
795        for i in 0..257 {
796            source_builder.append_value(format!("val{i}"));
797        }
798
799        // there should be too many values that we can't downcast to the underlying type
800        // we have keys that wouldn't fit into UInt8Type
801        let result = StringDictionaryBuilder::<UInt8Type>::try_new_from_builder(source_builder);
802        assert!(result.is_err());
803        if let Err(e) = result {
804            assert!(matches!(e, ArrowError::CastError(_)));
805            assert_eq!(
806                e.to_string(),
807                "Cast error: Can't cast dictionary keys from source type UInt16 to type UInt8"
808            );
809        }
810    }
811
812    fn test_bytes_dictionary_builder_with_existing_dictionary<T>(
813        dictionary: GenericByteArray<T>,
814        values: Vec<&T::Native>,
815    ) where
816        T: ByteArrayType,
817        <T as ByteArrayType>::Native: PartialEq,
818        <T as ByteArrayType>::Native: AsRef<<T as ByteArrayType>::Native>,
819    {
820        let mut builder =
821            GenericByteDictionaryBuilder::<Int8Type, T>::new_with_dictionary(6, &dictionary)
822                .unwrap();
823        builder.append(values[0]).unwrap();
824        builder.append_null();
825        builder.append(values[1]).unwrap();
826        builder.append(values[1]).unwrap();
827        builder.append(values[0]).unwrap();
828        builder.append(values[2]).unwrap();
829        let array = builder.finish();
830
831        assert_eq!(
832            array.keys(),
833            &Int8Array::from(vec![Some(2), None, Some(1), Some(1), Some(2), Some(3)])
834        );
835
836        // Values are polymorphic and so require a downcast.
837        let av = array.values();
838        let ava: &GenericByteArray<T> = av.as_any().downcast_ref::<GenericByteArray<T>>().unwrap();
839
840        assert!(!ava.is_valid(0));
841        assert_eq!(ava.value(1), values[1]);
842        assert_eq!(ava.value(2), values[0]);
843        assert_eq!(ava.value(3), values[2]);
844    }
845
846    #[test]
847    fn test_string_dictionary_builder_with_existing_dictionary() {
848        test_bytes_dictionary_builder_with_existing_dictionary::<GenericStringType<i32>>(
849            StringArray::from(vec![None, Some("def"), Some("abc")]),
850            vec!["abc", "def", "ghi"],
851        );
852    }
853
854    #[test]
855    fn test_binary_dictionary_builder_with_existing_dictionary() {
856        let values: Vec<Option<&[u8]>> = vec![None, Some(b"def"), Some(b"abc")];
857        test_bytes_dictionary_builder_with_existing_dictionary::<GenericBinaryType<i32>>(
858            BinaryArray::from(values),
859            vec![b"abc", b"def", b"ghi"],
860        );
861    }
862
863    fn test_bytes_dictionary_builder_with_reserved_null_value<T>(
864        dictionary: GenericByteArray<T>,
865        values: Vec<&T::Native>,
866    ) where
867        T: ByteArrayType,
868        <T as ByteArrayType>::Native: PartialEq,
869        <T as ByteArrayType>::Native: AsRef<<T as ByteArrayType>::Native>,
870    {
871        let mut builder =
872            GenericByteDictionaryBuilder::<Int16Type, T>::new_with_dictionary(4, &dictionary)
873                .unwrap();
874        builder.append(values[0]).unwrap();
875        builder.append_null();
876        builder.append(values[1]).unwrap();
877        builder.append(values[0]).unwrap();
878        let array = builder.finish();
879
880        assert!(array.is_null(1));
881        assert!(!array.is_valid(1));
882
883        let keys = array.keys();
884
885        assert_eq!(keys.value(0), 1);
886        assert!(keys.is_null(1));
887        // zero initialization is currently guaranteed by Buffer allocation and resizing
888        assert_eq!(keys.value(1), 0);
889        assert_eq!(keys.value(2), 2);
890        assert_eq!(keys.value(3), 1);
891    }
892
893    #[test]
894    fn test_string_dictionary_builder_with_reserved_null_value() {
895        let v: Vec<Option<&str>> = vec![None];
896        test_bytes_dictionary_builder_with_reserved_null_value::<GenericStringType<i32>>(
897            StringArray::from(v),
898            vec!["abc", "def"],
899        );
900    }
901
902    #[test]
903    fn test_binary_dictionary_builder_with_reserved_null_value() {
904        let values: Vec<Option<&[u8]>> = vec![None];
905        test_bytes_dictionary_builder_with_reserved_null_value::<GenericBinaryType<i32>>(
906            BinaryArray::from(values),
907            vec![b"abc", b"def"],
908        );
909    }
910
911    #[test]
912    fn test_extend() {
913        let mut builder = GenericByteDictionaryBuilder::<Int32Type, Utf8Type>::new();
914        builder.extend(["a", "b", "c", "a", "b", "c"].into_iter().map(Some));
915        builder.extend(["c", "d", "a"].into_iter().map(Some));
916        let dict = builder.finish();
917        assert_eq!(dict.keys().values(), &[0, 1, 2, 0, 1, 2, 2, 3, 0]);
918        assert_eq!(dict.values().len(), 4);
919    }
920
921    #[test]
922    fn test_extend_dictionary() {
923        let some_dict = {
924            let mut builder = GenericByteDictionaryBuilder::<Int32Type, Utf8Type>::new();
925            builder.extend(["a", "b", "c", "a", "b", "c"].into_iter().map(Some));
926            builder.extend([None::<&str>]);
927            builder.extend(["c", "d", "a"].into_iter().map(Some));
928            builder.append_null();
929            builder.finish()
930        };
931
932        let mut builder = GenericByteDictionaryBuilder::<Int32Type, Utf8Type>::new();
933        builder.extend(["e", "e", "f", "e", "d"].into_iter().map(Some));
934        builder
935            .extend_dictionary(&some_dict.downcast_dict().unwrap())
936            .unwrap();
937        let dict = builder.finish();
938
939        assert_eq!(dict.values().len(), 6);
940
941        let values = dict
942            .downcast_dict::<GenericByteArray<Utf8Type>>()
943            .unwrap()
944            .into_iter()
945            .collect::<Vec<_>>();
946
947        assert_eq!(
948            values,
949            [
950                Some("e"),
951                Some("e"),
952                Some("f"),
953                Some("e"),
954                Some("d"),
955                Some("a"),
956                Some("b"),
957                Some("c"),
958                Some("a"),
959                Some("b"),
960                Some("c"),
961                None,
962                Some("c"),
963                Some("d"),
964                Some("a"),
965                None
966            ]
967        );
968    }
969    #[test]
970    fn test_extend_dictionary_with_null_in_mapped_value() {
971        let some_dict = {
972            let mut values_builder = GenericByteBuilder::<Utf8Type>::new();
973            let mut keys_builder = PrimitiveBuilder::<Int32Type>::new();
974
975            // Manually build a dictionary values that the mapped values have null
976            values_builder.append_null();
977            keys_builder.append_value(0);
978            values_builder.append_value("I like worm hugs");
979            keys_builder.append_value(1);
980
981            let values = values_builder.finish();
982            let keys = keys_builder.finish();
983
984            let data_type = DataType::Dictionary(
985                Box::new(Int32Type::DATA_TYPE),
986                Box::new(Utf8Type::DATA_TYPE),
987            );
988
989            let builder = keys
990                .into_data()
991                .into_builder()
992                .data_type(data_type)
993                .child_data(vec![values.into_data()]);
994
995            DictionaryArray::from(unsafe { builder.build_unchecked() })
996        };
997
998        let some_dict_values = some_dict.values().as_string::<i32>();
999        assert_eq!(
1000            some_dict_values.into_iter().collect::<Vec<_>>(),
1001            &[None, Some("I like worm hugs")]
1002        );
1003
1004        let mut builder = GenericByteDictionaryBuilder::<Int32Type, Utf8Type>::new();
1005        builder
1006            .extend_dictionary(&some_dict.downcast_dict().unwrap())
1007            .unwrap();
1008        let dict = builder.finish();
1009
1010        assert_eq!(dict.values().len(), 1);
1011
1012        let values = dict
1013            .downcast_dict::<GenericByteArray<Utf8Type>>()
1014            .unwrap()
1015            .into_iter()
1016            .collect::<Vec<_>>();
1017
1018        assert_eq!(values, [None, Some("I like worm hugs")]);
1019    }
1020
1021    #[test]
1022    fn test_extend_all_null_dictionary() {
1023        let some_dict = {
1024            let mut builder = GenericByteDictionaryBuilder::<Int32Type, Utf8Type>::new();
1025            builder.append_nulls(2);
1026            builder.finish()
1027        };
1028
1029        let mut builder = GenericByteDictionaryBuilder::<Int32Type, Utf8Type>::new();
1030        builder
1031            .extend_dictionary(&some_dict.downcast_dict().unwrap())
1032            .unwrap();
1033        let dict = builder.finish();
1034
1035        assert_eq!(dict.values().len(), 0);
1036
1037        let values = dict
1038            .downcast_dict::<GenericByteArray<Utf8Type>>()
1039            .unwrap()
1040            .into_iter()
1041            .collect::<Vec<_>>();
1042
1043        assert_eq!(values, [None, None]);
1044    }
1045
1046    #[test]
1047    fn test_finish_preserve_values() {
1048        // Create the first dictionary
1049        let mut builder = GenericByteDictionaryBuilder::<Int32Type, Utf8Type>::new();
1050        builder.append("a").unwrap();
1051        builder.append("b").unwrap();
1052        builder.append("c").unwrap();
1053        let dict = builder.finish_preserve_values();
1054        assert_eq!(dict.keys().values(), &[0, 1, 2]);
1055        assert_eq!(dict.values().len(), 3);
1056        let values = dict
1057            .downcast_dict::<GenericByteArray<Utf8Type>>()
1058            .unwrap()
1059            .into_iter()
1060            .collect::<Vec<_>>();
1061        assert_eq!(values, [Some("a"), Some("b"), Some("c")]);
1062
1063        // Create a new dictionary
1064        builder.append("d").unwrap();
1065        builder.append("e").unwrap();
1066        let dict2 = builder.finish_preserve_values();
1067
1068        // Make sure the keys are assigned after the old ones and we have the
1069        // right values
1070        assert_eq!(dict2.keys().values(), &[3, 4]);
1071        let values = dict2
1072            .downcast_dict::<GenericByteArray<Utf8Type>>()
1073            .unwrap()
1074            .into_iter()
1075            .collect::<Vec<_>>();
1076        assert_eq!(values, [Some("d"), Some("e")]);
1077
1078        // Check that we have all of the expected values
1079        assert_eq!(dict2.values().len(), 5);
1080        let all_values = dict2
1081            .values()
1082            .as_any()
1083            .downcast_ref::<StringArray>()
1084            .unwrap()
1085            .into_iter()
1086            .collect::<Vec<_>>();
1087        assert_eq!(
1088            all_values,
1089            [Some("a"), Some("b"), Some("c"), Some("d"), Some("e"),]
1090        );
1091    }
1092}