re_arrow2/array/dictionary/
mod.rs

1use std::{hash::Hash, hint::unreachable_unchecked, sync::Arc};
2
3use crate::{
4    bitmap::{
5        utils::{BitmapIter, ZipValidity},
6        Bitmap,
7    },
8    datatypes::{DataType, IntegerType},
9    error::Error,
10    scalar::{new_scalar, Scalar},
11    trusted_len::TrustedLen,
12    types::NativeType,
13};
14
15#[cfg(feature = "arrow")]
16mod data;
17mod ffi;
18pub(super) mod fmt;
19mod iterator;
20mod mutable;
21use crate::array::specification::check_indexes_unchecked;
22mod typed_iterator;
23mod value_map;
24
25use crate::array::dictionary::typed_iterator::{DictValue, DictionaryValuesIterTyped};
26pub use iterator::*;
27pub use mutable::*;
28
29use super::{new_empty_array, primitive::PrimitiveArray, Array};
30use super::{new_null_array, specification::check_indexes};
31
32/// Trait denoting [`NativeType`]s that can be used as keys of a dictionary.
33/// # Safety
34///
35/// Any implementation of this trait must ensure that `always_fits_usize` only
36/// returns `true` if all values succeeds on `value::try_into::<usize>().unwrap()`.
37pub unsafe trait DictionaryKey: NativeType + TryInto<usize> + TryFrom<usize> + Hash {
38    /// The corresponding [`IntegerType`] of this key
39    const KEY_TYPE: IntegerType;
40
41    /// Represents this key as a `usize`.
42    /// # Safety
43    /// The caller _must_ have checked that the value can be casted to `usize`.
44    #[inline]
45    unsafe fn as_usize(self) -> usize {
46        match self.try_into() {
47            Ok(v) => v,
48            Err(_) => unreachable_unchecked(),
49        }
50    }
51
52    /// If the key type always can be converted to `usize`.
53    fn always_fits_usize() -> bool {
54        false
55    }
56}
57
58unsafe impl DictionaryKey for i8 {
59    const KEY_TYPE: IntegerType = IntegerType::Int8;
60}
61unsafe impl DictionaryKey for i16 {
62    const KEY_TYPE: IntegerType = IntegerType::Int16;
63}
64unsafe impl DictionaryKey for i32 {
65    const KEY_TYPE: IntegerType = IntegerType::Int32;
66}
67unsafe impl DictionaryKey for i64 {
68    const KEY_TYPE: IntegerType = IntegerType::Int64;
69}
70unsafe impl DictionaryKey for u8 {
71    const KEY_TYPE: IntegerType = IntegerType::UInt8;
72
73    fn always_fits_usize() -> bool {
74        true
75    }
76}
77unsafe impl DictionaryKey for u16 {
78    const KEY_TYPE: IntegerType = IntegerType::UInt16;
79
80    fn always_fits_usize() -> bool {
81        true
82    }
83}
84unsafe impl DictionaryKey for u32 {
85    const KEY_TYPE: IntegerType = IntegerType::UInt32;
86
87    fn always_fits_usize() -> bool {
88        true
89    }
90}
91unsafe impl DictionaryKey for u64 {
92    const KEY_TYPE: IntegerType = IntegerType::UInt64;
93
94    #[cfg(target_pointer_width = "64")]
95    fn always_fits_usize() -> bool {
96        true
97    }
98}
99
100/// An [`Array`] whose values are stored as indices. This [`Array`] is useful when the cardinality of
101/// values is low compared to the length of the [`Array`].
102///
103/// # Safety
104/// This struct guarantees that each item of [`DictionaryArray::keys`] is castable to `usize` and
105/// its value is smaller than [`DictionaryArray::values`]`.len()`. In other words, you can safely
106/// use `unchecked` calls to retrive the values
107#[derive(Clone)]
108pub struct DictionaryArray<K: DictionaryKey> {
109    data_type: DataType,
110    keys: PrimitiveArray<K>,
111    values: Box<dyn Array>,
112}
113
114fn check_data_type(
115    key_type: IntegerType,
116    data_type: &DataType,
117    values_data_type: &DataType,
118) -> Result<(), Error> {
119    if let DataType::Dictionary(key, value, _) = data_type.to_logical_type() {
120        if *key != key_type {
121            return Err(Error::oos(
122                "DictionaryArray must be initialized with a DataType::Dictionary whose integer is compatible to its keys",
123            ));
124        }
125        if value.as_ref().to_logical_type() != values_data_type.to_logical_type() {
126            return Err(Error::oos(
127                "DictionaryArray must be initialized with a DataType::Dictionary whose value is equal to its values",
128            ));
129        }
130    } else {
131        return Err(Error::oos(
132            "DictionaryArray must be initialized with logical DataType::Dictionary",
133        ));
134    }
135    Ok(())
136}
137
138impl<K: DictionaryKey> DictionaryArray<K> {
139    /// Returns a new [`DictionaryArray`].
140    /// # Implementation
141    /// This function is `O(N)` where `N` is the length of keys
142    /// # Errors
143    /// This function errors iff
144    /// * the `data_type`'s logical type is not a `DictionaryArray`
145    /// * the `data_type`'s keys is not compatible with `keys`
146    /// * the `data_type`'s values's data_type is not equal with `values.data_type()`
147    /// * any of the keys's values is not represented in `usize` or is `>= values.len()`
148    pub fn try_new(
149        data_type: DataType,
150        keys: PrimitiveArray<K>,
151        values: Box<dyn Array>,
152    ) -> Result<Self, Error> {
153        check_data_type(K::KEY_TYPE, &data_type, values.data_type())?;
154
155        if keys.null_count() != keys.len() {
156            if K::always_fits_usize() {
157                // safety: we just checked that conversion to `usize` always
158                // succeeds
159                unsafe { check_indexes_unchecked(keys.values(), values.len()) }?;
160            } else {
161                check_indexes(keys.values(), values.len())?;
162            }
163        }
164
165        Ok(Self {
166            data_type,
167            keys,
168            values,
169        })
170    }
171
172    /// Returns a new [`DictionaryArray`].
173    /// # Implementation
174    /// This function is `O(N)` where `N` is the length of keys
175    /// # Errors
176    /// This function errors iff
177    /// * any of the keys's values is not represented in `usize` or is `>= values.len()`
178    pub fn try_from_keys(keys: PrimitiveArray<K>, values: Box<dyn Array>) -> Result<Self, Error> {
179        let data_type = Self::default_data_type(values.data_type().clone());
180        Self::try_new(data_type, keys, values)
181    }
182
183    /// Returns a new [`DictionaryArray`].
184    /// # Errors
185    /// This function errors iff
186    /// * the `data_type`'s logical type is not a `DictionaryArray`
187    /// * the `data_type`'s keys is not compatible with `keys`
188    /// * the `data_type`'s values's data_type is not equal with `values.data_type()`
189    /// # Safety
190    /// The caller must ensure that every keys's values is represented in `usize` and is `< values.len()`
191    pub unsafe fn try_new_unchecked(
192        data_type: DataType,
193        keys: PrimitiveArray<K>,
194        values: Box<dyn Array>,
195    ) -> Result<Self, Error> {
196        check_data_type(K::KEY_TYPE, &data_type, values.data_type())?;
197
198        Ok(Self {
199            data_type,
200            keys,
201            values,
202        })
203    }
204
205    /// Returns a new empty [`DictionaryArray`].
206    pub fn new_empty(data_type: DataType) -> Self {
207        let values = Self::try_get_child(&data_type).unwrap();
208        let values = new_empty_array(values.clone());
209        Self::try_new(
210            data_type,
211            PrimitiveArray::<K>::new_empty(K::PRIMITIVE.into()),
212            values,
213        )
214        .unwrap()
215    }
216
217    /// Returns an [`DictionaryArray`] whose all elements are null
218    #[inline]
219    pub fn new_null(data_type: DataType, length: usize) -> Self {
220        let values = Self::try_get_child(&data_type).unwrap();
221        let values = new_null_array(values.clone(), 1);
222        Self::try_new(
223            data_type,
224            PrimitiveArray::<K>::new_null(K::PRIMITIVE.into(), length),
225            values,
226        )
227        .unwrap()
228    }
229
230    /// Returns an iterator of [`Option<Box<dyn Scalar>>`].
231    /// # Implementation
232    /// This function will allocate a new [`Scalar`] per item and is usually not performant.
233    /// Consider calling `keys_iter` and `values`, downcasting `values`, and iterating over that.
234    pub fn iter(&self) -> ZipValidity<Box<dyn Scalar>, DictionaryValuesIter<K>, BitmapIter> {
235        ZipValidity::new_with_validity(DictionaryValuesIter::new(self), self.keys.validity())
236    }
237
238    /// Returns an iterator of [`Box<dyn Scalar>`]
239    /// # Implementation
240    /// This function will allocate a new [`Scalar`] per item and is usually not performant.
241    /// Consider calling `keys_iter` and `values`, downcasting `values`, and iterating over that.
242    pub fn values_iter(&self) -> DictionaryValuesIter<K> {
243        DictionaryValuesIter::new(self)
244    }
245
246    /// Returns an iterator over the the values [`V::IterValue`].
247    ///
248    /// # Panics
249    ///
250    /// Panics if the keys of this [`DictionaryArray`] have any null types.
251    /// If they do [`DictionaryArray::iter_typed`] should be called
252    pub fn values_iter_typed<V: DictValue>(
253        &self,
254    ) -> Result<DictionaryValuesIterTyped<K, V>, Error> {
255        let keys = &self.keys;
256        assert_eq!(keys.null_count(), 0);
257        let values = self.values.as_ref();
258        let values = V::downcast_values(values)?;
259        Ok(unsafe { DictionaryValuesIterTyped::new(keys, values) })
260    }
261
262    /// Returns an iterator over the the optional values of  [`Option<V::IterValue>`].
263    ///
264    /// # Panics
265    ///
266    /// This function panics if the `values` array
267    pub fn iter_typed<V: DictValue>(
268        &self,
269    ) -> Result<ZipValidity<V::IterValue<'_>, DictionaryValuesIterTyped<K, V>, BitmapIter>, Error>
270    {
271        let keys = &self.keys;
272        let values = self.values.as_ref();
273        let values = V::downcast_values(values)?;
274        let values_iter = unsafe { DictionaryValuesIterTyped::new(keys, values) };
275        Ok(ZipValidity::new_with_validity(values_iter, self.validity()))
276    }
277
278    /// Returns the [`DataType`] of this [`DictionaryArray`]
279    #[inline]
280    pub fn data_type(&self) -> &DataType {
281        &self.data_type
282    }
283
284    /// Returns whether the values of this [`DictionaryArray`] are ordered
285    #[inline]
286    pub fn is_ordered(&self) -> bool {
287        match self.data_type.to_logical_type() {
288            DataType::Dictionary(_, _, is_ordered) => *is_ordered,
289            _ => unreachable!(),
290        }
291    }
292
293    pub(crate) fn default_data_type(values_datatype: DataType) -> DataType {
294        DataType::Dictionary(K::KEY_TYPE, Arc::new(values_datatype), false)
295    }
296
297    /// Slices this [`DictionaryArray`].
298    /// # Panics
299    /// iff `offset + length > self.len()`.
300    pub fn slice(&mut self, offset: usize, length: usize) {
301        self.keys.slice(offset, length);
302    }
303
304    /// Slices this [`DictionaryArray`].
305    /// # Safety
306    /// Safe iff `offset + length <= self.len()`.
307    pub unsafe fn slice_unchecked(&mut self, offset: usize, length: usize) {
308        self.keys.slice_unchecked(offset, length);
309    }
310
311    impl_sliced!();
312
313    /// Returns this [`DictionaryArray`] with a new validity.
314    /// # Panic
315    /// This function panics iff `validity.len() != self.len()`.
316    #[must_use]
317    pub fn with_validity(mut self, validity: Option<Bitmap>) -> Self {
318        self.set_validity(validity);
319        self
320    }
321
322    /// Sets the validity of the keys of this [`DictionaryArray`].
323    /// # Panics
324    /// This function panics iff `validity.len() != self.len()`.
325    pub fn set_validity(&mut self, validity: Option<Bitmap>) {
326        self.keys.set_validity(validity);
327    }
328
329    impl_into_array!();
330
331    /// Returns the length of this array
332    #[inline]
333    pub fn len(&self) -> usize {
334        self.keys.len()
335    }
336
337    /// The optional validity. Equivalent to `self.keys().validity()`.
338    #[inline]
339    pub fn validity(&self) -> Option<&Bitmap> {
340        self.keys.validity()
341    }
342
343    /// Returns the keys of the [`DictionaryArray`]. These keys can be used to fetch values
344    /// from `values`.
345    #[inline]
346    pub fn keys(&self) -> &PrimitiveArray<K> {
347        &self.keys
348    }
349
350    /// Returns an iterator of the keys' values of the [`DictionaryArray`] as `usize`
351    #[inline]
352    pub fn keys_values_iter(&self) -> impl TrustedLen<Item = usize> + Clone + '_ {
353        // safety - invariant of the struct
354        self.keys.values_iter().map(|x| unsafe { x.as_usize() })
355    }
356
357    /// Returns an iterator of the keys' of the [`DictionaryArray`] as `usize`
358    #[inline]
359    pub fn keys_iter(&self) -> impl TrustedLen<Item = Option<usize>> + Clone + '_ {
360        // safety - invariant of the struct
361        self.keys.iter().map(|x| x.map(|x| unsafe { x.as_usize() }))
362    }
363
364    /// Returns the keys' value of the [`DictionaryArray`] as `usize`
365    /// # Panics
366    /// This function panics iff `index >= self.len()`
367    #[inline]
368    pub fn key_value(&self, index: usize) -> usize {
369        // safety - invariant of the struct
370        unsafe { self.keys.values()[index].as_usize() }
371    }
372
373    /// Returns the values of the [`DictionaryArray`].
374    #[inline]
375    pub fn values(&self) -> &Box<dyn Array> {
376        &self.values
377    }
378
379    /// Returns the value of the [`DictionaryArray`] at position `i`.
380    /// # Implementation
381    /// This function will allocate a new [`Scalar`] and is usually not performant.
382    /// Consider calling `keys` and `values`, downcasting `values`, and iterating over that.
383    /// # Panic
384    /// This function panics iff `index >= self.len()`
385    #[inline]
386    pub fn value(&self, index: usize) -> Box<dyn Scalar> {
387        // safety - invariant of this struct
388        let index = unsafe { self.keys.value(index).as_usize() };
389        new_scalar(self.values.as_ref(), index)
390    }
391
392    pub(crate) fn try_get_child(data_type: &DataType) -> Result<&DataType, Error> {
393        Ok(match data_type.to_logical_type() {
394            DataType::Dictionary(_, values, _) => values.as_ref(),
395            _ => {
396                return Err(Error::oos(
397                    "Dictionaries must be initialized with DataType::Dictionary",
398                ))
399            }
400        })
401    }
402}
403
404impl<K: DictionaryKey> Array for DictionaryArray<K> {
405    impl_common_array!();
406
407    fn validity(&self) -> Option<&Bitmap> {
408        self.keys.validity()
409    }
410
411    #[inline]
412    fn with_validity(&self, validity: Option<Bitmap>) -> Box<dyn Array> {
413        Box::new(self.clone().with_validity(validity))
414    }
415}