polars_arrow/array/dictionary/
mod.rs1use std::hash::Hash;
2use std::hint::unreachable_unchecked;
3
4use crate::bitmap::Bitmap;
5use crate::bitmap::utils::{BitmapIter, ZipValidity};
6use crate::datatypes::{ArrowDataType, IntegerType};
7use crate::scalar::{Scalar, new_scalar};
8use crate::trusted_len::TrustedLen;
9use crate::types::NativeType;
10
11mod ffi;
12pub(super) mod fmt;
13mod iterator;
14mod mutable;
15use crate::array::specification::check_indexes_unchecked;
16mod typed_iterator;
17mod value_map;
18
19pub use iterator::*;
20pub use mutable::*;
21use polars_error::{PolarsResult, polars_bail};
22
23use super::primitive::PrimitiveArray;
24use super::specification::check_indexes;
25use super::{Array, Splitable, new_empty_array, new_null_array};
26use crate::array::dictionary::typed_iterator::{
27 DictValue, DictionaryIterTyped, DictionaryValuesIterTyped,
28};
29
30pub unsafe trait DictionaryKey: NativeType + TryInto<usize> + TryFrom<usize> + Hash {
36 const KEY_TYPE: IntegerType;
38 const MAX_USIZE_VALUE: usize;
39
40 #[inline]
45 unsafe fn as_usize(self) -> usize {
46 match self.try_into() {
47 Ok(v) => v,
48 Err(_) => unreachable_unchecked(),
49 }
50 }
51
52 #[inline]
57 unsafe fn from_usize_unchecked(x: usize) -> Self {
58 debug_assert!(Self::try_from(x).is_ok());
59 unsafe { Self::try_from(x).unwrap_unchecked() }
60 }
61
62 fn always_fits_usize() -> bool {
64 false
65 }
66}
67
68unsafe impl DictionaryKey for i8 {
69 const KEY_TYPE: IntegerType = IntegerType::Int8;
70 const MAX_USIZE_VALUE: usize = i8::MAX as usize;
71}
72unsafe impl DictionaryKey for i16 {
73 const KEY_TYPE: IntegerType = IntegerType::Int16;
74 const MAX_USIZE_VALUE: usize = i16::MAX as usize;
75}
76unsafe impl DictionaryKey for i32 {
77 const KEY_TYPE: IntegerType = IntegerType::Int32;
78 const MAX_USIZE_VALUE: usize = i32::MAX as usize;
79}
80unsafe impl DictionaryKey for i64 {
81 const KEY_TYPE: IntegerType = IntegerType::Int64;
82 const MAX_USIZE_VALUE: usize = i64::MAX as usize;
83}
84unsafe impl DictionaryKey for i128 {
85 const KEY_TYPE: IntegerType = IntegerType::Int128;
86 const MAX_USIZE_VALUE: usize = i128::MAX as usize;
87}
88unsafe impl DictionaryKey for u8 {
89 const KEY_TYPE: IntegerType = IntegerType::UInt8;
90 const MAX_USIZE_VALUE: usize = u8::MAX as usize;
91
92 fn always_fits_usize() -> bool {
93 true
94 }
95}
96unsafe impl DictionaryKey for u16 {
97 const KEY_TYPE: IntegerType = IntegerType::UInt16;
98 const MAX_USIZE_VALUE: usize = u16::MAX as usize;
99
100 fn always_fits_usize() -> bool {
101 true
102 }
103}
104unsafe impl DictionaryKey for u32 {
105 const KEY_TYPE: IntegerType = IntegerType::UInt32;
106 const MAX_USIZE_VALUE: usize = u32::MAX as usize;
107
108 fn always_fits_usize() -> bool {
109 true
110 }
111}
112unsafe impl DictionaryKey for u64 {
113 const KEY_TYPE: IntegerType = IntegerType::UInt64;
114 const MAX_USIZE_VALUE: usize = u64::MAX as usize;
115
116 #[cfg(target_pointer_width = "64")]
117 fn always_fits_usize() -> bool {
118 true
119 }
120}
121unsafe impl DictionaryKey for u128 {
122 const KEY_TYPE: IntegerType = IntegerType::UInt128;
123 const MAX_USIZE_VALUE: usize = u128::MAX as usize;
124}
125
126#[derive(Clone)]
134pub struct DictionaryArray<K: DictionaryKey> {
135 dtype: ArrowDataType,
136 keys: PrimitiveArray<K>,
137 values: Box<dyn Array>,
138}
139
140fn check_dtype(
141 key_type: IntegerType,
142 dtype: &ArrowDataType,
143 values_dtype: &ArrowDataType,
144) -> PolarsResult<()> {
145 if let ArrowDataType::Dictionary(key, value, _) = dtype.to_storage() {
146 if *key != key_type {
147 polars_bail!(ComputeError: "DictionaryArray must be initialized with a DataType::Dictionary whose integer is compatible to its keys")
148 }
149 if value.as_ref().to_storage() != values_dtype.to_storage() {
150 polars_bail!(ComputeError: "DictionaryArray must be initialized with a DataType::Dictionary whose value is equal to its values")
151 }
152 } else {
153 polars_bail!(ComputeError: "DictionaryArray must be initialized with logical DataType::Dictionary")
154 }
155 Ok(())
156}
157
158impl<K: DictionaryKey> DictionaryArray<K> {
159 pub fn try_new(
169 dtype: ArrowDataType,
170 keys: PrimitiveArray<K>,
171 values: Box<dyn Array>,
172 ) -> PolarsResult<Self> {
173 check_dtype(K::KEY_TYPE, &dtype, values.dtype())?;
174
175 if keys.null_count() != keys.len() {
176 if K::always_fits_usize() {
177 unsafe { check_indexes_unchecked(keys.values(), values.len()) }?;
180 } else {
181 check_indexes(keys.values(), values.len())?;
182 }
183 }
184
185 Ok(Self {
186 dtype,
187 keys,
188 values,
189 })
190 }
191
192 pub fn try_from_keys(
199 keys: PrimitiveArray<K>,
200 values: Box<dyn Array>,
201 ordered: bool,
202 ) -> PolarsResult<Self> {
203 let dtype = Self::default_dtype(values.dtype().clone(), ordered);
204 Self::try_new(dtype, keys, values)
205 }
206
207 pub unsafe fn try_new_unchecked(
217 dtype: ArrowDataType,
218 keys: PrimitiveArray<K>,
219 values: Box<dyn Array>,
220 ) -> PolarsResult<Self> {
221 check_dtype(K::KEY_TYPE, &dtype, values.dtype())?;
222
223 Ok(Self {
224 dtype,
225 keys,
226 values,
227 })
228 }
229
230 pub fn new_empty(dtype: ArrowDataType) -> Self {
232 let values = Self::try_get_child(&dtype).unwrap();
233 let values = new_empty_array(values.clone());
234 Self::try_new(
235 dtype,
236 PrimitiveArray::<K>::new_empty(K::PRIMITIVE.into()),
237 values,
238 )
239 .unwrap()
240 }
241
242 #[inline]
244 pub fn new_null(dtype: ArrowDataType, length: usize) -> Self {
245 let values = Self::try_get_child(&dtype).unwrap();
246 let values = new_null_array(values.clone(), 1);
247 Self::try_new(
248 dtype,
249 PrimitiveArray::<K>::new_null(K::PRIMITIVE.into(), length),
250 values,
251 )
252 .unwrap()
253 }
254
255 pub fn iter(
260 &self,
261 ) -> ZipValidity<Box<dyn Scalar>, DictionaryValuesIter<'_, K>, BitmapIter<'_>> {
262 ZipValidity::new_with_validity(DictionaryValuesIter::new(self), self.keys.validity())
263 }
264
265 pub fn values_iter(&self) -> DictionaryValuesIter<'_, K> {
270 DictionaryValuesIter::new(self)
271 }
272
273 pub fn values_iter_typed<V: DictValue>(
280 &self,
281 ) -> PolarsResult<DictionaryValuesIterTyped<'_, K, V>> {
282 let keys = &self.keys;
283 assert_eq!(keys.null_count(), 0);
284 let values = self.values.as_ref();
285 let values = V::downcast_values(values)?;
286 Ok(DictionaryValuesIterTyped::new(keys, values))
287 }
288
289 pub fn iter_typed<V: DictValue>(&self) -> PolarsResult<DictionaryIterTyped<'_, K, V>> {
291 let keys = &self.keys;
292 let values = self.values.as_ref();
293 let values = V::downcast_values(values)?;
294 Ok(DictionaryIterTyped::new(keys, values))
295 }
296
297 #[inline]
299 pub fn dtype(&self) -> &ArrowDataType {
300 &self.dtype
301 }
302
303 #[inline]
305 pub fn is_ordered(&self) -> bool {
306 match self.dtype.to_storage() {
307 ArrowDataType::Dictionary(_, _, is_ordered) => *is_ordered,
308 _ => unreachable!(),
309 }
310 }
311
312 pub(crate) fn default_dtype(values_datatype: ArrowDataType, ordered: bool) -> ArrowDataType {
313 ArrowDataType::Dictionary(K::KEY_TYPE, Box::new(values_datatype), ordered)
314 }
315
316 pub fn slice(&mut self, offset: usize, length: usize) {
320 self.keys.slice(offset, length);
321 }
322
323 pub unsafe fn slice_unchecked(&mut self, offset: usize, length: usize) {
328 self.keys.slice_unchecked(offset, length);
329 }
330
331 impl_sliced!();
332
333 #[must_use]
337 pub fn with_validity(mut self, validity: Option<Bitmap>) -> Self {
338 self.set_validity(validity);
339 self
340 }
341
342 pub fn set_validity(&mut self, validity: Option<Bitmap>) {
346 self.keys.set_validity(validity);
347 }
348
349 impl_into_array!();
350
351 #[inline]
353 pub fn len(&self) -> usize {
354 self.keys.len()
355 }
356
357 #[inline]
359 pub fn validity(&self) -> Option<&Bitmap> {
360 self.keys.validity()
361 }
362
363 #[inline]
366 pub fn keys(&self) -> &PrimitiveArray<K> {
367 &self.keys
368 }
369
370 #[inline]
372 pub fn keys_values_iter(&self) -> impl TrustedLen<Item = usize> + Clone + '_ {
373 self.keys.values_iter().map(|x| unsafe { x.as_usize() })
375 }
376
377 #[inline]
379 pub fn keys_iter(&self) -> impl TrustedLen<Item = Option<usize>> + Clone + '_ {
380 self.keys.iter().map(|x| x.map(|x| unsafe { x.as_usize() }))
382 }
383
384 #[inline]
388 pub fn key_value(&self, index: usize) -> usize {
389 unsafe { self.keys.values()[index].as_usize() }
391 }
392
393 #[inline]
395 pub fn values(&self) -> &Box<dyn Array> {
396 &self.values
397 }
398
399 #[inline]
406 pub fn value(&self, index: usize) -> Box<dyn Scalar> {
407 let index = unsafe { self.keys.value(index).as_usize() };
409 new_scalar(self.values.as_ref(), index)
410 }
411
412 pub(crate) fn try_get_child(dtype: &ArrowDataType) -> PolarsResult<&ArrowDataType> {
413 Ok(match dtype.to_storage() {
414 ArrowDataType::Dictionary(_, values, _) => values.as_ref(),
415 _ => {
416 polars_bail!(ComputeError: "Dictionaries must be initialized with DataType::Dictionary")
417 },
418 })
419 }
420
421 pub fn take(self) -> (ArrowDataType, PrimitiveArray<K>, Box<dyn Array>) {
422 (self.dtype, self.keys, self.values)
423 }
424}
425
426impl<K: DictionaryKey> Array for DictionaryArray<K> {
427 impl_common_array!();
428
429 fn validity(&self) -> Option<&Bitmap> {
430 self.keys.validity()
431 }
432
433 #[inline]
434 fn with_validity(&self, validity: Option<Bitmap>) -> Box<dyn Array> {
435 Box::new(self.clone().with_validity(validity))
436 }
437}
438
439impl<K: DictionaryKey> Splitable for DictionaryArray<K> {
440 fn check_bound(&self, offset: usize) -> bool {
441 offset < self.len()
442 }
443
444 unsafe fn _split_at_unchecked(&self, offset: usize) -> (Self, Self) {
445 let (lhs_keys, rhs_keys) = unsafe { Splitable::split_at_unchecked(&self.keys, offset) };
446
447 (
448 Self {
449 dtype: self.dtype.clone(),
450 keys: lhs_keys,
451 values: self.values.clone(),
452 },
453 Self {
454 dtype: self.dtype.clone(),
455 keys: rhs_keys,
456 values: self.values.clone(),
457 },
458 )
459 }
460}