Skip to main content

typed_arrow/bridge/
dictionary.rs

1//! Dictionary-encoded column bindings and key mapping.
2
3use std::marker::PhantomData;
4
5use arrow_array::{
6    builder::{
7        BinaryDictionaryBuilder, FixedSizeBinaryDictionaryBuilder, LargeBinaryDictionaryBuilder,
8        LargeStringDictionaryBuilder, PrimitiveDictionaryBuilder, StringDictionaryBuilder,
9    },
10    types::{
11        Float32Type, Float64Type, Int8Type, Int16Type, Int32Type, Int64Type, UInt8Type, UInt16Type,
12        UInt32Type, UInt64Type,
13    },
14};
15use arrow_schema::DataType;
16
17use super::{ArrowBinding, binary::LargeBinary, strings::LargeUtf8};
18
/// Marker wrapper for a column stored as an Arrow Dictionary with key type `K` and logical
/// values of type `V`.
///
/// Only the logical value is held at runtime; `K` is carried purely at the type level via
/// `PhantomData`. The field is private on purpose: build instances with `Dictionary::new`
/// and read them back with `Dictionary::value` or `Dictionary::into_value`.
///
/// Hiding the representation keeps callers from depending on details such as raw keys and
/// keeps the API about appending logical values. The builder handles interning to keys.
#[derive(Debug, Clone, PartialEq)]
#[repr(transparent)]
pub struct Dictionary<K, V>(V, PhantomData<K>);

impl<K, V> Dictionary<K, V> {
    /// Wrap `value` as a dictionary-encoded logical value.
    #[inline]
    pub fn new(value: V) -> Self {
        Dictionary(value, PhantomData)
    }

    /// Return a shared reference to the wrapped logical value.
    #[inline]
    pub fn value(&self) -> &V {
        let Self(inner, _) = self;
        inner
    }

    /// Unwrap the dictionary marker, yielding the owned logical value.
    #[inline]
    pub fn into_value(self) -> V {
        let Self(inner, _) = self;
        inner
    }
}
49
50// Serialize/Deserialize implementation is transparent: forwards to that for V.
#[cfg(feature = "serde")]
impl<'de, K, V> serde::de::Deserialize<'de> for Dictionary<K, V>
where
    V: serde::de::Deserialize<'de>,
{
    /// Deserialize a `V` with the given deserializer and wrap it; the key type `K` plays no
    /// part in the serialized form.
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: serde::de::Deserializer<'de>,
    {
        V::deserialize(deserializer).map(Self::new)
    }
}
63
#[cfg(feature = "serde")]
impl<K, V> serde::Serialize for Dictionary<K, V>
where
    V: serde::Serialize,
{
    /// Serialize exactly as the wrapped `V` would; no wrapper layer is emitted.
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: serde::Serializer,
    {
        serde::Serialize::serialize(self.value(), serializer)
    }
}
73
74impl<K, V> From<V> for Dictionary<K, V> {
75    #[inline]
76    fn from(value: V) -> Self {
77        Self::new(value)
78    }
79}
80
81/// Dictionary key mapping from Rust integer to Arrow key type.
82pub trait DictKey {
83    /// Arrow key type corresponding to this Rust integer key.
84    type ArrowKey;
85
86    /// The Arrow `DataType` for the key.
87    fn data_type() -> DataType;
88}
89
90macro_rules! impl_dict_key {
91    ($rust:ty, $arrow:ty, $dt:expr) => {
92        impl DictKey for $rust {
93            type ArrowKey = $arrow;
94            fn data_type() -> DataType {
95                $dt
96            }
97        }
98    };
99}
100
101impl_dict_key!(i8, Int8Type, DataType::Int8);
102impl_dict_key!(i16, Int16Type, DataType::Int16);
103impl_dict_key!(i32, Int32Type, DataType::Int32);
104impl_dict_key!(i64, Int64Type, DataType::Int64);
105impl_dict_key!(u8, UInt8Type, DataType::UInt8);
106impl_dict_key!(u16, UInt16Type, DataType::UInt16);
107impl_dict_key!(u32, UInt32Type, DataType::UInt32);
108impl_dict_key!(u64, UInt64Type, DataType::UInt64);
109
110// Utf8 values
111impl<K> ArrowBinding for Dictionary<K, String>
112where
113    K: DictKey,
114    <K as DictKey>::ArrowKey: arrow_array::types::ArrowDictionaryKeyType,
115{
116    type Builder = StringDictionaryBuilder<<K as DictKey>::ArrowKey>;
117    type Array = arrow_array::DictionaryArray<<K as DictKey>::ArrowKey>;
118    fn data_type() -> DataType {
119        DataType::Dictionary(
120            Box::new(<K as DictKey>::data_type()),
121            Box::new(DataType::Utf8),
122        )
123    }
124    fn new_builder(_capacity: usize) -> Self::Builder {
125        StringDictionaryBuilder::new()
126    }
127    fn append_value(b: &mut Self::Builder, v: &Self) {
128        let _ = b.append(v.value().as_str());
129    }
130    fn append_null(b: &mut Self::Builder) {
131        b.append_null();
132    }
133    fn finish(mut b: Self::Builder) -> Self::Array {
134        b.finish()
135    }
136}
137
138// Binary values
139impl<K> ArrowBinding for Dictionary<K, Vec<u8>>
140where
141    K: DictKey,
142    <K as DictKey>::ArrowKey: arrow_array::types::ArrowDictionaryKeyType,
143{
144    type Builder = BinaryDictionaryBuilder<<K as DictKey>::ArrowKey>;
145    type Array = arrow_array::DictionaryArray<<K as DictKey>::ArrowKey>;
146    fn data_type() -> DataType {
147        DataType::Dictionary(
148            Box::new(<K as DictKey>::data_type()),
149            Box::new(DataType::Binary),
150        )
151    }
152    fn new_builder(_capacity: usize) -> Self::Builder {
153        BinaryDictionaryBuilder::new()
154    }
155    fn append_value(b: &mut Self::Builder, v: &Self) {
156        let _ = b.append(v.value().as_slice());
157    }
158    fn append_null(b: &mut Self::Builder) {
159        b.append_null();
160    }
161    fn finish(mut b: Self::Builder) -> Self::Array {
162        b.finish()
163    }
164}
165
166// FixedSizeBinary values: [u8; N]
167impl<K, const N: usize> ArrowBinding for Dictionary<K, [u8; N]>
168where
169    K: DictKey,
170    <K as DictKey>::ArrowKey: arrow_array::types::ArrowDictionaryKeyType,
171{
172    type Builder = FixedSizeBinaryDictionaryBuilder<<K as DictKey>::ArrowKey>;
173    type Array = arrow_array::DictionaryArray<<K as DictKey>::ArrowKey>;
174    fn data_type() -> DataType {
175        DataType::Dictionary(
176            Box::new(<K as DictKey>::data_type()),
177            Box::new(DataType::FixedSizeBinary(
178                i32::try_from(N).expect("width fits i32"),
179            )),
180        )
181    }
182    fn new_builder(_capacity: usize) -> Self::Builder {
183        // Builder enforces width on appended values; pass byte width
184        FixedSizeBinaryDictionaryBuilder::new(i32::try_from(N).expect("width fits i32"))
185    }
186    fn append_value(b: &mut Self::Builder, v: &Self) {
187        let _ = b.append(*v.value());
188    }
189    fn append_null(b: &mut Self::Builder) {
190        b.append_null();
191    }
192    fn finish(mut b: Self::Builder) -> Self::Array {
193        b.finish()
194    }
195}
196
197// LargeBinary values
198impl<K> ArrowBinding for Dictionary<K, LargeBinary>
199where
200    K: DictKey,
201    <K as DictKey>::ArrowKey: arrow_array::types::ArrowDictionaryKeyType,
202{
203    type Builder = LargeBinaryDictionaryBuilder<<K as DictKey>::ArrowKey>;
204    type Array = arrow_array::DictionaryArray<<K as DictKey>::ArrowKey>;
205    fn data_type() -> DataType {
206        DataType::Dictionary(
207            Box::new(<K as DictKey>::data_type()),
208            Box::new(DataType::LargeBinary),
209        )
210    }
211    fn new_builder(_capacity: usize) -> Self::Builder {
212        LargeBinaryDictionaryBuilder::new()
213    }
214    fn append_value(b: &mut Self::Builder, v: &Self) {
215        let _ = b.append(v.value().as_slice());
216    }
217    fn append_null(b: &mut Self::Builder) {
218        b.append_null();
219    }
220    fn finish(mut b: Self::Builder) -> Self::Array {
221        b.finish()
222    }
223}
224
225// LargeUtf8 values
226impl<K> ArrowBinding for Dictionary<K, LargeUtf8>
227where
228    K: DictKey,
229    <K as DictKey>::ArrowKey: arrow_array::types::ArrowDictionaryKeyType,
230{
231    type Builder = LargeStringDictionaryBuilder<<K as DictKey>::ArrowKey>;
232    type Array = arrow_array::DictionaryArray<<K as DictKey>::ArrowKey>;
233    fn data_type() -> DataType {
234        DataType::Dictionary(
235            Box::new(<K as DictKey>::data_type()),
236            Box::new(DataType::LargeUtf8),
237        )
238    }
239    fn new_builder(_capacity: usize) -> Self::Builder {
240        LargeStringDictionaryBuilder::new()
241    }
242    fn append_value(b: &mut Self::Builder, v: &Self) {
243        let _ = b.append(v.value().as_str());
244    }
245    fn append_null(b: &mut Self::Builder) {
246        b.append_null();
247    }
248    fn finish(mut b: Self::Builder) -> Self::Array {
249        b.finish()
250    }
251}
252
253// Primitive values via macro
254macro_rules! impl_dict_primitive_value {
255    ($rust:ty, $atype:ty, $dt:expr) => {
256        impl<K> ArrowBinding for Dictionary<K, $rust>
257        where
258            K: DictKey,
259            <K as DictKey>::ArrowKey: arrow_array::types::ArrowDictionaryKeyType,
260        {
261            type Builder = PrimitiveDictionaryBuilder<<K as DictKey>::ArrowKey, $atype>;
262            type Array = arrow_array::DictionaryArray<<K as DictKey>::ArrowKey>;
263            fn data_type() -> DataType {
264                DataType::Dictionary(Box::new(<K as DictKey>::data_type()), Box::new($dt))
265            }
266            fn new_builder(_capacity: usize) -> Self::Builder {
267                PrimitiveDictionaryBuilder::<_, $atype>::new()
268            }
269            fn append_value(b: &mut Self::Builder, v: &Self) {
270                let _ = b.append(*v.value());
271            }
272            fn append_null(b: &mut Self::Builder) {
273                b.append_null();
274            }
275            fn finish(mut b: Self::Builder) -> Self::Array {
276                b.finish()
277            }
278        }
279    };
280}
281
282impl_dict_primitive_value!(i8, Int8Type, DataType::Int8);
283impl_dict_primitive_value!(i16, Int16Type, DataType::Int16);
284impl_dict_primitive_value!(i32, Int32Type, DataType::Int32);
285impl_dict_primitive_value!(i64, Int64Type, DataType::Int64);
286impl_dict_primitive_value!(u8, UInt8Type, DataType::UInt8);
287impl_dict_primitive_value!(u16, UInt16Type, DataType::UInt16);
288impl_dict_primitive_value!(u32, UInt32Type, DataType::UInt32);
289impl_dict_primitive_value!(u64, UInt64Type, DataType::UInt64);
290impl_dict_primitive_value!(f32, Float32Type, DataType::Float32);
291impl_dict_primitive_value!(f64, Float64Type, DataType::Float64);
292
293// ArrowBindingView implementation for Dictionary types
294// Decodes the dictionary value at the given index
#[cfg(feature = "views")]
impl<K, V> super::ArrowBindingView for Dictionary<K, V>
where
    K: DictKey + 'static,
    V: ArrowBinding + super::ArrowBindingView + 'static,
    <K as DictKey>::ArrowKey: arrow_array::types::ArrowDictionaryKeyType,
{
    type Array = arrow_array::DictionaryArray<<K as DictKey>::ArrowKey>;
    type View<'a>
        = V::View<'a>
    where
        Self: 'a;

    /// Resolves row `index` to a view of its decoded dictionary value.
    ///
    /// # Errors
    ///
    /// - `OutOfBounds` when `index` is not less than the array length.
    /// - `UnexpectedNull` when the row is null.
    /// - `TypeMismatch` when the dictionary's values array is not `V`'s view array type.
    /// - Whatever `V::get_view` returns for the looked-up dictionary slot.
    fn get_view(
        array: &Self::Array,
        index: usize,
    ) -> Result<Self::View<'_>, crate::schema::ViewAccessError> {
        use arrow_array::Array;
        use arrow_buffer::ArrowNativeType;

        use crate::schema::ViewAccessError;

        let len = array.len();
        if index >= len {
            return Err(ViewAccessError::OutOfBounds {
                index,
                len,
                field_name: None,
            });
        }
        if array.is_null(index) {
            return Err(ViewAccessError::UnexpectedNull {
                index,
                field_name: None,
            });
        }

        // The row's key is the position of its value inside the dictionary's values array.
        let slot = array.keys().value(index).as_usize();

        // The values array is type-erased; recover the concrete array type `V` reads from.
        let dict_values = array.values();
        let Some(typed) = dict_values
            .as_any()
            .downcast_ref::<<V as super::ArrowBindingView>::Array>()
        else {
            return Err(ViewAccessError::TypeMismatch {
                expected: V::data_type(),
                actual: dict_values.data_type().clone(),
                field_name: None,
            });
        };

        // Delegate to `V` for the actual decoding of that slot.
        V::get_view(typed, slot)
    }
}
349
350// TryFrom implementations for converting views to owned Dictionary types
351// Note: Dictionary<K, V> only stores V at runtime; K is a compile-time marker
352// for the encoding strategy. The view is just V::View, so we convert from that.
353
354// String (Utf8)
#[cfg(feature = "views")]
impl<K> TryFrom<&str> for Dictionary<K, String>
where
    K: DictKey,
{
    type Error = crate::schema::ViewAccessError;

    /// Copies the borrowed string view into an owned `Dictionary`; this never fails.
    fn try_from(view: &str) -> Result<Self, Self::Error> {
        let owned = String::from(view);
        Ok(Self::new(owned))
    }
}
366
367// Binary
#[cfg(feature = "views")]
impl<K> TryFrom<&[u8]> for Dictionary<K, Vec<u8>>
where
    K: DictKey,
{
    type Error = crate::schema::ViewAccessError;

    /// Copies the borrowed byte view into an owned `Dictionary`; this never fails.
    fn try_from(view: &[u8]) -> Result<Self, Self::Error> {
        let owned = Vec::from(view);
        Ok(Self::new(owned))
    }
}
379
380// FixedSizeBinary
#[cfg(feature = "views")]
impl<K, const N: usize> TryFrom<&[u8]> for Dictionary<K, [u8; N]>
where
    K: DictKey,
{
    type Error = crate::schema::ViewAccessError;

    /// Copies a byte view of exactly `N` bytes into an owned `Dictionary`.
    ///
    /// # Errors
    ///
    /// Returns `TypeMismatch` (expected `FixedSizeBinary(N)`, actual reported as `Binary`)
    /// when the slice length is not `N`.
    fn try_from(view: &[u8]) -> Result<Self, Self::Error> {
        <[u8; N]>::try_from(view).map(Self::new).map_err(|_| {
            crate::schema::ViewAccessError::TypeMismatch {
                // Checked conversion instead of `N as i32`, which would silently wrap for
                // widths above `i32::MAX`; saturate rather than panic on this error path.
                expected: arrow_schema::DataType::FixedSizeBinary(
                    i32::try_from(N).unwrap_or(i32::MAX),
                ),
                actual: arrow_schema::DataType::Binary,
                field_name: None,
            }
        })
    }
}
399
400// LargeBinary
#[cfg(feature = "views")]
impl<K> TryFrom<&[u8]> for Dictionary<K, super::binary::LargeBinary>
where
    K: DictKey,
{
    type Error = crate::schema::ViewAccessError;

    /// Copies the borrowed byte view into an owned `LargeBinary` and wraps it; this never
    /// fails.
    fn try_from(view: &[u8]) -> Result<Self, Self::Error> {
        let bytes = view.to_vec();
        let owned = super::binary::LargeBinary::new(bytes);
        Ok(Self::new(owned))
    }
}
414
415// LargeUtf8
#[cfg(feature = "views")]
impl<K> TryFrom<&str> for Dictionary<K, super::strings::LargeUtf8>
where
    K: DictKey,
{
    type Error = crate::schema::ViewAccessError;

    /// Copies the borrowed string view into an owned `LargeUtf8` and wraps it; this never
    /// fails.
    fn try_from(view: &str) -> Result<Self, Self::Error> {
        let text = view.to_string();
        let owned = super::strings::LargeUtf8::new(text);
        Ok(Self::new(owned))
    }
}
429
// Note: Primitive types (i8, i16, i32, i64, u8, u16, u32, u64, f32, f64) are already
// covered by the generic `impl<K, V> From<V> for Dictionary<K, V>` defined earlier in this
// file. The standard library's blanket `impl<T, U: Into<T>> TryFrom<U> for T` then provides
// `TryFrom<V>` with `Error = Infallible`, which works with our `E: Into<ViewAccessError>`
// bounds in generic code.