typed_arrow/bridge/
dictionary.rs

1//! Dictionary-encoded column bindings and key mapping.
2
3use std::marker::PhantomData;
4
5use arrow_array::{
6    builder::{
7        BinaryDictionaryBuilder, FixedSizeBinaryDictionaryBuilder, LargeBinaryDictionaryBuilder,
8        LargeStringDictionaryBuilder, PrimitiveDictionaryBuilder, StringDictionaryBuilder,
9    },
10    types::{
11        Float32Type, Float64Type, Int8Type, Int16Type, Int32Type, Int64Type, UInt8Type, UInt16Type,
12        UInt32Type, UInt64Type,
13    },
14};
15use arrow_schema::DataType;
16
17use super::{ArrowBinding, binary::LargeBinary, strings::LargeUtf8};
18
/// Wrapper marking a column as an Arrow Dictionary with key type `K` and values of `V`.
///
/// Only the logical value `V` is stored; `K` is carried purely at the type level through
/// `PhantomData`. The fields are private on purpose: construct with `Dictionary::new`
/// and read the value back via `Dictionary::value` or `Dictionary::into_value`.
///
/// This prevents accidental reliance on representation details (e.g., raw keys) and
/// keeps the API focused on appending logical values. The builder handles interning to keys.
///
/// `Debug`, `Clone`, `PartialEq`, `Eq` and `Hash` are derived so the wrapper can be used
/// inside types that derive them. Being derives, they require the same trait of `K`
/// as well as of `V`.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
#[repr(transparent)]
pub struct Dictionary<K, V>(V, PhantomData<K>);
28
29impl<K, V> Dictionary<K, V> {
30    /// Create a new dictionary value wrapper.
31    #[inline]
32    pub fn new(value: V) -> Self {
33        Self(value, PhantomData)
34    }
35
36    /// Borrow the contained logical value.
37    #[inline]
38    pub fn value(&self) -> &V {
39        &self.0
40    }
41
42    /// Consume and return the contained logical value.
43    #[inline]
44    pub fn into_value(self) -> V {
45        self.0
46    }
47}
48
49impl<K, V> From<V> for Dictionary<K, V> {
50    #[inline]
51    fn from(value: V) -> Self {
52        Self::new(value)
53    }
54}
55
56/// Dictionary key mapping from Rust integer to Arrow key type.
57pub trait DictKey {
58    /// Arrow key type corresponding to this Rust integer key.
59    type ArrowKey;
60
61    /// The Arrow `DataType` for the key.
62    fn data_type() -> DataType;
63}
64
65macro_rules! impl_dict_key {
66    ($rust:ty, $arrow:ty, $dt:expr) => {
67        impl DictKey for $rust {
68            type ArrowKey = $arrow;
69            fn data_type() -> DataType {
70                $dt
71            }
72        }
73    };
74}
75
76impl_dict_key!(i8, Int8Type, DataType::Int8);
77impl_dict_key!(i16, Int16Type, DataType::Int16);
78impl_dict_key!(i32, Int32Type, DataType::Int32);
79impl_dict_key!(i64, Int64Type, DataType::Int64);
80impl_dict_key!(u8, UInt8Type, DataType::UInt8);
81impl_dict_key!(u16, UInt16Type, DataType::UInt16);
82impl_dict_key!(u32, UInt32Type, DataType::UInt32);
83impl_dict_key!(u64, UInt64Type, DataType::UInt64);
84
85// Utf8 values
86impl<K> ArrowBinding for Dictionary<K, String>
87where
88    K: DictKey,
89    <K as DictKey>::ArrowKey: arrow_array::types::ArrowDictionaryKeyType,
90{
91    type Builder = StringDictionaryBuilder<<K as DictKey>::ArrowKey>;
92    type Array = arrow_array::DictionaryArray<<K as DictKey>::ArrowKey>;
93    fn data_type() -> DataType {
94        DataType::Dictionary(
95            Box::new(<K as DictKey>::data_type()),
96            Box::new(DataType::Utf8),
97        )
98    }
99    fn new_builder(_capacity: usize) -> Self::Builder {
100        StringDictionaryBuilder::new()
101    }
102    fn append_value(b: &mut Self::Builder, v: &Self) {
103        let _ = b.append(v.value().as_str());
104    }
105    fn append_null(b: &mut Self::Builder) {
106        b.append_null();
107    }
108    fn finish(mut b: Self::Builder) -> Self::Array {
109        b.finish()
110    }
111}
112
113// Binary values
114impl<K> ArrowBinding for Dictionary<K, Vec<u8>>
115where
116    K: DictKey,
117    <K as DictKey>::ArrowKey: arrow_array::types::ArrowDictionaryKeyType,
118{
119    type Builder = BinaryDictionaryBuilder<<K as DictKey>::ArrowKey>;
120    type Array = arrow_array::DictionaryArray<<K as DictKey>::ArrowKey>;
121    fn data_type() -> DataType {
122        DataType::Dictionary(
123            Box::new(<K as DictKey>::data_type()),
124            Box::new(DataType::Binary),
125        )
126    }
127    fn new_builder(_capacity: usize) -> Self::Builder {
128        BinaryDictionaryBuilder::new()
129    }
130    fn append_value(b: &mut Self::Builder, v: &Self) {
131        let _ = b.append(v.value().as_slice());
132    }
133    fn append_null(b: &mut Self::Builder) {
134        b.append_null();
135    }
136    fn finish(mut b: Self::Builder) -> Self::Array {
137        b.finish()
138    }
139}
140
141// FixedSizeBinary values: [u8; N]
142impl<K, const N: usize> ArrowBinding for Dictionary<K, [u8; N]>
143where
144    K: DictKey,
145    <K as DictKey>::ArrowKey: arrow_array::types::ArrowDictionaryKeyType,
146{
147    type Builder = FixedSizeBinaryDictionaryBuilder<<K as DictKey>::ArrowKey>;
148    type Array = arrow_array::DictionaryArray<<K as DictKey>::ArrowKey>;
149    fn data_type() -> DataType {
150        DataType::Dictionary(
151            Box::new(<K as DictKey>::data_type()),
152            Box::new(DataType::FixedSizeBinary(
153                i32::try_from(N).expect("width fits i32"),
154            )),
155        )
156    }
157    fn new_builder(_capacity: usize) -> Self::Builder {
158        // Builder enforces width on appended values; pass byte width
159        FixedSizeBinaryDictionaryBuilder::new(i32::try_from(N).expect("width fits i32"))
160    }
161    fn append_value(b: &mut Self::Builder, v: &Self) {
162        let _ = b.append(*v.value());
163    }
164    fn append_null(b: &mut Self::Builder) {
165        b.append_null();
166    }
167    fn finish(mut b: Self::Builder) -> Self::Array {
168        b.finish()
169    }
170}
171
172// LargeBinary values
173impl<K> ArrowBinding for Dictionary<K, LargeBinary>
174where
175    K: DictKey,
176    <K as DictKey>::ArrowKey: arrow_array::types::ArrowDictionaryKeyType,
177{
178    type Builder = LargeBinaryDictionaryBuilder<<K as DictKey>::ArrowKey>;
179    type Array = arrow_array::DictionaryArray<<K as DictKey>::ArrowKey>;
180    fn data_type() -> DataType {
181        DataType::Dictionary(
182            Box::new(<K as DictKey>::data_type()),
183            Box::new(DataType::LargeBinary),
184        )
185    }
186    fn new_builder(_capacity: usize) -> Self::Builder {
187        LargeBinaryDictionaryBuilder::new()
188    }
189    fn append_value(b: &mut Self::Builder, v: &Self) {
190        let _ = b.append(v.value().as_slice());
191    }
192    fn append_null(b: &mut Self::Builder) {
193        b.append_null();
194    }
195    fn finish(mut b: Self::Builder) -> Self::Array {
196        b.finish()
197    }
198}
199
200// LargeUtf8 values
201impl<K> ArrowBinding for Dictionary<K, LargeUtf8>
202where
203    K: DictKey,
204    <K as DictKey>::ArrowKey: arrow_array::types::ArrowDictionaryKeyType,
205{
206    type Builder = LargeStringDictionaryBuilder<<K as DictKey>::ArrowKey>;
207    type Array = arrow_array::DictionaryArray<<K as DictKey>::ArrowKey>;
208    fn data_type() -> DataType {
209        DataType::Dictionary(
210            Box::new(<K as DictKey>::data_type()),
211            Box::new(DataType::LargeUtf8),
212        )
213    }
214    fn new_builder(_capacity: usize) -> Self::Builder {
215        LargeStringDictionaryBuilder::new()
216    }
217    fn append_value(b: &mut Self::Builder, v: &Self) {
218        let _ = b.append(v.value().as_str());
219    }
220    fn append_null(b: &mut Self::Builder) {
221        b.append_null();
222    }
223    fn finish(mut b: Self::Builder) -> Self::Array {
224        b.finish()
225    }
226}
227
228// Primitive values via macro
229macro_rules! impl_dict_primitive_value {
230    ($rust:ty, $atype:ty, $dt:expr) => {
231        impl<K> ArrowBinding for Dictionary<K, $rust>
232        where
233            K: DictKey,
234            <K as DictKey>::ArrowKey: arrow_array::types::ArrowDictionaryKeyType,
235        {
236            type Builder = PrimitiveDictionaryBuilder<<K as DictKey>::ArrowKey, $atype>;
237            type Array = arrow_array::DictionaryArray<<K as DictKey>::ArrowKey>;
238            fn data_type() -> DataType {
239                DataType::Dictionary(Box::new(<K as DictKey>::data_type()), Box::new($dt))
240            }
241            fn new_builder(_capacity: usize) -> Self::Builder {
242                PrimitiveDictionaryBuilder::<_, $atype>::new()
243            }
244            fn append_value(b: &mut Self::Builder, v: &Self) {
245                let _ = b.append(*v.value());
246            }
247            fn append_null(b: &mut Self::Builder) {
248                b.append_null();
249            }
250            fn finish(mut b: Self::Builder) -> Self::Array {
251                b.finish()
252            }
253        }
254    };
255}
256
257impl_dict_primitive_value!(i8, Int8Type, DataType::Int8);
258impl_dict_primitive_value!(i16, Int16Type, DataType::Int16);
259impl_dict_primitive_value!(i32, Int32Type, DataType::Int32);
260impl_dict_primitive_value!(i64, Int64Type, DataType::Int64);
261impl_dict_primitive_value!(u8, UInt8Type, DataType::UInt8);
262impl_dict_primitive_value!(u16, UInt16Type, DataType::UInt16);
263impl_dict_primitive_value!(u32, UInt32Type, DataType::UInt32);
264impl_dict_primitive_value!(u64, UInt64Type, DataType::UInt64);
265impl_dict_primitive_value!(f32, Float32Type, DataType::Float32);
266impl_dict_primitive_value!(f64, Float64Type, DataType::Float64);
267
// `ArrowBindingView` for dictionary columns.
// A row is read by resolving its key and delegating to the value type's own view.
#[cfg(feature = "views")]
impl<K, V> super::ArrowBindingView for Dictionary<K, V>
where
    K: DictKey + 'static,
    V: ArrowBinding + super::ArrowBindingView + 'static,
    <K as DictKey>::ArrowKey: arrow_array::types::ArrowDictionaryKeyType,
{
    type Array = arrow_array::DictionaryArray<<K as DictKey>::ArrowKey>;
    type View<'a>
        = V::View<'a>
    where
        Self: 'a;

    /// Return the view of the dictionary value that row `index` refers to.
    ///
    /// # Errors
    ///
    /// - `OutOfBounds` when `index` is not below the array length.
    /// - `UnexpectedNull` when the row is null.
    /// - `TypeMismatch` when the values array cannot be downcast to `V`'s array type.
    /// - Whatever `V::get_view` returns for the resolved dictionary entry.
    fn get_view(
        array: &Self::Array,
        index: usize,
    ) -> Result<Self::View<'_>, crate::schema::ViewAccessError> {
        use arrow_array::Array;
        use arrow_buffer::ArrowNativeType;

        use crate::schema::ViewAccessError;

        let len = array.len();
        if index >= len {
            return Err(ViewAccessError::OutOfBounds {
                index,
                len,
                field_name: None,
            });
        }
        if array.is_null(index) {
            return Err(ViewAccessError::UnexpectedNull {
                index,
                field_name: None,
            });
        }

        // The key stored at this row is the position of the value inside the dictionary.
        let dict_index = array.keys().value(index).as_usize();

        // The dictionary's values are type-erased; recover the concrete array type for `V`.
        let values = array.values();
        let Some(typed_values) = values
            .as_any()
            .downcast_ref::<<V as super::ArrowBindingView>::Array>()
        else {
            return Err(ViewAccessError::TypeMismatch {
                expected: V::data_type(),
                actual: values.data_type().clone(),
                field_name: None,
            });
        };

        // Delegate to the value type for the decoded entry.
        V::get_view(typed_values, dict_index)
    }
}
324
325// TryFrom implementations for converting views to owned Dictionary types
326// Note: Dictionary<K, V> only stores V at runtime; K is a compile-time marker
327// for the encoding strategy. The view is just V::View, so we convert from that.
328
// String (Utf8): copy the borrowed view into an owned `String`. Never fails.
#[cfg(feature = "views")]
impl<K> TryFrom<&str> for Dictionary<K, String>
where
    K: DictKey,
{
    type Error = crate::schema::ViewAccessError;

    fn try_from(view: &str) -> Result<Self, Self::Error> {
        let owned = String::from(view);
        Ok(Self::new(owned))
    }
}
341
// Binary: copy the borrowed bytes into an owned `Vec<u8>`. Never fails.
#[cfg(feature = "views")]
impl<K> TryFrom<&[u8]> for Dictionary<K, Vec<u8>>
where
    K: DictKey,
{
    type Error = crate::schema::ViewAccessError;

    fn try_from(view: &[u8]) -> Result<Self, Self::Error> {
        let owned = Vec::from(view);
        Ok(Self::new(owned))
    }
}
354
// FixedSizeBinary: the view must be exactly `N` bytes long.
#[cfg(feature = "views")]
impl<K, const N: usize> TryFrom<&[u8]> for Dictionary<K, [u8; N]>
where
    K: DictKey,
{
    type Error = crate::schema::ViewAccessError;

    /// Copy `view` into a `[u8; N]`.
    ///
    /// # Errors
    ///
    /// Returns `TypeMismatch` (expected `FixedSizeBinary(N)`, actual `Binary`) when
    /// `view` is not exactly `N` bytes long.
    fn try_from(view: &[u8]) -> Result<Self, Self::Error> {
        <[u8; N]>::try_from(view)
            .map(Self::new)
            .map_err(|_| crate::schema::ViewAccessError::TypeMismatch {
                // A plain `N as i32` cast would wrap for widths above `i32::MAX` and
                // report a nonsensical width; saturate instead. This is an error path,
                // so it must not panic.
                expected: arrow_schema::DataType::FixedSizeBinary(
                    i32::try_from(N).unwrap_or(i32::MAX),
                ),
                actual: arrow_schema::DataType::Binary,
                field_name: None,
            })
    }
}
374
// LargeBinary: copy the borrowed bytes into an owned `LargeBinary`. Never fails.
#[cfg(feature = "views")]
impl<K> TryFrom<&[u8]> for Dictionary<K, super::binary::LargeBinary>
where
    K: DictKey,
{
    type Error = crate::schema::ViewAccessError;

    fn try_from(view: &[u8]) -> Result<Self, Self::Error> {
        let bytes = Vec::from(view);
        Ok(Self::new(super::binary::LargeBinary::new(bytes)))
    }
}
389
// LargeUtf8: copy the borrowed view into an owned `LargeUtf8`. Never fails.
#[cfg(feature = "views")]
impl<K> TryFrom<&str> for Dictionary<K, super::strings::LargeUtf8>
where
    K: DictKey,
{
    type Error = crate::schema::ViewAccessError;

    fn try_from(view: &str) -> Result<Self, Self::Error> {
        let text = String::from(view);
        Ok(Self::new(super::strings::LargeUtf8::new(text)))
    }
}
404
// Note: primitive values (i8, i16, i32, i64, u8, u16, u32, u64, f32, f64) need no explicit
// `TryFrom` impl here. The generic `impl From<V> for Dictionary<K, V>` earlier in this file
// already covers them: the standard library's blanket impl turns every `Into` conversion
// into `TryFrom<V>` with `Error = Infallible`, which works with our
// `E: Into<ViewAccessError>` bounds in generic code.