vortex_btrblocks/integer/
dictionary.rs

1//! Dictionary compressor that reuses the unique values in the `IntegerStats`.
2
3use vortex_array::Array;
4use vortex_array::arrays::PrimitiveArray;
5use vortex_array::validity::Validity;
6use vortex_buffer::Buffer;
7use vortex_dict::DictArray;
8use vortex_error::VortexResult;
9
10use crate::integer::IntegerStats;
11use crate::integer::stats::ErasedStats;
12
/// Dictionary-encodes the source array of `$stats` for the native integer type `$typ`.
///
/// * `$stats` - the `IntegerStats` whose `src` array is encoded.
/// * `$typed` - the typed statistics; the keys of its `distinct_values` map become the
///   dictionary values.
/// * `$validity` - validity of the source array; a clone of it is attached to the codes.
/// * `$typ` - native integer type of the source values.
///
/// Evaluates to the result of `DictArray::try_new(codes, values)`.
macro_rules! typed_encode {
    ($stats:ident, $typed:ident, $validity:ident, $typ:ty) => {{
        let values: Buffer<$typ> = $typed.distinct_values.keys().map(|x| x.0).collect();
        let source = $stats.src.as_slice::<$typ>();

        // Codes are positions in `values`, so the largest code is `distinct_count - 1`.
        // Comparing the count against `MAX + 1` lets exactly 256 (or 65536) distinct values
        // still use the narrowest code type that can represent every position.
        let distinct_count = values.len();
        let codes = if distinct_count <= (u8::MAX as usize) + 1 {
            let buf = <DictEncoder as Encode<$typ, u8>>::encode(&values, source);
            PrimitiveArray::new(buf, $validity.clone()).into_array()
        } else if distinct_count <= (u16::MAX as usize) + 1 {
            let buf = <DictEncoder as Encode<$typ, u16>>::encode(&values, source);
            PrimitiveArray::new(buf, $validity.clone()).into_array()
        } else {
            let buf = <DictEncoder as Encode<$typ, u32>>::encode(&values, source);
            PrimitiveArray::new(buf, $validity.clone()).into_array()
        };

        // Every dictionary entry is valid. The only distinction kept is whether the source
        // was non-nullable, so the values carry the same nullability as the source.
        let values_validity = match $validity {
            Validity::NonNullable => Validity::NonNullable,
            _ => Validity::AllValid,
        };

        let values = PrimitiveArray::new(values, values_validity).into_array();
        DictArray::try_new(codes, values)
    }};
}
41
42#[allow(clippy::cognitive_complexity)]
43pub fn dictionary_encode(stats: &IntegerStats) -> VortexResult<DictArray> {
44    // We need to preserve the nullability somehow from the original
45    let src_validity = stats.src.validity();
46
47    match &stats.typed {
48        ErasedStats::U8(typed) => typed_encode!(stats, typed, src_validity, u8),
49        ErasedStats::U16(typed) => typed_encode!(stats, typed, src_validity, u16),
50        ErasedStats::U32(typed) => typed_encode!(stats, typed, src_validity, u32),
51        ErasedStats::U64(typed) => typed_encode!(stats, typed, src_validity, u64),
52        ErasedStats::I8(typed) => typed_encode!(stats, typed, src_validity, i8),
53        ErasedStats::I16(typed) => typed_encode!(stats, typed, src_validity, i16),
54        ErasedStats::I32(typed) => typed_encode!(stats, typed, src_validity, i32),
55        ErasedStats::I64(typed) => typed_encode!(stats, typed, src_validity, i64),
56    }
57}
58
59struct DictEncoder;
60
61trait Encode<T, I> {
62    /// Using the distinct value set, turn the values into a set of codes.
63    fn encode(distinct: &[T], values: &[T]) -> Buffer<I>;
64}
65
66macro_rules! impl_encode {
67    ($typ:ty) => { impl_encode!($typ, u8, u16, u32); };
68    ($typ:ty, $($ityp:ty),+) => {
69        $(
70        impl Encode<$typ, $ityp> for DictEncoder {
71            #[allow(clippy::cast_possible_truncation)]
72            fn encode(distinct: &[$typ], values: &[$typ]) -> Buffer<$ityp> {
73                let mut codes =
74                    vortex_array::aliases::hash_map::HashMap::<$typ, $ityp>::with_capacity(
75                        distinct.len(),
76                    );
77                for (code, &value) in distinct.iter().enumerate() {
78                    codes.insert(value, code as $ityp);
79                }
80
81                let mut output = vortex_buffer::BufferMut::with_capacity(values.len());
82                for value in values {
83                    // Any code lookups which fail are for nulls, so their value
84                    // does not matter.
85                    // SAFETY: we have exactly sized output to be as large as values.
86                    unsafe { output.push_unchecked(codes.get(value).copied().unwrap_or_default()) };
87                }
88
89                return output.freeze();
90            }
91        }
92        )*
93    };
94}
95
96impl_encode!(u8);
97impl_encode!(u16);
98impl_encode!(u32);
99impl_encode!(u64);
100impl_encode!(i8);
101impl_encode!(i16);
102impl_encode!(i32);
103impl_encode!(i64);
104
#[cfg(test)]
mod tests {
    use vortex_array::arrays::{BoolArray, PrimitiveArray};
    use vortex_array::validity::Validity;
    use vortex_array::{Array, ToCanonical};
    use vortex_buffer::buffer;

    use crate::CompressorStats;
    use crate::integer::IntegerStats;
    use crate::integer::dictionary::dictionary_encode;

    /// Encodes a nullable `i32` array and checks the dictionary size, the number of codes
    /// and the decoded values.
    #[test]
    fn test_dict_encode_integer_stats() {
        // Five elements; the fourth one is null.
        let mask = BoolArray::from_iter([true, true, true, false, true]).into_array();
        let input = PrimitiveArray::new(buffer![100i32, 200, 100, 0, 100], Validity::Array(mask));

        let stats = IntegerStats::generate(&input);
        let encoded = dictionary_encode(&stats).unwrap();

        // Two dictionary entries are expected, with one code per input element.
        assert_eq!(encoded.values().len(), 2);
        assert_eq!(encoded.codes().len(), 5);

        let decoded = encoded.to_primitive().unwrap();

        // The null slot is encoded with code zero. Which value ends up there is irrelevant,
        // because the validity still marks that position as null.
        assert_eq!(decoded.as_slice::<i32>(), &[100i32, 200, 100, 100, 100]);
    }
}