vortex_btrblocks/integer/
dictionary.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4//! Dictionary compressor that reuses the unique values in the `IntegerStats`.
5
6use vortex_array::IntoArray;
7use vortex_array::arrays::PrimitiveArray;
8use vortex_array::validity::Validity;
9use vortex_array::vtable::ValidityHelper;
10use vortex_buffer::Buffer;
11use vortex_dict::DictArray;
12
13use crate::integer::IntegerStats;
14use crate::integer::stats::ErasedStats;
15
/// Builds a `DictArray` for one concrete native integer type.
///
/// * `$stats` - the `IntegerStats` whose `src` array is encoded.
/// * `$typed` - the typed statistics; the keys of its `distinct_values` become the dictionary.
/// * `$validity` - validity of the source array; it is attached to the codes unchanged.
/// * `$typ` - the native integer type of the source array.
macro_rules! typed_encode {
    ($stats:ident, $typed:ident, $validity:ident, $typ:ty) => {{
        // The dictionary is exactly the set of distinct values recorded in the stats.
        let dictionary: Buffer<$typ> = $typed.distinct_values.keys().map(|key| key.0).collect();
        let source = $stats.src.as_slice::<$typ>();

        // Choose the narrowest code width (u8, then u16, then u32) permitted by the
        // number of dictionary entries.
        let codes = match dictionary.len() {
            n if n <= u8::MAX as usize => {
                let buf = <DictEncoder as Encode<$typ, u8>>::encode(&dictionary, source);
                PrimitiveArray::new(buf, $validity.clone()).into_array()
            }
            n if n <= u16::MAX as usize => {
                let buf = <DictEncoder as Encode<$typ, u16>>::encode(&dictionary, source);
                PrimitiveArray::new(buf, $validity.clone()).into_array()
            }
            _ => {
                let buf = <DictEncoder as Encode<$typ, u32>>::encode(&dictionary, source);
                PrimitiveArray::new(buf, $validity.clone()).into_array()
            }
        };

        // The dictionary values only mirror the source's nullability: a non-nullable
        // source keeps non-nullable values, anything else gets all-valid values.
        let values_validity = if matches!($validity, Validity::NonNullable) {
            Validity::NonNullable
        } else {
            Validity::AllValid
        };

        let dictionary = PrimitiveArray::new(dictionary, values_validity).into_array();
        // SAFETY: invariants enforced in DictEncoder
        unsafe { DictArray::new_unchecked(codes, dictionary) }
    }};
}
45
46/// Compresses an integer array into a dictionary arrays according to attached stats.
47#[allow(clippy::cognitive_complexity)]
48pub fn dictionary_encode(stats: &IntegerStats) -> DictArray {
49    // We need to preserve the nullability somehow from the original
50    let src_validity = stats.src.validity();
51
52    match &stats.typed {
53        ErasedStats::U8(typed) => typed_encode!(stats, typed, src_validity, u8),
54        ErasedStats::U16(typed) => typed_encode!(stats, typed, src_validity, u16),
55        ErasedStats::U32(typed) => typed_encode!(stats, typed, src_validity, u32),
56        ErasedStats::U64(typed) => typed_encode!(stats, typed, src_validity, u64),
57        ErasedStats::I8(typed) => typed_encode!(stats, typed, src_validity, i8),
58        ErasedStats::I16(typed) => typed_encode!(stats, typed, src_validity, i16),
59        ErasedStats::I32(typed) => typed_encode!(stats, typed, src_validity, i32),
60        ErasedStats::I64(typed) => typed_encode!(stats, typed, src_validity, i64),
61    }
62}
63
/// Field-less marker type that the [`Encode`] implementations in this file are attached to.
struct DictEncoder;
65
/// Converts values of type `T` into dictionary codes of type `I`.
///
/// Implemented for [`DictEncoder`] by the `impl_encode!` macro.
trait Encode<T, I> {
    /// Using the distinct value set, turn the values into a set of codes.
    ///
    /// Returns one code per element of `values`. In the implementations generated by
    /// `impl_encode!`, the code of a value is its index in `distinct`, and a value that
    /// is not found in `distinct` receives the default code.
    fn encode(distinct: &[T], values: &[T]) -> Buffer<I>;
}
70
71macro_rules! impl_encode {
72    ($typ:ty) => { impl_encode!($typ, u8, u16, u32); };
73    ($typ:ty, $($ityp:ty),+) => {
74        $(
75        impl Encode<$typ, $ityp> for DictEncoder {
76            #[allow(clippy::cast_possible_truncation)]
77            fn encode(distinct: &[$typ], values: &[$typ]) -> Buffer<$ityp> {
78                let mut codes =
79                    vortex_utils::aliases::hash_map::HashMap::<$typ, $ityp>::with_capacity(
80                        distinct.len(),
81                    );
82                for (code, &value) in distinct.iter().enumerate() {
83                    codes.insert(value, code as $ityp);
84                }
85
86                let mut output = vortex_buffer::BufferMut::with_capacity(values.len());
87                for value in values {
88                    // Any code lookups which fail are for nulls, so their value
89                    // does not matter.
90                    // SAFETY: we have exactly sized output to be as large as values.
91                    unsafe { output.push_unchecked(codes.get(value).copied().unwrap_or_default()) };
92                }
93
94                return output.freeze();
95            }
96        }
97        )*
98    };
99}
100
101impl_encode!(u8);
102impl_encode!(u16);
103impl_encode!(u32);
104impl_encode!(u64);
105impl_encode!(i8);
106impl_encode!(i16);
107impl_encode!(i32);
108impl_encode!(i64);
109
#[cfg(test)]
mod tests {
    use vortex_array::arrays::{BoolArray, PrimitiveArray};
    use vortex_array::validity::Validity;
    use vortex_array::{Array, IntoArray, ToCanonical};
    use vortex_buffer::buffer;

    use crate::CompressorStats;
    use crate::integer::IntegerStats;
    use crate::integer::dictionary::dictionary_encode;

    /// Dictionary-encodes a nullable array and checks that every valid position
    /// round-trips through the dictionary.
    #[test]
    fn test_dict_encode_integer_stats() {
        // Create an array that has some nulls
        let data = buffer![100i32, 200, 100, 0, 100];
        let validity =
            Validity::Array(BoolArray::from_iter([true, true, true, false, true]).into_array());
        let array = PrimitiveArray::new(data, validity);

        let stats = IntegerStats::generate(&array);
        let dict_array = dictionary_encode(&stats);
        // The value stored under the null is not part of the dictionary.
        assert_eq!(dict_array.values().len(), 2);
        assert_eq!(dict_array.codes().len(), 5);

        let undict = dict_array.to_primitive();
        let decoded = undict.as_slice::<i32>();
        assert_eq!(decoded.len(), 5);

        // Only the valid positions are compared. The null at index 3 is encoded with the
        // default code (zero), so what it decodes to depends on which dictionary entry
        // got code zero, i.e. on the iteration order of `distinct_values`. That value
        // does not matter, so the test must not depend on it.
        assert_eq!(&decoded[..3], &[100i32, 200, 100]);
        assert_eq!(decoded[4], 100);
    }
}