vortex_btrblocks/float/
dictionary.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4//! Float-specific dictionary encoding implementation.
5
6use vortex_array::IntoArray;
7use vortex_array::arrays::{DictArray, PrimitiveArray};
8use vortex_array::validity::Validity;
9use vortex_array::vtable::ValidityHelper;
10use vortex_buffer::Buffer;
11use vortex_dtype::half::f16;
12
13use crate::float::stats::{ErasedDistinctValues, FloatStats};
14
/// Builds a [`DictArray`] for one concrete float type.
///
/// * `$stats` - stats object whose `src` array supplies the values to encode.
/// * `$typed` - typed distinct-value set; the `.0` field of each entry is a dictionary value.
/// * `$validity` - validity of the source array, cloned onto the codes array.
/// * `$typ` - the float type being encoded.
macro_rules! typed_encode {
    ($stats:ident, $typed:ident, $validity:ident, $typ:ty) => {{
        // A distinct value's position in this buffer is the code the encoder assigns to it.
        let dictionary: Buffer<$typ> = $typed.values.iter().map(|entry| entry.0).collect();
        let source = $stats.src.as_slice::<$typ>();

        // Code width follows the dictionary size: `u8` for up to `u8::MAX` entries,
        // `u16` for up to `u16::MAX` entries, and `u32` for anything larger.
        let dict_len = dictionary.len();
        let codes = if dict_len > u16::MAX as usize {
            PrimitiveArray::new(
                <DictEncoder as Encode<$typ, u32>>::encode(&dictionary, source),
                $validity.clone(),
            )
            .into_array()
        } else if dict_len > u8::MAX as usize {
            PrimitiveArray::new(
                <DictEncoder as Encode<$typ, u16>>::encode(&dictionary, source),
                $validity.clone(),
            )
            .into_array()
        } else {
            PrimitiveArray::new(
                <DictEncoder as Encode<$typ, u8>>::encode(&dictionary, source),
                $validity.clone(),
            )
            .into_array()
        };

        // Every dictionary entry is valid; the dictionary is only marked non-nullable
        // when the source array is non-nullable as well.
        let dictionary_validity = if matches!($validity, Validity::NonNullable) {
            Validity::NonNullable
        } else {
            Validity::AllValid
        };
        let dictionary = PrimitiveArray::new(dictionary, dictionary_validity).into_array();

        // SAFETY: the codes were produced by `DictEncoder` from this same dictionary.
        unsafe { DictArray::new_unchecked(codes, dictionary) }
    }};
}
44
45/// Compresses a floating-point array into a dictionary arrays according to attached stats.
46pub fn dictionary_encode(stats: &FloatStats) -> DictArray {
47    let validity = stats.src.validity();
48    match &stats.distinct_values {
49        ErasedDistinctValues::F16(typed) => typed_encode!(stats, typed, validity, f16),
50        ErasedDistinctValues::F32(typed) => typed_encode!(stats, typed, validity, f32),
51        ErasedDistinctValues::F64(typed) => typed_encode!(stats, typed, validity, f64),
52    }
53}
54
55struct DictEncoder;
56
57trait Encode<T, I> {
58    /// Using the distinct value set, turn the values into a set of codes.
59    fn encode(distinct: &[T], values: &[T]) -> Buffer<I>;
60}
61
62macro_rules! impl_encode {
63    ($typ:ty, $utyp:ty) => { impl_encode!($typ, $utyp, u8, u16, u32); };
64    ($typ:ty, $utyp:ty, $($ityp:ty),+) => {
65        $(
66        impl Encode<$typ, $ityp> for DictEncoder {
67            #[allow(clippy::cast_possible_truncation)]
68            fn encode(distinct: &[$typ], values: &[$typ]) -> Buffer<$ityp> {
69                let mut codes =
70                    vortex_utils::aliases::hash_map::HashMap::<$utyp, $ityp>::with_capacity(
71                        distinct.len(),
72                    );
73                for (code, &value) in distinct.iter().enumerate() {
74                    codes.insert(value.to_bits(), code as $ityp);
75                }
76
77                let mut output = vortex_buffer::BufferMut::with_capacity(values.len());
78                for value in values {
79                    // Any code lookups which fail are for nulls, so their value
80                    // does not matter.
81                    output.push(codes.get(&value.to_bits()).copied().unwrap_or_default());
82                }
83
84                return output.freeze();
85            }
86        }
87        )*
88    };
89}
90
91impl_encode!(f16, u16);
92impl_encode!(f32, u32);
93impl_encode!(f64, u64);
94
#[cfg(test)]
mod tests {
    use vortex_array::arrays::{BoolArray, PrimitiveArray};
    use vortex_array::validity::Validity;
    use vortex_array::{Array, IntoArray, assert_arrays_eq};
    use vortex_buffer::buffer;

    use crate::CompressorStats;
    use crate::float::dictionary::dictionary_encode;
    use crate::float::stats::FloatStats;

    /// Encodes a nullable `f32` array and checks the dictionary size, the number of codes,
    /// and that the encoded array compares equal to the expected primitive array.
    #[test]
    fn test_float_dict_encode() {
        // Same validity for the input and the expected output: only index 3 is null.
        let validity = || {
            Validity::Array(BoolArray::from_iter([true, true, true, false, true]).into_array())
        };

        let input = PrimitiveArray::new(buffer![1f32, 2f32, 2f32, 0f32, 1f32], validity());
        let stats = FloatStats::generate(&input);
        let encoded = dictionary_encode(&stats);

        assert_eq!(encoded.values().len(), 2);
        assert_eq!(encoded.codes().len(), 5);

        // The value under the null slot is irrelevant because validity masks it; the
        // encoder simply gives failed lookups code zero.
        let expected =
            PrimitiveArray::new(buffer![1f32, 2f32, 2f32, 1f32, 1f32], validity()).into_array();
        assert_arrays_eq!(encoded.as_ref(), expected.as_ref());
    }
}