Skip to main content

vortex_compressor/builtins/dict/
float.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4//! Float-specific dictionary encoding implementation.
5//!
6//! Vortex encoders must always produce unsigned integer codes; signed codes are only accepted for
7//! external compatibility.
8
9use vortex_array::ArrayRef;
10use vortex_array::Canonical;
11use vortex_array::IntoArray;
12use vortex_array::arrays::DictArray;
13use vortex_array::arrays::PrimitiveArray;
14use vortex_array::arrays::dict::DictArrayExt;
15use vortex_array::arrays::dict::DictArraySlotsExt;
16use vortex_array::arrays::primitive::PrimitiveArrayExt;
17use vortex_array::dtype::half::f16;
18use vortex_array::validity::Validity;
19use vortex_buffer::Buffer;
20use vortex_error::VortexExpect;
21use vortex_error::VortexResult;
22
23use crate::CascadingCompressor;
24use crate::builtins::FloatDictScheme;
25use crate::builtins::IntDictScheme;
26use crate::builtins::is_float_primitive;
27use crate::ctx::CompressorContext;
28use crate::estimate::CompressionEstimate;
29use crate::scheme::ChildSelection;
30use crate::scheme::DescendantExclusion;
31use crate::scheme::Scheme;
32use crate::scheme::SchemeExt;
33use crate::stats::ArrayAndStats;
34use crate::stats::FloatErasedStats;
35use crate::stats::FloatStats;
36use crate::stats::GenerateStatsOptions;
37
38impl Scheme for FloatDictScheme {
39    fn scheme_name(&self) -> &'static str {
40        "vortex.float.dict"
41    }
42
43    fn matches(&self, canonical: &Canonical) -> bool {
44        is_float_primitive(canonical)
45    }
46
47    fn stats_options(&self) -> GenerateStatsOptions {
48        GenerateStatsOptions {
49            count_distinct_values: true,
50        }
51    }
52
53    /// Children: values=0, codes=1.
54    fn num_children(&self) -> usize {
55        2
56    }
57
58    /// Float dict codes (child 1) are compact unsigned integers that should not be
59    /// dict-encoded again. Float dict values (child 0) flow through ALP into integer-land,
60    /// where integer dict encoding is redundant since the values are already deduplicated at
61    /// the float level.
62    ///
63    /// Additional exclusions for codes (IntSequenceScheme, IntRunEndScheme, FoRScheme,
64    /// ZigZagScheme, SparseScheme, RLE) are expressed as pull rules on those schemes in
65    /// vortex-btrblocks.
66    fn descendant_exclusions(&self) -> Vec<DescendantExclusion> {
67        vec![
68            DescendantExclusion {
69                excluded: IntDictScheme.id(),
70                children: ChildSelection::One(1),
71            },
72            DescendantExclusion {
73                excluded: IntDictScheme.id(),
74                children: ChildSelection::One(0),
75            },
76        ]
77    }
78
79    fn expected_compression_ratio(
80        &self,
81        data: &mut ArrayAndStats,
82        _ctx: CompressorContext,
83    ) -> CompressionEstimate {
84        let stats = data.float_stats();
85
86        if stats.value_count() == 0 {
87            return CompressionEstimate::Skip;
88        }
89
90        let distinct_values_count = stats.distinct_count().vortex_expect(
91            "this must be present since `DictScheme` declared that we need distinct values",
92        );
93
94        // If > 50% of the values are distinct, skip dictionary scheme.
95        if distinct_values_count > stats.value_count() / 2 {
96            return CompressionEstimate::Skip;
97        }
98
99        // Let sampling determine the expected ratio.
100        CompressionEstimate::Sample
101    }
102
103    fn compress(
104        &self,
105        compressor: &CascadingCompressor,
106        data: &mut ArrayAndStats,
107        ctx: CompressorContext,
108    ) -> VortexResult<ArrayRef> {
109        // TODO(connor): Fight the borrow checker (needs interior mutability)!
110        let stats = data.float_stats().clone();
111        let dict = dictionary_encode(data.array_as_primitive(), &stats)?;
112
113        let has_all_values_referenced = dict.has_all_values_referenced();
114
115        // Values = child 0.
116        let compressed_values = compressor.compress_child(dict.values(), &ctx, self.id(), 0)?;
117
118        // Codes = child 1.
119        let narrowed_codes = dict
120            .codes()
121            .clone()
122            .execute::<PrimitiveArray>(&mut compressor.execution_ctx())?
123            .narrow()?
124            .into_array();
125        let compressed_codes = compressor.compress_child(&narrowed_codes, &ctx, self.id(), 1)?;
126
127        // SAFETY: compressing codes or values does not alter the invariants.
128        unsafe {
129            Ok(
130                DictArray::new_unchecked(compressed_codes, compressed_values)
131                    .set_all_values_referenced(has_all_values_referenced)
132                    .into_array(),
133            )
134        }
135    }
136}
137
138/// Encodes a typed float array into a [`DictArray`] using the pre-computed distinct values.
139macro_rules! typed_encode {
140    ($source_array:ident, $stats:ident, $typed:ident, $typ:ty) => {{
141        let distinct = $typed.distinct().vortex_expect(
142            "this must be present since `DictScheme` declared that we need distinct values",
143        );
144
145        let values_validity = match $source_array.validity()? {
146            Validity::NonNullable => Validity::NonNullable,
147            _ => Validity::AllValid,
148        };
149        let codes_validity = $source_array.validity()?;
150
151        let values: Buffer<$typ> = distinct.distinct_values().iter().map(|x| x.0).collect();
152
153        let max_code = values.len();
154        let codes = if max_code <= u8::MAX as usize {
155            let buf = <DictEncoder as Encode<$typ, u8>>::encode(
156                &values,
157                $source_array.as_slice::<$typ>(),
158            );
159            PrimitiveArray::new(buf, codes_validity).into_array()
160        } else if max_code <= u16::MAX as usize {
161            let buf = <DictEncoder as Encode<$typ, u16>>::encode(
162                &values,
163                $source_array.as_slice::<$typ>(),
164            );
165            PrimitiveArray::new(buf, codes_validity).into_array()
166        } else {
167            let buf = <DictEncoder as Encode<$typ, u32>>::encode(
168                &values,
169                $source_array.as_slice::<$typ>(),
170            );
171            PrimitiveArray::new(buf, codes_validity).into_array()
172        };
173
174        let values = PrimitiveArray::new(values, values_validity).into_array();
175        // SAFETY: enforced by the DictEncoder.
176        Ok(unsafe { DictArray::new_unchecked(codes, values).set_all_values_referenced(true) })
177    }};
178}
179
180/// Compresses a floating-point array into a dictionary array according to attached stats.
181///
182/// # Errors
183///
184/// Returns an error if unable to compute validity.
185pub fn dictionary_encode(array: PrimitiveArray, stats: &FloatStats) -> VortexResult<DictArray> {
186    match stats.erased() {
187        FloatErasedStats::F16(typed) => typed_encode!(array, stats, typed, f16),
188        FloatErasedStats::F32(typed) => typed_encode!(array, stats, typed, f32),
189        FloatErasedStats::F64(typed) => typed_encode!(array, stats, typed, f64),
190    }
191}
192
193/// Stateless encoder that maps values to dictionary codes via a `HashMap`.
194struct DictEncoder;
195
196/// Trait for encoding values of type `T` into codes of type `I`.
197trait Encode<T, I> {
198    /// Using the distinct value set, turn the values into a set of codes.
199    fn encode(distinct: &[T], values: &[T]) -> Buffer<I>;
200}
201
202/// Implements [`Encode`] for a float type using its bit representation as the hash key.
203macro_rules! impl_encode {
204    ($typ:ty, $utyp:ty) => { impl_encode!($typ, $utyp, u8, u16, u32); };
205    ($typ:ty, $utyp:ty, $($ityp:ty),+) => {
206        $(
207        impl Encode<$typ, $ityp> for DictEncoder {
208            #[allow(clippy::cast_possible_truncation)]
209            fn encode(distinct: &[$typ], values: &[$typ]) -> Buffer<$ityp> {
210                let mut codes =
211                    vortex_utils::aliases::hash_map::HashMap::<$utyp, $ityp>::with_capacity(
212                        distinct.len(),
213                    );
214                for (code, &value) in distinct.iter().enumerate() {
215                    codes.insert(value.to_bits(), code as $ityp);
216                }
217
218                let mut output = vortex_buffer::BufferMut::with_capacity(values.len());
219                for value in values {
220                    // Any code lookups which fail are for nulls, so their value does not matter.
221                    output.push(codes.get(&value.to_bits()).copied().unwrap_or_default());
222                }
223
224                output.freeze()
225            }
226        }
227        )*
228    };
229}
230
231impl_encode!(f16, u16);
232impl_encode!(f32, u32);
233impl_encode!(f64, u64);
234
235#[cfg(test)]
236mod tests {
237    use vortex_array::IntoArray;
238    use vortex_array::ToCanonical;
239    use vortex_array::arrays::BoolArray;
240    use vortex_array::arrays::PrimitiveArray;
241    use vortex_array::arrays::dict::DictArraySlotsExt;
242    use vortex_array::assert_arrays_eq;
243    use vortex_array::validity::Validity;
244    use vortex_buffer::buffer;
245
246    use super::dictionary_encode;
247    use crate::stats::FloatStats;
248    use crate::stats::GenerateStatsOptions;
249
250    #[test]
251    fn test_float_dict_encode() {
252        let values = buffer![1f32, 2f32, 2f32, 0f32, 1f32];
253        let validity =
254            Validity::Array(BoolArray::from_iter([true, true, true, false, true]).into_array());
255        let array = PrimitiveArray::new(values, validity);
256
257        let stats = FloatStats::generate_opts(
258            &array,
259            GenerateStatsOptions {
260                count_distinct_values: true,
261            },
262        );
263        let dict_array = dictionary_encode(array, &stats).unwrap();
264        assert_eq!(dict_array.values().len(), 2);
265        assert_eq!(dict_array.codes().len(), 5);
266
267        let expected = PrimitiveArray::new(
268            buffer![1f32, 2f32, 2f32, 1f32, 1f32],
269            Validity::Array(BoolArray::from_iter([true, true, true, false, true]).into_array()),
270        )
271        .into_array();
272        let undict = dict_array.as_array().to_primitive().into_array();
273        assert_arrays_eq!(undict, expected);
274    }
275}