Skip to main content

vortex_compressor/builtins/dict/
float.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4//! Float-specific dictionary encoding implementation.
5//!
6//! Vortex encoders must always produce unsigned integer codes; signed codes are only accepted for
7//! external compatibility.
8
9use vortex_array::ArrayRef;
10use vortex_array::ArrayView;
11use vortex_array::Canonical;
12use vortex_array::IntoArray;
13use vortex_array::arrays::DictArray;
14use vortex_array::arrays::Primitive;
15use vortex_array::arrays::PrimitiveArray;
16use vortex_array::arrays::dict::DictArrayExt;
17use vortex_array::arrays::dict::DictArraySlotsExt;
18use vortex_array::arrays::primitive::PrimitiveArrayExt;
19use vortex_array::dtype::half::f16;
20use vortex_array::validity::Validity;
21use vortex_buffer::Buffer;
22use vortex_error::VortexExpect;
23use vortex_error::VortexResult;
24
25use crate::CascadingCompressor;
26use crate::builtins::FloatDictScheme;
27use crate::builtins::IntDictScheme;
28use crate::builtins::is_float_primitive;
29use crate::ctx::CompressorContext;
30use crate::estimate::CompressionEstimate;
31use crate::estimate::DeferredEstimate;
32use crate::estimate::EstimateVerdict;
33use crate::scheme::ChildSelection;
34use crate::scheme::DescendantExclusion;
35use crate::scheme::Scheme;
36use crate::scheme::SchemeExt;
37use crate::stats::ArrayAndStats;
38use crate::stats::FloatErasedStats;
39use crate::stats::FloatStats;
40use crate::stats::GenerateStatsOptions;
41
42impl Scheme for FloatDictScheme {
43    fn scheme_name(&self) -> &'static str {
44        "vortex.float.dict"
45    }
46
47    fn matches(&self, canonical: &Canonical) -> bool {
48        is_float_primitive(canonical)
49    }
50
51    fn stats_options(&self) -> GenerateStatsOptions {
52        GenerateStatsOptions {
53            count_distinct_values: true,
54        }
55    }
56
57    /// Children: values=0, codes=1.
58    fn num_children(&self) -> usize {
59        2
60    }
61
62    /// Float dict codes (child 1) are compact unsigned integers that should not be
63    /// dict-encoded again. Float dict values (child 0) flow through ALP into integer-land,
64    /// where integer dict encoding is redundant since the values are already deduplicated at
65    /// the float level.
66    ///
67    /// Additional exclusions for codes (IntSequenceScheme, IntRunEndScheme, FoRScheme,
68    /// ZigZagScheme, SparseScheme, RLE) are expressed as pull rules on those schemes in
69    /// vortex-btrblocks.
70    fn descendant_exclusions(&self) -> Vec<DescendantExclusion> {
71        vec![
72            DescendantExclusion {
73                excluded: IntDictScheme.id(),
74                children: ChildSelection::One(1),
75            },
76            DescendantExclusion {
77                excluded: IntDictScheme.id(),
78                children: ChildSelection::One(0),
79            },
80        ]
81    }
82
83    fn expected_compression_ratio(
84        &self,
85        data: &mut ArrayAndStats,
86        _ctx: CompressorContext,
87    ) -> CompressionEstimate {
88        let stats = data.float_stats();
89
90        if stats.value_count() == 0 {
91            return CompressionEstimate::Verdict(EstimateVerdict::Skip);
92        }
93
94        let distinct_values_count = stats.distinct_count().vortex_expect(
95            "this must be present since `DictScheme` declared that we need distinct values",
96        );
97
98        // If > 50% of the values are distinct, skip dictionary scheme.
99        if distinct_values_count > stats.value_count() / 2 {
100            return CompressionEstimate::Verdict(EstimateVerdict::Skip);
101        }
102
103        // Let sampling determine the expected ratio.
104        CompressionEstimate::Deferred(DeferredEstimate::Sample)
105    }
106
107    fn compress(
108        &self,
109        compressor: &CascadingCompressor,
110        data: &mut ArrayAndStats,
111        ctx: CompressorContext,
112    ) -> VortexResult<ArrayRef> {
113        // TODO(connor): Fight the borrow checker (needs interior mutability)!
114        let stats = data.float_stats().clone();
115        let dict = dictionary_encode(data.array_as_primitive(), &stats)?;
116
117        let has_all_values_referenced = dict.has_all_values_referenced();
118
119        // Values = child 0.
120        let compressed_values = compressor.compress_child(dict.values(), &ctx, self.id(), 0)?;
121
122        // Codes = child 1.
123        let narrowed_codes = dict
124            .codes()
125            .clone()
126            .execute::<PrimitiveArray>(&mut compressor.execution_ctx())?
127            .narrow()?
128            .into_array();
129        let compressed_codes = compressor.compress_child(&narrowed_codes, &ctx, self.id(), 1)?;
130
131        // SAFETY: compressing codes or values does not alter the invariants.
132        unsafe {
133            Ok(
134                DictArray::new_unchecked(compressed_codes, compressed_values)
135                    .set_all_values_referenced(has_all_values_referenced)
136                    .into_array(),
137            )
138        }
139    }
140}
141
142/// Encodes a typed float array into a [`DictArray`] using the pre-computed distinct values.
143macro_rules! typed_encode {
144    ($source_array:ident, $stats:ident, $typed:ident, $typ:ty) => {{
145        let distinct = $typed.distinct().vortex_expect(
146            "this must be present since `DictScheme` declared that we need distinct values",
147        );
148
149        let values_validity = match $source_array.validity()? {
150            Validity::NonNullable => Validity::NonNullable,
151            _ => Validity::AllValid,
152        };
153        let codes_validity = $source_array.validity()?;
154
155        let values: Buffer<$typ> = distinct.distinct_values().iter().map(|x| x.0).collect();
156
157        let max_code = values.len();
158        let codes = if max_code <= u8::MAX as usize {
159            let buf = <DictEncoder as Encode<$typ, u8>>::encode(
160                &values,
161                $source_array.as_slice::<$typ>(),
162            );
163            PrimitiveArray::new(buf, codes_validity).into_array()
164        } else if max_code <= u16::MAX as usize {
165            let buf = <DictEncoder as Encode<$typ, u16>>::encode(
166                &values,
167                $source_array.as_slice::<$typ>(),
168            );
169            PrimitiveArray::new(buf, codes_validity).into_array()
170        } else {
171            let buf = <DictEncoder as Encode<$typ, u32>>::encode(
172                &values,
173                $source_array.as_slice::<$typ>(),
174            );
175            PrimitiveArray::new(buf, codes_validity).into_array()
176        };
177
178        let values = PrimitiveArray::new(values, values_validity).into_array();
179        // SAFETY: enforced by the DictEncoder.
180        Ok(unsafe { DictArray::new_unchecked(codes, values).set_all_values_referenced(true) })
181    }};
182}
183
184/// Compresses a floating-point array into a dictionary array according to attached stats.
185///
186/// # Errors
187///
188/// Returns an error if unable to compute validity.
189pub fn dictionary_encode(
190    array: ArrayView<'_, Primitive>,
191    stats: &FloatStats,
192) -> VortexResult<DictArray> {
193    match stats.erased() {
194        FloatErasedStats::F16(typed) => typed_encode!(array, stats, typed, f16),
195        FloatErasedStats::F32(typed) => typed_encode!(array, stats, typed, f32),
196        FloatErasedStats::F64(typed) => typed_encode!(array, stats, typed, f64),
197    }
198}
199
200/// Stateless encoder that maps values to dictionary codes via a `HashMap`.
201struct DictEncoder;
202
203/// Trait for encoding values of type `T` into codes of type `I`.
204trait Encode<T, I> {
205    /// Using the distinct value set, turn the values into a set of codes.
206    fn encode(distinct: &[T], values: &[T]) -> Buffer<I>;
207}
208
209/// Implements [`Encode`] for a float type using its bit representation as the hash key.
210macro_rules! impl_encode {
211    ($typ:ty, $utyp:ty) => { impl_encode!($typ, $utyp, u8, u16, u32); };
212    ($typ:ty, $utyp:ty, $($ityp:ty),+) => {
213        $(
214        impl Encode<$typ, $ityp> for DictEncoder {
215            #[expect(clippy::cast_possible_truncation)]
216            fn encode(distinct: &[$typ], values: &[$typ]) -> Buffer<$ityp> {
217                let mut codes =
218                    vortex_utils::aliases::hash_map::HashMap::<$utyp, $ityp>::with_capacity(
219                        distinct.len(),
220                    );
221                for (code, &value) in distinct.iter().enumerate() {
222                    codes.insert(value.to_bits(), code as $ityp);
223                }
224
225                let mut output = vortex_buffer::BufferMut::with_capacity(values.len());
226                for value in values {
227                    // Any code lookups which fail are for nulls, so their value does not matter.
228                    output.push(codes.get(&value.to_bits()).copied().unwrap_or_default());
229                }
230
231                output.freeze()
232            }
233        }
234        )*
235    };
236}
237
238impl_encode!(f16, u16);
239impl_encode!(f32, u32);
240impl_encode!(f64, u64);
241
242#[cfg(test)]
243mod tests {
244    use vortex_array::IntoArray;
245    use vortex_array::ToCanonical;
246    use vortex_array::arrays::BoolArray;
247    use vortex_array::arrays::PrimitiveArray;
248    use vortex_array::arrays::dict::DictArraySlotsExt;
249    use vortex_array::assert_arrays_eq;
250    use vortex_array::validity::Validity;
251    use vortex_buffer::buffer;
252
253    use super::dictionary_encode;
254    use crate::stats::FloatStats;
255    use crate::stats::GenerateStatsOptions;
256
257    #[test]
258    fn test_float_dict_encode() {
259        let values = buffer![1f32, 2f32, 2f32, 0f32, 1f32];
260        let validity =
261            Validity::Array(BoolArray::from_iter([true, true, true, false, true]).into_array());
262        let array = PrimitiveArray::new(values, validity);
263
264        let stats = FloatStats::generate_opts(
265            &array,
266            GenerateStatsOptions {
267                count_distinct_values: true,
268            },
269        );
270        let dict_array = dictionary_encode(array.as_view(), &stats).unwrap();
271        assert_eq!(dict_array.values().len(), 2);
272        assert_eq!(dict_array.codes().len(), 5);
273
274        let expected = PrimitiveArray::new(
275            buffer![1f32, 2f32, 2f32, 1f32, 1f32],
276            Validity::Array(BoolArray::from_iter([true, true, true, false, true]).into_array()),
277        )
278        .into_array();
279        let undict = dict_array.as_array().to_primitive().into_array();
280        assert_arrays_eq!(undict, expected);
281    }
282}