Skip to main content

vortex_compressor/builtins/dict/
float.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4//! Float-specific dictionary encoding implementation.
5//!
6//! Vortex encoders must always produce unsigned integer codes; signed codes are only accepted for
7//! external compatibility.
8
9use vortex_array::ArrayRef;
10use vortex_array::ArrayView;
11use vortex_array::Canonical;
12use vortex_array::ExecutionCtx;
13use vortex_array::IntoArray;
14use vortex_array::arrays::DictArray;
15use vortex_array::arrays::Primitive;
16use vortex_array::arrays::PrimitiveArray;
17use vortex_array::arrays::dict::DictArrayExt;
18use vortex_array::arrays::dict::DictArraySlotsExt;
19use vortex_array::arrays::primitive::PrimitiveArrayExt;
20use vortex_array::dtype::half::f16;
21use vortex_array::validity::Validity;
22use vortex_buffer::Buffer;
23use vortex_error::VortexExpect;
24use vortex_error::VortexResult;
25
26use crate::CascadingCompressor;
27use crate::builtins::FloatDictScheme;
28use crate::builtins::IntDictScheme;
29use crate::builtins::is_float_primitive;
30use crate::ctx::CompressorContext;
31use crate::estimate::CompressionEstimate;
32use crate::estimate::DeferredEstimate;
33use crate::estimate::EstimateVerdict;
34use crate::scheme::ChildSelection;
35use crate::scheme::DescendantExclusion;
36use crate::scheme::Scheme;
37use crate::scheme::SchemeExt;
38use crate::stats::ArrayAndStats;
39use crate::stats::FloatErasedStats;
40use crate::stats::FloatStats;
41use crate::stats::GenerateStatsOptions;
42
43impl Scheme for FloatDictScheme {
44    fn scheme_name(&self) -> &'static str {
45        "vortex.float.dict"
46    }
47
48    fn matches(&self, canonical: &Canonical) -> bool {
49        is_float_primitive(canonical)
50    }
51
52    fn stats_options(&self) -> GenerateStatsOptions {
53        GenerateStatsOptions {
54            count_distinct_values: true,
55        }
56    }
57
58    /// Children: values=0, codes=1.
59    fn num_children(&self) -> usize {
60        2
61    }
62
63    /// Float dict codes (child 1) are compact unsigned integers that should not be
64    /// dict-encoded again. Float dict values (child 0) flow through ALP into integer-land,
65    /// where integer dict encoding is redundant since the values are already deduplicated at
66    /// the float level.
67    ///
68    /// Additional exclusions for codes (IntSequenceScheme, IntRunEndScheme, FoRScheme,
69    /// ZigZagScheme, SparseScheme, RLE) are expressed as pull rules on those schemes in
70    /// vortex-btrblocks.
71    fn descendant_exclusions(&self) -> Vec<DescendantExclusion> {
72        vec![
73            DescendantExclusion {
74                excluded: IntDictScheme.id(),
75                children: ChildSelection::One(1),
76            },
77            DescendantExclusion {
78                excluded: IntDictScheme.id(),
79                children: ChildSelection::One(0),
80            },
81        ]
82    }
83
84    fn expected_compression_ratio(
85        &self,
86        data: &ArrayAndStats,
87        _compress_ctx: CompressorContext,
88        exec_ctx: &mut ExecutionCtx,
89    ) -> CompressionEstimate {
90        let stats = data.float_stats(exec_ctx);
91
92        if stats.value_count() == 0 {
93            return CompressionEstimate::Verdict(EstimateVerdict::Skip);
94        }
95
96        let distinct_values_count = stats.distinct_count().vortex_expect(
97            "this must be present since `DictScheme` declared that we need distinct values",
98        );
99
100        // If > 50% of the values are distinct, skip dictionary scheme.
101        if distinct_values_count > stats.value_count() / 2 {
102            return CompressionEstimate::Verdict(EstimateVerdict::Skip);
103        }
104
105        // Let sampling determine the expected ratio.
106        CompressionEstimate::Deferred(DeferredEstimate::Sample)
107    }
108
109    fn compress(
110        &self,
111        compressor: &CascadingCompressor,
112        data: &ArrayAndStats,
113        compress_ctx: CompressorContext,
114        exec_ctx: &mut ExecutionCtx,
115    ) -> VortexResult<ArrayRef> {
116        let stats = data.float_stats(exec_ctx);
117        let dict = dictionary_encode(data.array_as_primitive(), &stats)?;
118
119        let has_all_values_referenced = dict.has_all_values_referenced();
120
121        // Values = child 0.
122        let compressed_values =
123            compressor.compress_child(dict.values(), &compress_ctx, self.id(), 0, exec_ctx)?;
124
125        // Codes = child 1.
126        let narrowed_codes = dict
127            .codes()
128            .clone()
129            .execute::<PrimitiveArray>(exec_ctx)?
130            .narrow()?
131            .into_array();
132        let compressed_codes =
133            compressor.compress_child(&narrowed_codes, &compress_ctx, self.id(), 1, exec_ctx)?;
134
135        // SAFETY: compressing codes or values does not alter the invariants.
136        unsafe {
137            Ok(
138                DictArray::new_unchecked(compressed_codes, compressed_values)
139                    .set_all_values_referenced(has_all_values_referenced)
140                    .into_array(),
141            )
142        }
143    }
144}
145
146/// Encodes a typed float array into a [`DictArray`] using the pre-computed distinct values.
147macro_rules! typed_encode {
148    ($source_array:ident, $stats:ident, $typed:ident, $typ:ty) => {{
149        let distinct = $typed.distinct().vortex_expect(
150            "this must be present since `DictScheme` declared that we need distinct values",
151        );
152
153        let values_validity = match $source_array.validity()? {
154            Validity::NonNullable => Validity::NonNullable,
155            _ => Validity::AllValid,
156        };
157        let codes_validity = $source_array.validity()?;
158
159        let values: Buffer<$typ> = distinct.distinct_values().iter().map(|x| x.0).collect();
160
161        let max_code = values.len();
162        let codes = if max_code <= u8::MAX as usize {
163            let buf = <DictEncoder as Encode<$typ, u8>>::encode(
164                &values,
165                $source_array.as_slice::<$typ>(),
166            );
167            PrimitiveArray::new(buf, codes_validity).into_array()
168        } else if max_code <= u16::MAX as usize {
169            let buf = <DictEncoder as Encode<$typ, u16>>::encode(
170                &values,
171                $source_array.as_slice::<$typ>(),
172            );
173            PrimitiveArray::new(buf, codes_validity).into_array()
174        } else {
175            let buf = <DictEncoder as Encode<$typ, u32>>::encode(
176                &values,
177                $source_array.as_slice::<$typ>(),
178            );
179            PrimitiveArray::new(buf, codes_validity).into_array()
180        };
181
182        let values = PrimitiveArray::new(values, values_validity).into_array();
183        // SAFETY: enforced by the DictEncoder.
184        Ok(unsafe { DictArray::new_unchecked(codes, values).set_all_values_referenced(true) })
185    }};
186}
187
188/// Compresses a floating-point array into a dictionary array according to attached stats.
189///
190/// # Errors
191///
192/// Returns an error if unable to compute validity.
193pub fn dictionary_encode(
194    array: ArrayView<'_, Primitive>,
195    stats: &FloatStats,
196) -> VortexResult<DictArray> {
197    match stats.erased() {
198        FloatErasedStats::F16(typed) => typed_encode!(array, stats, typed, f16),
199        FloatErasedStats::F32(typed) => typed_encode!(array, stats, typed, f32),
200        FloatErasedStats::F64(typed) => typed_encode!(array, stats, typed, f64),
201    }
202}
203
204/// Stateless encoder that maps values to dictionary codes via a `HashMap`.
205struct DictEncoder;
206
207/// Trait for encoding values of type `T` into codes of type `I`.
208trait Encode<T, I> {
209    /// Using the distinct value set, turn the values into a set of codes.
210    fn encode(distinct: &[T], values: &[T]) -> Buffer<I>;
211}
212
213/// Implements [`Encode`] for a float type using its bit representation as the hash key.
214macro_rules! impl_encode {
215    ($typ:ty, $utyp:ty) => { impl_encode!($typ, $utyp, u8, u16, u32); };
216    ($typ:ty, $utyp:ty, $($ityp:ty),+) => {
217        $(
218        impl Encode<$typ, $ityp> for DictEncoder {
219            #[expect(clippy::cast_possible_truncation)]
220            fn encode(distinct: &[$typ], values: &[$typ]) -> Buffer<$ityp> {
221                let mut codes =
222                    vortex_utils::aliases::hash_map::HashMap::<$utyp, $ityp>::with_capacity(
223                        distinct.len(),
224                    );
225                for (code, &value) in distinct.iter().enumerate() {
226                    codes.insert(value.to_bits(), code as $ityp);
227                }
228
229                let mut output = vortex_buffer::BufferMut::with_capacity(values.len());
230                for value in values {
231                    // Any code lookups which fail are for nulls, so their value does not matter.
232                    output.push(codes.get(&value.to_bits()).copied().unwrap_or_default());
233                }
234
235                output.freeze()
236            }
237        }
238        )*
239    };
240}
241
242impl_encode!(f16, u16);
243impl_encode!(f32, u32);
244impl_encode!(f64, u64);
245
246#[cfg(test)]
247mod tests {
248    use vortex_array::IntoArray;
249    use vortex_array::VortexSessionExecute;
250    use vortex_array::arrays::BoolArray;
251    use vortex_array::arrays::PrimitiveArray;
252    use vortex_array::arrays::dict::DictArraySlotsExt;
253    use vortex_array::assert_arrays_eq;
254    use vortex_array::session::ArraySession;
255    use vortex_array::validity::Validity;
256    use vortex_buffer::buffer;
257    use vortex_error::VortexResult;
258    use vortex_session::VortexSession;
259
260    use super::dictionary_encode;
261    use crate::stats::FloatStats;
262    use crate::stats::GenerateStatsOptions;
263
264    #[test]
265    fn test_float_dict_encode() -> VortexResult<()> {
266        let mut ctx = VortexSession::empty()
267            .with::<ArraySession>()
268            .create_execution_ctx();
269        let values = buffer![1f32, 2f32, 2f32, 0f32, 1f32];
270        let validity =
271            Validity::Array(BoolArray::from_iter([true, true, true, false, true]).into_array());
272        let array = PrimitiveArray::new(values, validity);
273
274        let stats = FloatStats::generate_opts(
275            &array,
276            GenerateStatsOptions {
277                count_distinct_values: true,
278            },
279            &mut ctx,
280        );
281        let dict_array = dictionary_encode(array.as_view(), &stats)?;
282        assert_eq!(dict_array.values().len(), 2);
283        assert_eq!(dict_array.codes().len(), 5);
284
285        let expected = PrimitiveArray::new(
286            buffer![1f32, 2f32, 2f32, 1f32, 1f32],
287            Validity::Array(BoolArray::from_iter([true, true, true, false, true]).into_array()),
288        )
289        .into_array();
290        let undict = dict_array
291            .as_array()
292            .clone()
293            .execute::<PrimitiveArray>(&mut ctx)?
294            .into_array();
295        assert_arrays_eq!(undict, expected);
296        Ok(())
297    }
298}