Skip to main content

vortex_compressor/builtins/dict/
integer.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4//! Integer-specific dictionary encoding implementation.
5//!
6//! Vortex encoders must always produce unsigned integer codes; signed codes are only accepted
7//! for external compatibility.
8
9use vortex_array::ArrayRef;
10use vortex_array::ArrayView;
11use vortex_array::Canonical;
12use vortex_array::IntoArray;
13use vortex_array::arrays::DictArray;
14use vortex_array::arrays::Primitive;
15use vortex_array::arrays::PrimitiveArray;
16use vortex_array::arrays::dict::DictArrayExt;
17use vortex_array::arrays::dict::DictArraySlotsExt;
18use vortex_array::arrays::primitive::PrimitiveArrayExt;
19use vortex_array::validity::Validity;
20use vortex_buffer::Buffer;
21use vortex_error::VortexExpect;
22use vortex_error::VortexResult;
23
24use crate::CascadingCompressor;
25use crate::builtins::IntDictScheme;
26use crate::builtins::is_integer_primitive;
27use crate::ctx::CompressorContext;
28use crate::estimate::CompressionEstimate;
29use crate::estimate::EstimateVerdict;
30use crate::scheme::Scheme;
31use crate::scheme::SchemeExt;
32use crate::stats::ArrayAndStats;
33use crate::stats::GenerateStatsOptions;
34use crate::stats::IntegerErasedStats;
35use crate::stats::IntegerStats;
36
37impl Scheme for IntDictScheme {
38    fn scheme_name(&self) -> &'static str {
39        "vortex.int.dict"
40    }
41
42    fn matches(&self, canonical: &Canonical) -> bool {
43        is_integer_primitive(canonical)
44    }
45
46    fn stats_options(&self) -> GenerateStatsOptions {
47        GenerateStatsOptions {
48            count_distinct_values: true,
49        }
50    }
51
52    /// Children: values=0, codes=1.
53    fn num_children(&self) -> usize {
54        2
55    }
56
57    fn expected_compression_ratio(
58        &self,
59        data: &mut ArrayAndStats,
60        _ctx: CompressorContext,
61    ) -> CompressionEstimate {
62        let bit_width = data.array_as_primitive().ptype().bit_width();
63        let stats = data.integer_stats();
64
65        if stats.value_count() == 0 {
66            return CompressionEstimate::Verdict(EstimateVerdict::Skip);
67        }
68
69        let distinct_values_count = stats.distinct_count().vortex_expect(
70            "this must be present since `DictScheme` declared that we need distinct values",
71        );
72
73        // If > 50% of the values are distinct, skip dictionary scheme.
74        if distinct_values_count > stats.value_count() / 2 {
75            return CompressionEstimate::Verdict(EstimateVerdict::Skip);
76        }
77
78        // Ignore nulls encoding for the estimate. We only focus on values.
79
80        let values_size = bit_width * distinct_values_count as usize;
81
82        // TODO(connor): Should we just hardcode this instead of let the compressor choose?
83        // Assume codes are compressed RLE + BitPacking.
84        let codes_bw = u32::BITS - distinct_values_count.leading_zeros();
85
86        let n_runs = (stats.value_count() / stats.average_run_length()) as usize;
87
88        // Assume that codes will either be BitPack or RLE-BitPack.
89        let codes_size_bp = codes_bw as usize * stats.value_count() as usize;
90        let codes_size_rle_bp = usize::checked_mul(codes_bw as usize + 32, n_runs);
91
92        let codes_size = usize::min(codes_size_bp, codes_size_rle_bp.unwrap_or(usize::MAX));
93
94        let before = stats.value_count() as usize * bit_width;
95
96        CompressionEstimate::Verdict(EstimateVerdict::Ratio(
97            before as f64 / (values_size + codes_size) as f64,
98        ))
99    }
100
101    fn compress(
102        &self,
103        compressor: &CascadingCompressor,
104        data: &mut ArrayAndStats,
105        ctx: CompressorContext,
106    ) -> VortexResult<ArrayRef> {
107        // TODO(connor): Fight the borrow checker (needs interior mutability)!
108        let stats = data.integer_stats().clone();
109        let dict = dictionary_encode(data.array_as_primitive(), &stats)?;
110
111        // Values = child 0.
112        let compressed_values = compressor.compress_child(dict.values(), &ctx, self.id(), 0)?;
113
114        // Codes = child 1.
115        let narrowed_codes = dict
116            .codes()
117            .clone()
118            .execute::<PrimitiveArray>(&mut compressor.execution_ctx())?
119            .narrow()?
120            .into_array();
121        let compressed_codes = compressor.compress_child(&narrowed_codes, &ctx, self.id(), 1)?;
122
123        // SAFETY: compressing codes does not change their values.
124        unsafe {
125            Ok(
126                DictArray::new_unchecked(compressed_codes, compressed_values)
127                    .set_all_values_referenced(dict.has_all_values_referenced())
128                    .into_array(),
129            )
130        }
131    }
132}
133
134/// Encodes a typed integer array into a [`DictArray`] using the pre-computed distinct values.
135macro_rules! typed_encode {
136    ($source_array:ident, $stats:ident, $typed:ident, $typ:ty) => {{
137        let distinct = $typed.distinct().vortex_expect(
138            "this must be present since `DictScheme` declared that we need distinct values",
139        );
140
141        let values_validity = match $source_array.validity()? {
142            Validity::NonNullable => Validity::NonNullable,
143            _ => Validity::AllValid,
144        };
145        let codes_validity = $source_array.validity()?;
146
147        let values: Buffer<$typ> = distinct.distinct_values().keys().map(|x| x.0).collect();
148
149        let max_code = values.len();
150        let codes = if max_code <= u8::MAX as usize {
151            let buf = <DictEncoder as Encode<$typ, u8>>::encode(
152                &values,
153                $source_array.as_slice::<$typ>(),
154            );
155            PrimitiveArray::new(buf, codes_validity).into_array()
156        } else if max_code <= u16::MAX as usize {
157            let buf = <DictEncoder as Encode<$typ, u16>>::encode(
158                &values,
159                $source_array.as_slice::<$typ>(),
160            );
161            PrimitiveArray::new(buf, codes_validity).into_array()
162        } else {
163            let buf = <DictEncoder as Encode<$typ, u32>>::encode(
164                &values,
165                $source_array.as_slice::<$typ>(),
166            );
167            PrimitiveArray::new(buf, codes_validity).into_array()
168        };
169
170        let values = PrimitiveArray::new(values, values_validity).into_array();
171        // SAFETY: invariants enforced in DictEncoder.
172        Ok(unsafe { DictArray::new_unchecked(codes, values).set_all_values_referenced(true) })
173    }};
174}
175
176/// Compresses an integer array into a dictionary array according to attached stats.
177///
178/// # Errors
179///
180/// Returns an error if unable to compute validity.
181#[expect(
182    clippy::cognitive_complexity,
183    reason = "complexity from match on all integer types"
184)]
185pub fn dictionary_encode(
186    array: ArrayView<'_, Primitive>,
187    stats: &IntegerStats,
188) -> VortexResult<DictArray> {
189    match stats.erased() {
190        IntegerErasedStats::U8(typed) => typed_encode!(array, stats, typed, u8),
191        IntegerErasedStats::U16(typed) => typed_encode!(array, stats, typed, u16),
192        IntegerErasedStats::U32(typed) => typed_encode!(array, stats, typed, u32),
193        IntegerErasedStats::U64(typed) => typed_encode!(array, stats, typed, u64),
194        IntegerErasedStats::I8(typed) => typed_encode!(array, stats, typed, i8),
195        IntegerErasedStats::I16(typed) => typed_encode!(array, stats, typed, i16),
196        IntegerErasedStats::I32(typed) => typed_encode!(array, stats, typed, i32),
197        IntegerErasedStats::I64(typed) => typed_encode!(array, stats, typed, i64),
198    }
199}
200
201/// Stateless encoder that maps values to dictionary codes via a `HashMap`.
202struct DictEncoder;
203
204/// Trait for encoding values of type `T` into codes of type `I`.
205trait Encode<T, I> {
206    /// Using the distinct value set, turn the values into a set of codes.
207    fn encode(distinct: &[T], values: &[T]) -> Buffer<I>;
208}
209
210/// Implements [`Encode`] for an integer type with all code width variants (u8, u16, u32).
211macro_rules! impl_encode {
212    ($typ:ty) => { impl_encode!($typ, u8, u16, u32); };
213    ($typ:ty, $($ityp:ty),+) => {
214        $(
215        impl Encode<$typ, $ityp> for DictEncoder {
216            #[expect(clippy::cast_possible_truncation)]
217            fn encode(distinct: &[$typ], values: &[$typ]) -> Buffer<$ityp> {
218                let mut codes =
219                    vortex_utils::aliases::hash_map::HashMap::<$typ, $ityp>::with_capacity(
220                        distinct.len(),
221                    );
222                for (code, &value) in distinct.iter().enumerate() {
223                    codes.insert(value, code as $ityp);
224                }
225
226                let mut output = vortex_buffer::BufferMut::with_capacity(values.len());
227                for value in values {
228                    // Any code lookups which fail are for nulls, so their value does not matter.
229                    // SAFETY: we have exactly sized output to be as large as values.
230                    unsafe { output.push_unchecked(codes.get(value).copied().unwrap_or_default()) };
231                }
232
233                output.freeze()
234            }
235        }
236        )*
237    };
238}
239
240impl_encode!(u8);
241impl_encode!(u16);
242impl_encode!(u32);
243impl_encode!(u64);
244impl_encode!(i8);
245impl_encode!(i16);
246impl_encode!(i32);
247impl_encode!(i64);
248
249#[cfg(test)]
250mod tests {
251    use vortex_array::IntoArray;
252    use vortex_array::ToCanonical;
253    use vortex_array::arrays::BoolArray;
254    use vortex_array::arrays::PrimitiveArray;
255    use vortex_array::arrays::dict::DictArraySlotsExt;
256    use vortex_array::assert_arrays_eq;
257    use vortex_array::validity::Validity;
258    use vortex_buffer::buffer;
259
260    use super::dictionary_encode;
261    use crate::stats::IntegerStats;
262
263    #[test]
264    fn test_dict_encode_integer_stats() {
265        let data = buffer![100i32, 200, 100, 0, 100];
266        let validity =
267            Validity::Array(BoolArray::from_iter([true, true, true, false, true]).into_array());
268        let array = PrimitiveArray::new(data, validity);
269
270        let stats = IntegerStats::generate_opts(
271            &array,
272            crate::stats::GenerateStatsOptions {
273                count_distinct_values: true,
274            },
275        );
276        let dict_array = dictionary_encode(array.as_view(), &stats).unwrap();
277        assert_eq!(dict_array.values().len(), 2);
278        assert_eq!(dict_array.codes().len(), 5);
279
280        let expected = PrimitiveArray::new(
281            buffer![100i32, 200, 100, 100, 100],
282            Validity::Array(BoolArray::from_iter([true, true, true, false, true]).into_array()),
283        )
284        .into_array();
285        let undict = dict_array.as_array().to_primitive().into_array();
286        assert_arrays_eq!(undict, expected);
287    }
288}