Skip to main content

vortex_compressor/builtins/dict/
integer.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4//! Integer-specific dictionary encoding implementation.
5//!
6//! Vortex encoders must always produce unsigned integer codes; signed codes are only accepted
7//! for external compatibility.
8
9use vortex_array::ArrayRef;
10use vortex_array::Canonical;
11use vortex_array::IntoArray;
12use vortex_array::arrays::DictArray;
13use vortex_array::arrays::PrimitiveArray;
14use vortex_array::arrays::dict::DictArrayExt;
15use vortex_array::arrays::dict::DictArraySlotsExt;
16use vortex_array::arrays::primitive::PrimitiveArrayExt;
17use vortex_array::validity::Validity;
18use vortex_buffer::Buffer;
19use vortex_error::VortexExpect;
20use vortex_error::VortexResult;
21
22use crate::CascadingCompressor;
23use crate::builtins::IntDictScheme;
24use crate::builtins::is_integer_primitive;
25use crate::ctx::CompressorContext;
26use crate::estimate::CompressionEstimate;
27use crate::scheme::Scheme;
28use crate::scheme::SchemeExt;
29use crate::stats::ArrayAndStats;
30use crate::stats::GenerateStatsOptions;
31use crate::stats::IntegerErasedStats;
32use crate::stats::IntegerStats;
33
34impl Scheme for IntDictScheme {
35    fn scheme_name(&self) -> &'static str {
36        "vortex.int.dict"
37    }
38
39    fn matches(&self, canonical: &Canonical) -> bool {
40        is_integer_primitive(canonical)
41    }
42
43    fn stats_options(&self) -> GenerateStatsOptions {
44        GenerateStatsOptions {
45            count_distinct_values: true,
46        }
47    }
48
49    /// Children: values=0, codes=1.
50    fn num_children(&self) -> usize {
51        2
52    }
53
54    fn expected_compression_ratio(
55        &self,
56        data: &mut ArrayAndStats,
57        _ctx: CompressorContext,
58    ) -> CompressionEstimate {
59        let bit_width = data.array_as_primitive().ptype().bit_width();
60        let stats = data.integer_stats();
61
62        if stats.value_count() == 0 {
63            return CompressionEstimate::Skip;
64        }
65
66        let distinct_values_count = stats.distinct_count().vortex_expect(
67            "this must be present since `DictScheme` declared that we need distinct values",
68        );
69
70        // If > 50% of the values are distinct, skip dictionary scheme.
71        if distinct_values_count > stats.value_count() / 2 {
72            return CompressionEstimate::Skip;
73        }
74
75        // Ignore nulls encoding for the estimate. We only focus on values.
76
77        let values_size = bit_width * distinct_values_count as usize;
78
79        // TODO(connor): Should we just hardcode this instead of let the compressor choose?
80        // Assume codes are compressed RLE + BitPacking.
81        let codes_bw = u32::BITS - distinct_values_count.leading_zeros();
82
83        let n_runs = (stats.value_count() / stats.average_run_length()) as usize;
84
85        // Assume that codes will either be BitPack or RLE-BitPack.
86        let codes_size_bp = codes_bw as usize * stats.value_count() as usize;
87        let codes_size_rle_bp = usize::checked_mul(codes_bw as usize + 32, n_runs);
88
89        let codes_size = usize::min(codes_size_bp, codes_size_rle_bp.unwrap_or(usize::MAX));
90
91        let before = stats.value_count() as usize * bit_width;
92
93        CompressionEstimate::Ratio(before as f64 / (values_size + codes_size) as f64)
94    }
95
96    fn compress(
97        &self,
98        compressor: &CascadingCompressor,
99        data: &mut ArrayAndStats,
100        ctx: CompressorContext,
101    ) -> VortexResult<ArrayRef> {
102        // TODO(connor): Fight the borrow checker (needs interior mutability)!
103        let stats = data.integer_stats().clone();
104        let dict = dictionary_encode(data.array_as_primitive(), &stats)?;
105
106        // Values = child 0.
107        let compressed_values = compressor.compress_child(dict.values(), &ctx, self.id(), 0)?;
108
109        // Codes = child 1.
110        let narrowed_codes = dict
111            .codes()
112            .clone()
113            .execute::<PrimitiveArray>(&mut compressor.execution_ctx())?
114            .narrow()?
115            .into_array();
116        let compressed_codes = compressor.compress_child(&narrowed_codes, &ctx, self.id(), 1)?;
117
118        // SAFETY: compressing codes does not change their values.
119        unsafe {
120            Ok(
121                DictArray::new_unchecked(compressed_codes, compressed_values)
122                    .set_all_values_referenced(dict.has_all_values_referenced())
123                    .into_array(),
124            )
125        }
126    }
127}
128
129/// Encodes a typed integer array into a [`DictArray`] using the pre-computed distinct values.
130macro_rules! typed_encode {
131    ($source_array:ident, $stats:ident, $typed:ident, $typ:ty) => {{
132        let distinct = $typed.distinct().vortex_expect(
133            "this must be present since `DictScheme` declared that we need distinct values",
134        );
135
136        let values_validity = match $source_array.validity()? {
137            Validity::NonNullable => Validity::NonNullable,
138            _ => Validity::AllValid,
139        };
140        let codes_validity = $source_array.validity()?;
141
142        let values: Buffer<$typ> = distinct.distinct_values().keys().map(|x| x.0).collect();
143
144        let max_code = values.len();
145        let codes = if max_code <= u8::MAX as usize {
146            let buf = <DictEncoder as Encode<$typ, u8>>::encode(
147                &values,
148                $source_array.as_slice::<$typ>(),
149            );
150            PrimitiveArray::new(buf, codes_validity).into_array()
151        } else if max_code <= u16::MAX as usize {
152            let buf = <DictEncoder as Encode<$typ, u16>>::encode(
153                &values,
154                $source_array.as_slice::<$typ>(),
155            );
156            PrimitiveArray::new(buf, codes_validity).into_array()
157        } else {
158            let buf = <DictEncoder as Encode<$typ, u32>>::encode(
159                &values,
160                $source_array.as_slice::<$typ>(),
161            );
162            PrimitiveArray::new(buf, codes_validity).into_array()
163        };
164
165        let values = PrimitiveArray::new(values, values_validity).into_array();
166        // SAFETY: invariants enforced in DictEncoder.
167        Ok(unsafe { DictArray::new_unchecked(codes, values).set_all_values_referenced(true) })
168    }};
169}
170
171/// Compresses an integer array into a dictionary array according to attached stats.
172///
173/// # Errors
174///
175/// Returns an error if unable to compute validity.
176#[expect(
177    clippy::cognitive_complexity,
178    reason = "complexity from match on all integer types"
179)]
180pub fn dictionary_encode(array: PrimitiveArray, stats: &IntegerStats) -> VortexResult<DictArray> {
181    match stats.erased() {
182        IntegerErasedStats::U8(typed) => typed_encode!(array, stats, typed, u8),
183        IntegerErasedStats::U16(typed) => typed_encode!(array, stats, typed, u16),
184        IntegerErasedStats::U32(typed) => typed_encode!(array, stats, typed, u32),
185        IntegerErasedStats::U64(typed) => typed_encode!(array, stats, typed, u64),
186        IntegerErasedStats::I8(typed) => typed_encode!(array, stats, typed, i8),
187        IntegerErasedStats::I16(typed) => typed_encode!(array, stats, typed, i16),
188        IntegerErasedStats::I32(typed) => typed_encode!(array, stats, typed, i32),
189        IntegerErasedStats::I64(typed) => typed_encode!(array, stats, typed, i64),
190    }
191}
192
193/// Stateless encoder that maps values to dictionary codes via a `HashMap`.
194struct DictEncoder;
195
196/// Trait for encoding values of type `T` into codes of type `I`.
197trait Encode<T, I> {
198    /// Using the distinct value set, turn the values into a set of codes.
199    fn encode(distinct: &[T], values: &[T]) -> Buffer<I>;
200}
201
202/// Implements [`Encode`] for an integer type with all code width variants (u8, u16, u32).
203macro_rules! impl_encode {
204    ($typ:ty) => { impl_encode!($typ, u8, u16, u32); };
205    ($typ:ty, $($ityp:ty),+) => {
206        $(
207        impl Encode<$typ, $ityp> for DictEncoder {
208            #[allow(clippy::cast_possible_truncation)]
209            fn encode(distinct: &[$typ], values: &[$typ]) -> Buffer<$ityp> {
210                let mut codes =
211                    vortex_utils::aliases::hash_map::HashMap::<$typ, $ityp>::with_capacity(
212                        distinct.len(),
213                    );
214                for (code, &value) in distinct.iter().enumerate() {
215                    codes.insert(value, code as $ityp);
216                }
217
218                let mut output = vortex_buffer::BufferMut::with_capacity(values.len());
219                for value in values {
220                    // Any code lookups which fail are for nulls, so their value does not matter.
221                    // SAFETY: we have exactly sized output to be as large as values.
222                    unsafe { output.push_unchecked(codes.get(value).copied().unwrap_or_default()) };
223                }
224
225                output.freeze()
226            }
227        }
228        )*
229    };
230}
231
232impl_encode!(u8);
233impl_encode!(u16);
234impl_encode!(u32);
235impl_encode!(u64);
236impl_encode!(i8);
237impl_encode!(i16);
238impl_encode!(i32);
239impl_encode!(i64);
240
241#[cfg(test)]
242mod tests {
243    use vortex_array::IntoArray;
244    use vortex_array::ToCanonical;
245    use vortex_array::arrays::BoolArray;
246    use vortex_array::arrays::PrimitiveArray;
247    use vortex_array::arrays::dict::DictArraySlotsExt;
248    use vortex_array::assert_arrays_eq;
249    use vortex_array::validity::Validity;
250    use vortex_buffer::buffer;
251
252    use super::dictionary_encode;
253    use crate::stats::IntegerStats;
254
255    #[test]
256    fn test_dict_encode_integer_stats() {
257        let data = buffer![100i32, 200, 100, 0, 100];
258        let validity =
259            Validity::Array(BoolArray::from_iter([true, true, true, false, true]).into_array());
260        let array = PrimitiveArray::new(data, validity);
261
262        let stats = IntegerStats::generate_opts(
263            &array,
264            crate::stats::GenerateStatsOptions {
265                count_distinct_values: true,
266            },
267        );
268        let dict_array = dictionary_encode(array, &stats).unwrap();
269        assert_eq!(dict_array.values().len(), 2);
270        assert_eq!(dict_array.codes().len(), 5);
271
272        let expected = PrimitiveArray::new(
273            buffer![100i32, 200, 100, 100, 100],
274            Validity::Array(BoolArray::from_iter([true, true, true, false, true]).into_array()),
275        )
276        .into_array();
277        let undict = dict_array.as_array().to_primitive().into_array();
278        assert_arrays_eq!(undict, expected);
279    }
280}