vortex_compressor/builtins/dict/
integer.rs1use vortex_array::ArrayRef;
10use vortex_array::ArrayView;
11use vortex_array::Canonical;
12use vortex_array::ExecutionCtx;
13use vortex_array::IntoArray;
14use vortex_array::arrays::DictArray;
15use vortex_array::arrays::Primitive;
16use vortex_array::arrays::PrimitiveArray;
17use vortex_array::arrays::dict::DictArrayExt;
18use vortex_array::arrays::dict::DictArraySlotsExt;
19use vortex_array::arrays::primitive::PrimitiveArrayExt;
20use vortex_array::validity::Validity;
21use vortex_buffer::Buffer;
22use vortex_error::VortexExpect;
23use vortex_error::VortexResult;
24
25use crate::CascadingCompressor;
26use crate::ctx::CompressorContext;
27use crate::estimate::CompressionEstimate;
28use crate::estimate::EstimateVerdict;
29use crate::scheme::Scheme;
30use crate::scheme::SchemeExt;
31use crate::stats::ArrayAndStats;
32use crate::stats::GenerateStatsOptions;
33use crate::stats::IntegerErasedStats;
34use crate::stats::IntegerStats;
35
36#[derive(Debug, Copy, Clone, PartialEq, Eq)]
38pub struct IntDictScheme;
39
40impl Scheme for IntDictScheme {
41 fn scheme_name(&self) -> &'static str {
42 "vortex.int.dict"
43 }
44
45 fn matches(&self, canonical: &Canonical) -> bool {
46 canonical.dtype().is_int()
47 }
48
49 fn stats_options(&self) -> GenerateStatsOptions {
50 GenerateStatsOptions {
51 count_distinct_values: true,
52 }
53 }
54
55 fn num_children(&self) -> usize {
57 2
58 }
59
60 fn expected_compression_ratio(
61 &self,
62 data: &ArrayAndStats,
63 _compress_ctx: CompressorContext,
64 exec_ctx: &mut ExecutionCtx,
65 ) -> CompressionEstimate {
66 let bit_width = data.array_as_primitive().ptype().bit_width();
67 let stats = data.integer_stats(exec_ctx);
68
69 if stats.value_count() == 0 {
70 return CompressionEstimate::Verdict(EstimateVerdict::Skip);
71 }
72
73 let distinct_values_count = stats.distinct_count().vortex_expect(
74 "this must be present since `DictScheme` declared that we need distinct values",
75 );
76
77 if distinct_values_count > stats.value_count() / 2 {
79 return CompressionEstimate::Verdict(EstimateVerdict::Skip);
80 }
81
82 let values_size = bit_width * distinct_values_count as usize;
85
86 let codes_bw = u32::BITS - distinct_values_count.leading_zeros();
89
90 let n_runs = (stats.value_count() / stats.average_run_length()) as usize;
91
92 let codes_size_bp = codes_bw as usize * stats.value_count() as usize;
94 let codes_size_rle_bp = usize::checked_mul(codes_bw as usize + 32, n_runs);
95
96 let codes_size = usize::min(codes_size_bp, codes_size_rle_bp.unwrap_or(usize::MAX));
97
98 let before = stats.value_count() as usize * bit_width;
99
100 CompressionEstimate::Verdict(EstimateVerdict::Ratio(
101 before as f64 / (values_size + codes_size) as f64,
102 ))
103 }
104
105 fn compress(
106 &self,
107 compressor: &CascadingCompressor,
108 data: &ArrayAndStats,
109 compress_ctx: CompressorContext,
110 exec_ctx: &mut ExecutionCtx,
111 ) -> VortexResult<ArrayRef> {
112 let stats = data.integer_stats(exec_ctx);
113 let dict = dictionary_encode(data.array_as_primitive(), &stats)?;
114
115 let compressed_values =
117 compressor.compress_child(dict.values(), &compress_ctx, self.id(), 0, exec_ctx)?;
118
119 let narrowed_codes = dict
121 .codes()
122 .clone()
123 .execute::<PrimitiveArray>(exec_ctx)?
124 .narrow(exec_ctx)?
125 .into_array();
126 let compressed_codes =
127 compressor.compress_child(&narrowed_codes, &compress_ctx, self.id(), 1, exec_ctx)?;
128
129 unsafe {
131 Ok(
132 DictArray::new_unchecked(compressed_codes, compressed_values)
133 .set_all_values_referenced(dict.has_all_values_referenced())
134 .into_array(),
135 )
136 }
137 }
138}
139
140macro_rules! typed_encode {
142 ($source_array:ident, $stats:ident, $typed:ident, $typ:ty) => {{
143 let distinct = $typed.distinct().vortex_expect(
144 "this must be present since `DictScheme` declared that we need distinct values",
145 );
146
147 let values_validity = match $source_array.validity()? {
148 Validity::NonNullable => Validity::NonNullable,
149 _ => Validity::AllValid,
150 };
151 let codes_validity = $source_array.validity()?;
152
153 let values: Buffer<$typ> = distinct.distinct_values().keys().map(|x| x.0).collect();
154
155 let max_code = values.len();
156 let codes = if max_code <= u8::MAX as usize {
157 let buf = <DictEncoder as Encode<$typ, u8>>::encode(
158 &values,
159 $source_array.as_slice::<$typ>(),
160 );
161 PrimitiveArray::new(buf, codes_validity).into_array()
162 } else if max_code <= u16::MAX as usize {
163 let buf = <DictEncoder as Encode<$typ, u16>>::encode(
164 &values,
165 $source_array.as_slice::<$typ>(),
166 );
167 PrimitiveArray::new(buf, codes_validity).into_array()
168 } else {
169 let buf = <DictEncoder as Encode<$typ, u32>>::encode(
170 &values,
171 $source_array.as_slice::<$typ>(),
172 );
173 PrimitiveArray::new(buf, codes_validity).into_array()
174 };
175
176 let values = PrimitiveArray::new(values, values_validity).into_array();
177 Ok(unsafe { DictArray::new_unchecked(codes, values).set_all_values_referenced(true) })
179 }};
180}
181
182#[expect(
188 clippy::cognitive_complexity,
189 reason = "complexity from match on all integer types"
190)]
191pub fn dictionary_encode(
192 array: ArrayView<'_, Primitive>,
193 stats: &IntegerStats,
194) -> VortexResult<DictArray> {
195 match stats.erased() {
196 IntegerErasedStats::U8(typed) => typed_encode!(array, stats, typed, u8),
197 IntegerErasedStats::U16(typed) => typed_encode!(array, stats, typed, u16),
198 IntegerErasedStats::U32(typed) => typed_encode!(array, stats, typed, u32),
199 IntegerErasedStats::U64(typed) => typed_encode!(array, stats, typed, u64),
200 IntegerErasedStats::I8(typed) => typed_encode!(array, stats, typed, i8),
201 IntegerErasedStats::I16(typed) => typed_encode!(array, stats, typed, i16),
202 IntegerErasedStats::I32(typed) => typed_encode!(array, stats, typed, i32),
203 IntegerErasedStats::I64(typed) => typed_encode!(array, stats, typed, i64),
204 }
205}
206
207struct DictEncoder;
209
210trait Encode<T, I> {
212 fn encode(distinct: &[T], values: &[T]) -> Buffer<I>;
214}
215
216macro_rules! impl_encode {
218 ($typ:ty) => { impl_encode!($typ, u8, u16, u32); };
219 ($typ:ty, $($ityp:ty),+) => {
220 $(
221 impl Encode<$typ, $ityp> for DictEncoder {
222 #[expect(clippy::cast_possible_truncation)]
223 fn encode(distinct: &[$typ], values: &[$typ]) -> Buffer<$ityp> {
224 let mut codes =
225 vortex_utils::aliases::hash_map::HashMap::<$typ, $ityp>::with_capacity(
226 distinct.len(),
227 );
228 for (code, &value) in distinct.iter().enumerate() {
229 codes.insert(value, code as $ityp);
230 }
231
232 let mut output = vortex_buffer::BufferMut::with_capacity(values.len());
233 for value in values {
234 unsafe { output.push_unchecked(codes.get(value).copied().unwrap_or_default()) };
237 }
238
239 output.freeze()
240 }
241 }
242 )*
243 };
244}
245
246impl_encode!(u8);
247impl_encode!(u16);
248impl_encode!(u32);
249impl_encode!(u64);
250impl_encode!(i8);
251impl_encode!(i16);
252impl_encode!(i32);
253impl_encode!(i64);
254
255#[cfg(test)]
256mod tests {
257 use vortex_array::IntoArray;
258 use vortex_array::VortexSessionExecute;
259 use vortex_array::arrays::BoolArray;
260 use vortex_array::arrays::PrimitiveArray;
261 use vortex_array::arrays::dict::DictArraySlotsExt;
262 use vortex_array::assert_arrays_eq;
263 use vortex_array::session::ArraySession;
264 use vortex_array::validity::Validity;
265 use vortex_buffer::buffer;
266 use vortex_error::VortexResult;
267 use vortex_session::VortexSession;
268
269 use super::dictionary_encode;
270 use crate::stats::IntegerStats;
271
272 #[test]
273 fn test_dict_encode_integer_stats() -> VortexResult<()> {
274 let mut ctx = VortexSession::empty()
275 .with::<ArraySession>()
276 .create_execution_ctx();
277 let data = buffer![100i32, 200, 100, 0, 100];
278 let validity =
279 Validity::Array(BoolArray::from_iter([true, true, true, false, true]).into_array());
280 let array = PrimitiveArray::new(data, validity);
281
282 let stats = IntegerStats::generate_opts(
283 &array,
284 crate::stats::GenerateStatsOptions {
285 count_distinct_values: true,
286 },
287 &mut ctx,
288 );
289 let dict_array = dictionary_encode(array.as_view(), &stats)?;
290 assert_eq!(dict_array.values().len(), 2);
291 assert_eq!(dict_array.codes().len(), 5);
292
293 let expected = PrimitiveArray::new(
294 buffer![100i32, 200, 100, 100, 100],
295 Validity::Array(BoolArray::from_iter([true, true, true, false, true]).into_array()),
296 )
297 .into_array();
298 let undict = dict_array
299 .as_array()
300 .clone()
301 .execute::<PrimitiveArray>(&mut ctx)?
302 .into_array();
303 assert_arrays_eq!(undict, expected);
304 Ok(())
305 }
306}