vortex_compressor/builtins/dict/
integer.rs1use vortex_array::ArrayRef;
10use vortex_array::ArrayView;
11use vortex_array::Canonical;
12use vortex_array::ExecutionCtx;
13use vortex_array::IntoArray;
14use vortex_array::arrays::DictArray;
15use vortex_array::arrays::Primitive;
16use vortex_array::arrays::PrimitiveArray;
17use vortex_array::arrays::dict::DictArrayExt;
18use vortex_array::arrays::dict::DictArraySlotsExt;
19use vortex_array::arrays::primitive::PrimitiveArrayExt;
20use vortex_array::validity::Validity;
21use vortex_buffer::Buffer;
22use vortex_error::VortexExpect;
23use vortex_error::VortexResult;
24
25use crate::CascadingCompressor;
26use crate::builtins::IntDictScheme;
27use crate::builtins::is_integer_primitive;
28use crate::ctx::CompressorContext;
29use crate::estimate::CompressionEstimate;
30use crate::estimate::EstimateVerdict;
31use crate::scheme::Scheme;
32use crate::scheme::SchemeExt;
33use crate::stats::ArrayAndStats;
34use crate::stats::GenerateStatsOptions;
35use crate::stats::IntegerErasedStats;
36use crate::stats::IntegerStats;
37
38impl Scheme for IntDictScheme {
39 fn scheme_name(&self) -> &'static str {
40 "vortex.int.dict"
41 }
42
43 fn matches(&self, canonical: &Canonical) -> bool {
44 is_integer_primitive(canonical)
45 }
46
47 fn stats_options(&self) -> GenerateStatsOptions {
48 GenerateStatsOptions {
49 count_distinct_values: true,
50 }
51 }
52
53 fn num_children(&self) -> usize {
55 2
56 }
57
58 fn expected_compression_ratio(
59 &self,
60 data: &ArrayAndStats,
61 _compress_ctx: CompressorContext,
62 exec_ctx: &mut ExecutionCtx,
63 ) -> CompressionEstimate {
64 let bit_width = data.array_as_primitive().ptype().bit_width();
65 let stats = data.integer_stats(exec_ctx);
66
67 if stats.value_count() == 0 {
68 return CompressionEstimate::Verdict(EstimateVerdict::Skip);
69 }
70
71 let distinct_values_count = stats.distinct_count().vortex_expect(
72 "this must be present since `DictScheme` declared that we need distinct values",
73 );
74
75 if distinct_values_count > stats.value_count() / 2 {
77 return CompressionEstimate::Verdict(EstimateVerdict::Skip);
78 }
79
80 let values_size = bit_width * distinct_values_count as usize;
83
84 let codes_bw = u32::BITS - distinct_values_count.leading_zeros();
87
88 let n_runs = (stats.value_count() / stats.average_run_length()) as usize;
89
90 let codes_size_bp = codes_bw as usize * stats.value_count() as usize;
92 let codes_size_rle_bp = usize::checked_mul(codes_bw as usize + 32, n_runs);
93
94 let codes_size = usize::min(codes_size_bp, codes_size_rle_bp.unwrap_or(usize::MAX));
95
96 let before = stats.value_count() as usize * bit_width;
97
98 CompressionEstimate::Verdict(EstimateVerdict::Ratio(
99 before as f64 / (values_size + codes_size) as f64,
100 ))
101 }
102
103 fn compress(
104 &self,
105 compressor: &CascadingCompressor,
106 data: &ArrayAndStats,
107 compress_ctx: CompressorContext,
108 exec_ctx: &mut ExecutionCtx,
109 ) -> VortexResult<ArrayRef> {
110 let stats = data.integer_stats(exec_ctx);
111 let dict = dictionary_encode(data.array_as_primitive(), &stats)?;
112
113 let compressed_values =
115 compressor.compress_child(dict.values(), &compress_ctx, self.id(), 0, exec_ctx)?;
116
117 let narrowed_codes = dict
119 .codes()
120 .clone()
121 .execute::<PrimitiveArray>(exec_ctx)?
122 .narrow()?
123 .into_array();
124 let compressed_codes =
125 compressor.compress_child(&narrowed_codes, &compress_ctx, self.id(), 1, exec_ctx)?;
126
127 unsafe {
129 Ok(
130 DictArray::new_unchecked(compressed_codes, compressed_values)
131 .set_all_values_referenced(dict.has_all_values_referenced())
132 .into_array(),
133 )
134 }
135 }
136}
137
138macro_rules! typed_encode {
140 ($source_array:ident, $stats:ident, $typed:ident, $typ:ty) => {{
141 let distinct = $typed.distinct().vortex_expect(
142 "this must be present since `DictScheme` declared that we need distinct values",
143 );
144
145 let values_validity = match $source_array.validity()? {
146 Validity::NonNullable => Validity::NonNullable,
147 _ => Validity::AllValid,
148 };
149 let codes_validity = $source_array.validity()?;
150
151 let values: Buffer<$typ> = distinct.distinct_values().keys().map(|x| x.0).collect();
152
153 let max_code = values.len();
154 let codes = if max_code <= u8::MAX as usize {
155 let buf = <DictEncoder as Encode<$typ, u8>>::encode(
156 &values,
157 $source_array.as_slice::<$typ>(),
158 );
159 PrimitiveArray::new(buf, codes_validity).into_array()
160 } else if max_code <= u16::MAX as usize {
161 let buf = <DictEncoder as Encode<$typ, u16>>::encode(
162 &values,
163 $source_array.as_slice::<$typ>(),
164 );
165 PrimitiveArray::new(buf, codes_validity).into_array()
166 } else {
167 let buf = <DictEncoder as Encode<$typ, u32>>::encode(
168 &values,
169 $source_array.as_slice::<$typ>(),
170 );
171 PrimitiveArray::new(buf, codes_validity).into_array()
172 };
173
174 let values = PrimitiveArray::new(values, values_validity).into_array();
175 Ok(unsafe { DictArray::new_unchecked(codes, values).set_all_values_referenced(true) })
177 }};
178}
179
180#[expect(
186 clippy::cognitive_complexity,
187 reason = "complexity from match on all integer types"
188)]
189pub fn dictionary_encode(
190 array: ArrayView<'_, Primitive>,
191 stats: &IntegerStats,
192) -> VortexResult<DictArray> {
193 match stats.erased() {
194 IntegerErasedStats::U8(typed) => typed_encode!(array, stats, typed, u8),
195 IntegerErasedStats::U16(typed) => typed_encode!(array, stats, typed, u16),
196 IntegerErasedStats::U32(typed) => typed_encode!(array, stats, typed, u32),
197 IntegerErasedStats::U64(typed) => typed_encode!(array, stats, typed, u64),
198 IntegerErasedStats::I8(typed) => typed_encode!(array, stats, typed, i8),
199 IntegerErasedStats::I16(typed) => typed_encode!(array, stats, typed, i16),
200 IntegerErasedStats::I32(typed) => typed_encode!(array, stats, typed, i32),
201 IntegerErasedStats::I64(typed) => typed_encode!(array, stats, typed, i64),
202 }
203}
204
205struct DictEncoder;
207
208trait Encode<T, I> {
210 fn encode(distinct: &[T], values: &[T]) -> Buffer<I>;
212}
213
214macro_rules! impl_encode {
216 ($typ:ty) => { impl_encode!($typ, u8, u16, u32); };
217 ($typ:ty, $($ityp:ty),+) => {
218 $(
219 impl Encode<$typ, $ityp> for DictEncoder {
220 #[expect(clippy::cast_possible_truncation)]
221 fn encode(distinct: &[$typ], values: &[$typ]) -> Buffer<$ityp> {
222 let mut codes =
223 vortex_utils::aliases::hash_map::HashMap::<$typ, $ityp>::with_capacity(
224 distinct.len(),
225 );
226 for (code, &value) in distinct.iter().enumerate() {
227 codes.insert(value, code as $ityp);
228 }
229
230 let mut output = vortex_buffer::BufferMut::with_capacity(values.len());
231 for value in values {
232 unsafe { output.push_unchecked(codes.get(value).copied().unwrap_or_default()) };
235 }
236
237 output.freeze()
238 }
239 }
240 )*
241 };
242}
243
244impl_encode!(u8);
245impl_encode!(u16);
246impl_encode!(u32);
247impl_encode!(u64);
248impl_encode!(i8);
249impl_encode!(i16);
250impl_encode!(i32);
251impl_encode!(i64);
252
253#[cfg(test)]
254mod tests {
255 use vortex_array::IntoArray;
256 use vortex_array::VortexSessionExecute;
257 use vortex_array::arrays::BoolArray;
258 use vortex_array::arrays::PrimitiveArray;
259 use vortex_array::arrays::dict::DictArraySlotsExt;
260 use vortex_array::assert_arrays_eq;
261 use vortex_array::session::ArraySession;
262 use vortex_array::validity::Validity;
263 use vortex_buffer::buffer;
264 use vortex_error::VortexResult;
265 use vortex_session::VortexSession;
266
267 use super::dictionary_encode;
268 use crate::stats::IntegerStats;
269
270 #[test]
271 fn test_dict_encode_integer_stats() -> VortexResult<()> {
272 let mut ctx = VortexSession::empty()
273 .with::<ArraySession>()
274 .create_execution_ctx();
275 let data = buffer![100i32, 200, 100, 0, 100];
276 let validity =
277 Validity::Array(BoolArray::from_iter([true, true, true, false, true]).into_array());
278 let array = PrimitiveArray::new(data, validity);
279
280 let stats = IntegerStats::generate_opts(
281 &array,
282 crate::stats::GenerateStatsOptions {
283 count_distinct_values: true,
284 },
285 &mut ctx,
286 );
287 let dict_array = dictionary_encode(array.as_view(), &stats)?;
288 assert_eq!(dict_array.values().len(), 2);
289 assert_eq!(dict_array.codes().len(), 5);
290
291 let expected = PrimitiveArray::new(
292 buffer![100i32, 200, 100, 100, 100],
293 Validity::Array(BoolArray::from_iter([true, true, true, false, true]).into_array()),
294 )
295 .into_array();
296 let undict = dict_array
297 .as_array()
298 .clone()
299 .execute::<PrimitiveArray>(&mut ctx)?
300 .into_array();
301 assert_arrays_eq!(undict, expected);
302 Ok(())
303 }
304}