vortex_compressor/builtins/dict/
integer.rs1use vortex_array::ArrayRef;
10use vortex_array::ArrayView;
11use vortex_array::Canonical;
12use vortex_array::IntoArray;
13use vortex_array::arrays::DictArray;
14use vortex_array::arrays::Primitive;
15use vortex_array::arrays::PrimitiveArray;
16use vortex_array::arrays::dict::DictArrayExt;
17use vortex_array::arrays::dict::DictArraySlotsExt;
18use vortex_array::arrays::primitive::PrimitiveArrayExt;
19use vortex_array::validity::Validity;
20use vortex_buffer::Buffer;
21use vortex_error::VortexExpect;
22use vortex_error::VortexResult;
23
24use crate::CascadingCompressor;
25use crate::builtins::IntDictScheme;
26use crate::builtins::is_integer_primitive;
27use crate::ctx::CompressorContext;
28use crate::estimate::CompressionEstimate;
29use crate::estimate::EstimateVerdict;
30use crate::scheme::Scheme;
31use crate::scheme::SchemeExt;
32use crate::stats::ArrayAndStats;
33use crate::stats::GenerateStatsOptions;
34use crate::stats::IntegerErasedStats;
35use crate::stats::IntegerStats;
36
37impl Scheme for IntDictScheme {
38 fn scheme_name(&self) -> &'static str {
39 "vortex.int.dict"
40 }
41
42 fn matches(&self, canonical: &Canonical) -> bool {
43 is_integer_primitive(canonical)
44 }
45
46 fn stats_options(&self) -> GenerateStatsOptions {
47 GenerateStatsOptions {
48 count_distinct_values: true,
49 }
50 }
51
52 fn num_children(&self) -> usize {
54 2
55 }
56
57 fn expected_compression_ratio(
58 &self,
59 data: &mut ArrayAndStats,
60 _ctx: CompressorContext,
61 ) -> CompressionEstimate {
62 let bit_width = data.array_as_primitive().ptype().bit_width();
63 let stats = data.integer_stats();
64
65 if stats.value_count() == 0 {
66 return CompressionEstimate::Verdict(EstimateVerdict::Skip);
67 }
68
69 let distinct_values_count = stats.distinct_count().vortex_expect(
70 "this must be present since `DictScheme` declared that we need distinct values",
71 );
72
73 if distinct_values_count > stats.value_count() / 2 {
75 return CompressionEstimate::Verdict(EstimateVerdict::Skip);
76 }
77
78 let values_size = bit_width * distinct_values_count as usize;
81
82 let codes_bw = u32::BITS - distinct_values_count.leading_zeros();
85
86 let n_runs = (stats.value_count() / stats.average_run_length()) as usize;
87
88 let codes_size_bp = codes_bw as usize * stats.value_count() as usize;
90 let codes_size_rle_bp = usize::checked_mul(codes_bw as usize + 32, n_runs);
91
92 let codes_size = usize::min(codes_size_bp, codes_size_rle_bp.unwrap_or(usize::MAX));
93
94 let before = stats.value_count() as usize * bit_width;
95
96 CompressionEstimate::Verdict(EstimateVerdict::Ratio(
97 before as f64 / (values_size + codes_size) as f64,
98 ))
99 }
100
101 fn compress(
102 &self,
103 compressor: &CascadingCompressor,
104 data: &mut ArrayAndStats,
105 ctx: CompressorContext,
106 ) -> VortexResult<ArrayRef> {
107 let stats = data.integer_stats().clone();
109 let dict = dictionary_encode(data.array_as_primitive(), &stats)?;
110
111 let compressed_values = compressor.compress_child(dict.values(), &ctx, self.id(), 0)?;
113
114 let narrowed_codes = dict
116 .codes()
117 .clone()
118 .execute::<PrimitiveArray>(&mut compressor.execution_ctx())?
119 .narrow()?
120 .into_array();
121 let compressed_codes = compressor.compress_child(&narrowed_codes, &ctx, self.id(), 1)?;
122
123 unsafe {
125 Ok(
126 DictArray::new_unchecked(compressed_codes, compressed_values)
127 .set_all_values_referenced(dict.has_all_values_referenced())
128 .into_array(),
129 )
130 }
131 }
132}
133
134macro_rules! typed_encode {
136 ($source_array:ident, $stats:ident, $typed:ident, $typ:ty) => {{
137 let distinct = $typed.distinct().vortex_expect(
138 "this must be present since `DictScheme` declared that we need distinct values",
139 );
140
141 let values_validity = match $source_array.validity()? {
142 Validity::NonNullable => Validity::NonNullable,
143 _ => Validity::AllValid,
144 };
145 let codes_validity = $source_array.validity()?;
146
147 let values: Buffer<$typ> = distinct.distinct_values().keys().map(|x| x.0).collect();
148
149 let max_code = values.len();
150 let codes = if max_code <= u8::MAX as usize {
151 let buf = <DictEncoder as Encode<$typ, u8>>::encode(
152 &values,
153 $source_array.as_slice::<$typ>(),
154 );
155 PrimitiveArray::new(buf, codes_validity).into_array()
156 } else if max_code <= u16::MAX as usize {
157 let buf = <DictEncoder as Encode<$typ, u16>>::encode(
158 &values,
159 $source_array.as_slice::<$typ>(),
160 );
161 PrimitiveArray::new(buf, codes_validity).into_array()
162 } else {
163 let buf = <DictEncoder as Encode<$typ, u32>>::encode(
164 &values,
165 $source_array.as_slice::<$typ>(),
166 );
167 PrimitiveArray::new(buf, codes_validity).into_array()
168 };
169
170 let values = PrimitiveArray::new(values, values_validity).into_array();
171 Ok(unsafe { DictArray::new_unchecked(codes, values).set_all_values_referenced(true) })
173 }};
174}
175
176#[expect(
182 clippy::cognitive_complexity,
183 reason = "complexity from match on all integer types"
184)]
185pub fn dictionary_encode(
186 array: ArrayView<'_, Primitive>,
187 stats: &IntegerStats,
188) -> VortexResult<DictArray> {
189 match stats.erased() {
190 IntegerErasedStats::U8(typed) => typed_encode!(array, stats, typed, u8),
191 IntegerErasedStats::U16(typed) => typed_encode!(array, stats, typed, u16),
192 IntegerErasedStats::U32(typed) => typed_encode!(array, stats, typed, u32),
193 IntegerErasedStats::U64(typed) => typed_encode!(array, stats, typed, u64),
194 IntegerErasedStats::I8(typed) => typed_encode!(array, stats, typed, i8),
195 IntegerErasedStats::I16(typed) => typed_encode!(array, stats, typed, i16),
196 IntegerErasedStats::I32(typed) => typed_encode!(array, stats, typed, i32),
197 IntegerErasedStats::I64(typed) => typed_encode!(array, stats, typed, i64),
198 }
199}
200
201struct DictEncoder;
203
204trait Encode<T, I> {
206 fn encode(distinct: &[T], values: &[T]) -> Buffer<I>;
208}
209
210macro_rules! impl_encode {
212 ($typ:ty) => { impl_encode!($typ, u8, u16, u32); };
213 ($typ:ty, $($ityp:ty),+) => {
214 $(
215 impl Encode<$typ, $ityp> for DictEncoder {
216 #[expect(clippy::cast_possible_truncation)]
217 fn encode(distinct: &[$typ], values: &[$typ]) -> Buffer<$ityp> {
218 let mut codes =
219 vortex_utils::aliases::hash_map::HashMap::<$typ, $ityp>::with_capacity(
220 distinct.len(),
221 );
222 for (code, &value) in distinct.iter().enumerate() {
223 codes.insert(value, code as $ityp);
224 }
225
226 let mut output = vortex_buffer::BufferMut::with_capacity(values.len());
227 for value in values {
228 unsafe { output.push_unchecked(codes.get(value).copied().unwrap_or_default()) };
231 }
232
233 output.freeze()
234 }
235 }
236 )*
237 };
238}
239
240impl_encode!(u8);
241impl_encode!(u16);
242impl_encode!(u32);
243impl_encode!(u64);
244impl_encode!(i8);
245impl_encode!(i16);
246impl_encode!(i32);
247impl_encode!(i64);
248
249#[cfg(test)]
250mod tests {
251 use vortex_array::IntoArray;
252 use vortex_array::ToCanonical;
253 use vortex_array::arrays::BoolArray;
254 use vortex_array::arrays::PrimitiveArray;
255 use vortex_array::arrays::dict::DictArraySlotsExt;
256 use vortex_array::assert_arrays_eq;
257 use vortex_array::validity::Validity;
258 use vortex_buffer::buffer;
259
260 use super::dictionary_encode;
261 use crate::stats::IntegerStats;
262
263 #[test]
264 fn test_dict_encode_integer_stats() {
265 let data = buffer![100i32, 200, 100, 0, 100];
266 let validity =
267 Validity::Array(BoolArray::from_iter([true, true, true, false, true]).into_array());
268 let array = PrimitiveArray::new(data, validity);
269
270 let stats = IntegerStats::generate_opts(
271 &array,
272 crate::stats::GenerateStatsOptions {
273 count_distinct_values: true,
274 },
275 );
276 let dict_array = dictionary_encode(array.as_view(), &stats).unwrap();
277 assert_eq!(dict_array.values().len(), 2);
278 assert_eq!(dict_array.codes().len(), 5);
279
280 let expected = PrimitiveArray::new(
281 buffer![100i32, 200, 100, 100, 100],
282 Validity::Array(BoolArray::from_iter([true, true, true, false, true]).into_array()),
283 )
284 .into_array();
285 let undict = dict_array.as_array().to_primitive().into_array();
286 assert_arrays_eq!(undict, expected);
287 }
288}