vortex_compressor/builtins/dict/
integer.rs1use vortex_array::ArrayRef;
10use vortex_array::Canonical;
11use vortex_array::IntoArray;
12use vortex_array::arrays::DictArray;
13use vortex_array::arrays::PrimitiveArray;
14use vortex_array::arrays::dict::DictArrayExt;
15use vortex_array::arrays::dict::DictArraySlotsExt;
16use vortex_array::arrays::primitive::PrimitiveArrayExt;
17use vortex_array::validity::Validity;
18use vortex_buffer::Buffer;
19use vortex_error::VortexExpect;
20use vortex_error::VortexResult;
21
22use crate::CascadingCompressor;
23use crate::builtins::IntDictScheme;
24use crate::builtins::is_integer_primitive;
25use crate::ctx::CompressorContext;
26use crate::estimate::CompressionEstimate;
27use crate::scheme::Scheme;
28use crate::scheme::SchemeExt;
29use crate::stats::ArrayAndStats;
30use crate::stats::GenerateStatsOptions;
31use crate::stats::IntegerErasedStats;
32use crate::stats::IntegerStats;
33
34impl Scheme for IntDictScheme {
35 fn scheme_name(&self) -> &'static str {
36 "vortex.int.dict"
37 }
38
39 fn matches(&self, canonical: &Canonical) -> bool {
40 is_integer_primitive(canonical)
41 }
42
43 fn stats_options(&self) -> GenerateStatsOptions {
44 GenerateStatsOptions {
45 count_distinct_values: true,
46 }
47 }
48
49 fn num_children(&self) -> usize {
51 2
52 }
53
54 fn expected_compression_ratio(
55 &self,
56 data: &mut ArrayAndStats,
57 _ctx: CompressorContext,
58 ) -> CompressionEstimate {
59 let bit_width = data.array_as_primitive().ptype().bit_width();
60 let stats = data.integer_stats();
61
62 if stats.value_count() == 0 {
63 return CompressionEstimate::Skip;
64 }
65
66 let distinct_values_count = stats.distinct_count().vortex_expect(
67 "this must be present since `DictScheme` declared that we need distinct values",
68 );
69
70 if distinct_values_count > stats.value_count() / 2 {
72 return CompressionEstimate::Skip;
73 }
74
75 let values_size = bit_width * distinct_values_count as usize;
78
79 let codes_bw = u32::BITS - distinct_values_count.leading_zeros();
82
83 let n_runs = (stats.value_count() / stats.average_run_length()) as usize;
84
85 let codes_size_bp = codes_bw as usize * stats.value_count() as usize;
87 let codes_size_rle_bp = usize::checked_mul(codes_bw as usize + 32, n_runs);
88
89 let codes_size = usize::min(codes_size_bp, codes_size_rle_bp.unwrap_or(usize::MAX));
90
91 let before = stats.value_count() as usize * bit_width;
92
93 CompressionEstimate::Ratio(before as f64 / (values_size + codes_size) as f64)
94 }
95
96 fn compress(
97 &self,
98 compressor: &CascadingCompressor,
99 data: &mut ArrayAndStats,
100 ctx: CompressorContext,
101 ) -> VortexResult<ArrayRef> {
102 let stats = data.integer_stats().clone();
104 let dict = dictionary_encode(data.array_as_primitive(), &stats)?;
105
106 let compressed_values = compressor.compress_child(dict.values(), &ctx, self.id(), 0)?;
108
109 let narrowed_codes = dict
111 .codes()
112 .clone()
113 .execute::<PrimitiveArray>(&mut compressor.execution_ctx())?
114 .narrow()?
115 .into_array();
116 let compressed_codes = compressor.compress_child(&narrowed_codes, &ctx, self.id(), 1)?;
117
118 unsafe {
120 Ok(
121 DictArray::new_unchecked(compressed_codes, compressed_values)
122 .set_all_values_referenced(dict.has_all_values_referenced())
123 .into_array(),
124 )
125 }
126 }
127}
128
129macro_rules! typed_encode {
131 ($source_array:ident, $stats:ident, $typed:ident, $typ:ty) => {{
132 let distinct = $typed.distinct().vortex_expect(
133 "this must be present since `DictScheme` declared that we need distinct values",
134 );
135
136 let values_validity = match $source_array.validity()? {
137 Validity::NonNullable => Validity::NonNullable,
138 _ => Validity::AllValid,
139 };
140 let codes_validity = $source_array.validity()?;
141
142 let values: Buffer<$typ> = distinct.distinct_values().keys().map(|x| x.0).collect();
143
144 let max_code = values.len();
145 let codes = if max_code <= u8::MAX as usize {
146 let buf = <DictEncoder as Encode<$typ, u8>>::encode(
147 &values,
148 $source_array.as_slice::<$typ>(),
149 );
150 PrimitiveArray::new(buf, codes_validity).into_array()
151 } else if max_code <= u16::MAX as usize {
152 let buf = <DictEncoder as Encode<$typ, u16>>::encode(
153 &values,
154 $source_array.as_slice::<$typ>(),
155 );
156 PrimitiveArray::new(buf, codes_validity).into_array()
157 } else {
158 let buf = <DictEncoder as Encode<$typ, u32>>::encode(
159 &values,
160 $source_array.as_slice::<$typ>(),
161 );
162 PrimitiveArray::new(buf, codes_validity).into_array()
163 };
164
165 let values = PrimitiveArray::new(values, values_validity).into_array();
166 Ok(unsafe { DictArray::new_unchecked(codes, values).set_all_values_referenced(true) })
168 }};
169}
170
171#[expect(
177 clippy::cognitive_complexity,
178 reason = "complexity from match on all integer types"
179)]
180pub fn dictionary_encode(array: PrimitiveArray, stats: &IntegerStats) -> VortexResult<DictArray> {
181 match stats.erased() {
182 IntegerErasedStats::U8(typed) => typed_encode!(array, stats, typed, u8),
183 IntegerErasedStats::U16(typed) => typed_encode!(array, stats, typed, u16),
184 IntegerErasedStats::U32(typed) => typed_encode!(array, stats, typed, u32),
185 IntegerErasedStats::U64(typed) => typed_encode!(array, stats, typed, u64),
186 IntegerErasedStats::I8(typed) => typed_encode!(array, stats, typed, i8),
187 IntegerErasedStats::I16(typed) => typed_encode!(array, stats, typed, i16),
188 IntegerErasedStats::I32(typed) => typed_encode!(array, stats, typed, i32),
189 IntegerErasedStats::I64(typed) => typed_encode!(array, stats, typed, i64),
190 }
191}
192
193struct DictEncoder;
195
196trait Encode<T, I> {
198 fn encode(distinct: &[T], values: &[T]) -> Buffer<I>;
200}
201
202macro_rules! impl_encode {
204 ($typ:ty) => { impl_encode!($typ, u8, u16, u32); };
205 ($typ:ty, $($ityp:ty),+) => {
206 $(
207 impl Encode<$typ, $ityp> for DictEncoder {
208 #[allow(clippy::cast_possible_truncation)]
209 fn encode(distinct: &[$typ], values: &[$typ]) -> Buffer<$ityp> {
210 let mut codes =
211 vortex_utils::aliases::hash_map::HashMap::<$typ, $ityp>::with_capacity(
212 distinct.len(),
213 );
214 for (code, &value) in distinct.iter().enumerate() {
215 codes.insert(value, code as $ityp);
216 }
217
218 let mut output = vortex_buffer::BufferMut::with_capacity(values.len());
219 for value in values {
220 unsafe { output.push_unchecked(codes.get(value).copied().unwrap_or_default()) };
223 }
224
225 output.freeze()
226 }
227 }
228 )*
229 };
230}
231
232impl_encode!(u8);
233impl_encode!(u16);
234impl_encode!(u32);
235impl_encode!(u64);
236impl_encode!(i8);
237impl_encode!(i16);
238impl_encode!(i32);
239impl_encode!(i64);
240
241#[cfg(test)]
242mod tests {
243 use vortex_array::IntoArray;
244 use vortex_array::ToCanonical;
245 use vortex_array::arrays::BoolArray;
246 use vortex_array::arrays::PrimitiveArray;
247 use vortex_array::arrays::dict::DictArraySlotsExt;
248 use vortex_array::assert_arrays_eq;
249 use vortex_array::validity::Validity;
250 use vortex_buffer::buffer;
251
252 use super::dictionary_encode;
253 use crate::stats::IntegerStats;
254
255 #[test]
256 fn test_dict_encode_integer_stats() {
257 let data = buffer![100i32, 200, 100, 0, 100];
258 let validity =
259 Validity::Array(BoolArray::from_iter([true, true, true, false, true]).into_array());
260 let array = PrimitiveArray::new(data, validity);
261
262 let stats = IntegerStats::generate_opts(
263 &array,
264 crate::stats::GenerateStatsOptions {
265 count_distinct_values: true,
266 },
267 );
268 let dict_array = dictionary_encode(array, &stats).unwrap();
269 assert_eq!(dict_array.values().len(), 2);
270 assert_eq!(dict_array.codes().len(), 5);
271
272 let expected = PrimitiveArray::new(
273 buffer![100i32, 200, 100, 100, 100],
274 Validity::Array(BoolArray::from_iter([true, true, true, false, true]).into_array()),
275 )
276 .into_array();
277 let undict = dict_array.as_array().to_primitive().into_array();
278 assert_arrays_eq!(undict, expected);
279 }
280}