vortex_compressor/builtins/dict/
float.rs1use vortex_array::ArrayRef;
10use vortex_array::ArrayView;
11use vortex_array::Canonical;
12use vortex_array::ExecutionCtx;
13use vortex_array::IntoArray;
14use vortex_array::arrays::DictArray;
15use vortex_array::arrays::Primitive;
16use vortex_array::arrays::PrimitiveArray;
17use vortex_array::arrays::dict::DictArrayExt;
18use vortex_array::arrays::dict::DictArraySlotsExt;
19use vortex_array::arrays::primitive::PrimitiveArrayExt;
20use vortex_array::dtype::half::f16;
21use vortex_array::validity::Validity;
22use vortex_buffer::Buffer;
23use vortex_error::VortexExpect;
24use vortex_error::VortexResult;
25
26use crate::CascadingCompressor;
27use crate::builtins::FloatDictScheme;
28use crate::builtins::IntDictScheme;
29use crate::builtins::is_float_primitive;
30use crate::ctx::CompressorContext;
31use crate::estimate::CompressionEstimate;
32use crate::estimate::DeferredEstimate;
33use crate::estimate::EstimateVerdict;
34use crate::scheme::ChildSelection;
35use crate::scheme::DescendantExclusion;
36use crate::scheme::Scheme;
37use crate::scheme::SchemeExt;
38use crate::stats::ArrayAndStats;
39use crate::stats::FloatErasedStats;
40use crate::stats::FloatStats;
41use crate::stats::GenerateStatsOptions;
42
43impl Scheme for FloatDictScheme {
44 fn scheme_name(&self) -> &'static str {
45 "vortex.float.dict"
46 }
47
48 fn matches(&self, canonical: &Canonical) -> bool {
49 is_float_primitive(canonical)
50 }
51
52 fn stats_options(&self) -> GenerateStatsOptions {
53 GenerateStatsOptions {
54 count_distinct_values: true,
55 }
56 }
57
58 fn num_children(&self) -> usize {
60 2
61 }
62
63 fn descendant_exclusions(&self) -> Vec<DescendantExclusion> {
72 vec![
73 DescendantExclusion {
74 excluded: IntDictScheme.id(),
75 children: ChildSelection::One(1),
76 },
77 DescendantExclusion {
78 excluded: IntDictScheme.id(),
79 children: ChildSelection::One(0),
80 },
81 ]
82 }
83
84 fn expected_compression_ratio(
85 &self,
86 data: &ArrayAndStats,
87 _compress_ctx: CompressorContext,
88 exec_ctx: &mut ExecutionCtx,
89 ) -> CompressionEstimate {
90 let stats = data.float_stats(exec_ctx);
91
92 if stats.value_count() == 0 {
93 return CompressionEstimate::Verdict(EstimateVerdict::Skip);
94 }
95
96 let distinct_values_count = stats.distinct_count().vortex_expect(
97 "this must be present since `DictScheme` declared that we need distinct values",
98 );
99
100 if distinct_values_count > stats.value_count() / 2 {
102 return CompressionEstimate::Verdict(EstimateVerdict::Skip);
103 }
104
105 CompressionEstimate::Deferred(DeferredEstimate::Sample)
107 }
108
109 fn compress(
110 &self,
111 compressor: &CascadingCompressor,
112 data: &ArrayAndStats,
113 compress_ctx: CompressorContext,
114 exec_ctx: &mut ExecutionCtx,
115 ) -> VortexResult<ArrayRef> {
116 let stats = data.float_stats(exec_ctx);
117 let dict = dictionary_encode(data.array_as_primitive(), &stats)?;
118
119 let has_all_values_referenced = dict.has_all_values_referenced();
120
121 let compressed_values =
123 compressor.compress_child(dict.values(), &compress_ctx, self.id(), 0, exec_ctx)?;
124
125 let narrowed_codes = dict
127 .codes()
128 .clone()
129 .execute::<PrimitiveArray>(exec_ctx)?
130 .narrow()?
131 .into_array();
132 let compressed_codes =
133 compressor.compress_child(&narrowed_codes, &compress_ctx, self.id(), 1, exec_ctx)?;
134
135 unsafe {
137 Ok(
138 DictArray::new_unchecked(compressed_codes, compressed_values)
139 .set_all_values_referenced(has_all_values_referenced)
140 .into_array(),
141 )
142 }
143 }
144}
145
146macro_rules! typed_encode {
148 ($source_array:ident, $stats:ident, $typed:ident, $typ:ty) => {{
149 let distinct = $typed.distinct().vortex_expect(
150 "this must be present since `DictScheme` declared that we need distinct values",
151 );
152
153 let values_validity = match $source_array.validity()? {
154 Validity::NonNullable => Validity::NonNullable,
155 _ => Validity::AllValid,
156 };
157 let codes_validity = $source_array.validity()?;
158
159 let values: Buffer<$typ> = distinct.distinct_values().iter().map(|x| x.0).collect();
160
161 let max_code = values.len();
162 let codes = if max_code <= u8::MAX as usize {
163 let buf = <DictEncoder as Encode<$typ, u8>>::encode(
164 &values,
165 $source_array.as_slice::<$typ>(),
166 );
167 PrimitiveArray::new(buf, codes_validity).into_array()
168 } else if max_code <= u16::MAX as usize {
169 let buf = <DictEncoder as Encode<$typ, u16>>::encode(
170 &values,
171 $source_array.as_slice::<$typ>(),
172 );
173 PrimitiveArray::new(buf, codes_validity).into_array()
174 } else {
175 let buf = <DictEncoder as Encode<$typ, u32>>::encode(
176 &values,
177 $source_array.as_slice::<$typ>(),
178 );
179 PrimitiveArray::new(buf, codes_validity).into_array()
180 };
181
182 let values = PrimitiveArray::new(values, values_validity).into_array();
183 Ok(unsafe { DictArray::new_unchecked(codes, values).set_all_values_referenced(true) })
185 }};
186}
187
188pub fn dictionary_encode(
194 array: ArrayView<'_, Primitive>,
195 stats: &FloatStats,
196) -> VortexResult<DictArray> {
197 match stats.erased() {
198 FloatErasedStats::F16(typed) => typed_encode!(array, stats, typed, f16),
199 FloatErasedStats::F32(typed) => typed_encode!(array, stats, typed, f32),
200 FloatErasedStats::F64(typed) => typed_encode!(array, stats, typed, f64),
201 }
202}
203
204struct DictEncoder;
206
207trait Encode<T, I> {
209 fn encode(distinct: &[T], values: &[T]) -> Buffer<I>;
211}
212
213macro_rules! impl_encode {
215 ($typ:ty, $utyp:ty) => { impl_encode!($typ, $utyp, u8, u16, u32); };
216 ($typ:ty, $utyp:ty, $($ityp:ty),+) => {
217 $(
218 impl Encode<$typ, $ityp> for DictEncoder {
219 #[expect(clippy::cast_possible_truncation)]
220 fn encode(distinct: &[$typ], values: &[$typ]) -> Buffer<$ityp> {
221 let mut codes =
222 vortex_utils::aliases::hash_map::HashMap::<$utyp, $ityp>::with_capacity(
223 distinct.len(),
224 );
225 for (code, &value) in distinct.iter().enumerate() {
226 codes.insert(value.to_bits(), code as $ityp);
227 }
228
229 let mut output = vortex_buffer::BufferMut::with_capacity(values.len());
230 for value in values {
231 output.push(codes.get(&value.to_bits()).copied().unwrap_or_default());
233 }
234
235 output.freeze()
236 }
237 }
238 )*
239 };
240}
241
242impl_encode!(f16, u16);
243impl_encode!(f32, u32);
244impl_encode!(f64, u64);
245
246#[cfg(test)]
247mod tests {
248 use vortex_array::IntoArray;
249 use vortex_array::VortexSessionExecute;
250 use vortex_array::arrays::BoolArray;
251 use vortex_array::arrays::PrimitiveArray;
252 use vortex_array::arrays::dict::DictArraySlotsExt;
253 use vortex_array::assert_arrays_eq;
254 use vortex_array::session::ArraySession;
255 use vortex_array::validity::Validity;
256 use vortex_buffer::buffer;
257 use vortex_error::VortexResult;
258 use vortex_session::VortexSession;
259
260 use super::dictionary_encode;
261 use crate::stats::FloatStats;
262 use crate::stats::GenerateStatsOptions;
263
264 #[test]
265 fn test_float_dict_encode() -> VortexResult<()> {
266 let mut ctx = VortexSession::empty()
267 .with::<ArraySession>()
268 .create_execution_ctx();
269 let values = buffer![1f32, 2f32, 2f32, 0f32, 1f32];
270 let validity =
271 Validity::Array(BoolArray::from_iter([true, true, true, false, true]).into_array());
272 let array = PrimitiveArray::new(values, validity);
273
274 let stats = FloatStats::generate_opts(
275 &array,
276 GenerateStatsOptions {
277 count_distinct_values: true,
278 },
279 &mut ctx,
280 );
281 let dict_array = dictionary_encode(array.as_view(), &stats)?;
282 assert_eq!(dict_array.values().len(), 2);
283 assert_eq!(dict_array.codes().len(), 5);
284
285 let expected = PrimitiveArray::new(
286 buffer![1f32, 2f32, 2f32, 1f32, 1f32],
287 Validity::Array(BoolArray::from_iter([true, true, true, false, true]).into_array()),
288 )
289 .into_array();
290 let undict = dict_array
291 .as_array()
292 .clone()
293 .execute::<PrimitiveArray>(&mut ctx)?
294 .into_array();
295 assert_arrays_eq!(undict, expected);
296 Ok(())
297 }
298}