vortex_compressor/builtins/dict/
float.rs1use vortex_array::ArrayRef;
10use vortex_array::ArrayView;
11use vortex_array::Canonical;
12use vortex_array::IntoArray;
13use vortex_array::arrays::DictArray;
14use vortex_array::arrays::Primitive;
15use vortex_array::arrays::PrimitiveArray;
16use vortex_array::arrays::dict::DictArrayExt;
17use vortex_array::arrays::dict::DictArraySlotsExt;
18use vortex_array::arrays::primitive::PrimitiveArrayExt;
19use vortex_array::dtype::half::f16;
20use vortex_array::validity::Validity;
21use vortex_buffer::Buffer;
22use vortex_error::VortexExpect;
23use vortex_error::VortexResult;
24
25use crate::CascadingCompressor;
26use crate::builtins::FloatDictScheme;
27use crate::builtins::IntDictScheme;
28use crate::builtins::is_float_primitive;
29use crate::ctx::CompressorContext;
30use crate::estimate::CompressionEstimate;
31use crate::estimate::DeferredEstimate;
32use crate::estimate::EstimateVerdict;
33use crate::scheme::ChildSelection;
34use crate::scheme::DescendantExclusion;
35use crate::scheme::Scheme;
36use crate::scheme::SchemeExt;
37use crate::stats::ArrayAndStats;
38use crate::stats::FloatErasedStats;
39use crate::stats::FloatStats;
40use crate::stats::GenerateStatsOptions;
41
42impl Scheme for FloatDictScheme {
43 fn scheme_name(&self) -> &'static str {
44 "vortex.float.dict"
45 }
46
47 fn matches(&self, canonical: &Canonical) -> bool {
48 is_float_primitive(canonical)
49 }
50
51 fn stats_options(&self) -> GenerateStatsOptions {
52 GenerateStatsOptions {
53 count_distinct_values: true,
54 }
55 }
56
57 fn num_children(&self) -> usize {
59 2
60 }
61
62 fn descendant_exclusions(&self) -> Vec<DescendantExclusion> {
71 vec![
72 DescendantExclusion {
73 excluded: IntDictScheme.id(),
74 children: ChildSelection::One(1),
75 },
76 DescendantExclusion {
77 excluded: IntDictScheme.id(),
78 children: ChildSelection::One(0),
79 },
80 ]
81 }
82
83 fn expected_compression_ratio(
84 &self,
85 data: &mut ArrayAndStats,
86 _ctx: CompressorContext,
87 ) -> CompressionEstimate {
88 let stats = data.float_stats();
89
90 if stats.value_count() == 0 {
91 return CompressionEstimate::Verdict(EstimateVerdict::Skip);
92 }
93
94 let distinct_values_count = stats.distinct_count().vortex_expect(
95 "this must be present since `DictScheme` declared that we need distinct values",
96 );
97
98 if distinct_values_count > stats.value_count() / 2 {
100 return CompressionEstimate::Verdict(EstimateVerdict::Skip);
101 }
102
103 CompressionEstimate::Deferred(DeferredEstimate::Sample)
105 }
106
107 fn compress(
108 &self,
109 compressor: &CascadingCompressor,
110 data: &mut ArrayAndStats,
111 ctx: CompressorContext,
112 ) -> VortexResult<ArrayRef> {
113 let stats = data.float_stats().clone();
115 let dict = dictionary_encode(data.array_as_primitive(), &stats)?;
116
117 let has_all_values_referenced = dict.has_all_values_referenced();
118
119 let compressed_values = compressor.compress_child(dict.values(), &ctx, self.id(), 0)?;
121
122 let narrowed_codes = dict
124 .codes()
125 .clone()
126 .execute::<PrimitiveArray>(&mut compressor.execution_ctx())?
127 .narrow()?
128 .into_array();
129 let compressed_codes = compressor.compress_child(&narrowed_codes, &ctx, self.id(), 1)?;
130
131 unsafe {
133 Ok(
134 DictArray::new_unchecked(compressed_codes, compressed_values)
135 .set_all_values_referenced(has_all_values_referenced)
136 .into_array(),
137 )
138 }
139 }
140}
141
142macro_rules! typed_encode {
144 ($source_array:ident, $stats:ident, $typed:ident, $typ:ty) => {{
145 let distinct = $typed.distinct().vortex_expect(
146 "this must be present since `DictScheme` declared that we need distinct values",
147 );
148
149 let values_validity = match $source_array.validity()? {
150 Validity::NonNullable => Validity::NonNullable,
151 _ => Validity::AllValid,
152 };
153 let codes_validity = $source_array.validity()?;
154
155 let values: Buffer<$typ> = distinct.distinct_values().iter().map(|x| x.0).collect();
156
157 let max_code = values.len();
158 let codes = if max_code <= u8::MAX as usize {
159 let buf = <DictEncoder as Encode<$typ, u8>>::encode(
160 &values,
161 $source_array.as_slice::<$typ>(),
162 );
163 PrimitiveArray::new(buf, codes_validity).into_array()
164 } else if max_code <= u16::MAX as usize {
165 let buf = <DictEncoder as Encode<$typ, u16>>::encode(
166 &values,
167 $source_array.as_slice::<$typ>(),
168 );
169 PrimitiveArray::new(buf, codes_validity).into_array()
170 } else {
171 let buf = <DictEncoder as Encode<$typ, u32>>::encode(
172 &values,
173 $source_array.as_slice::<$typ>(),
174 );
175 PrimitiveArray::new(buf, codes_validity).into_array()
176 };
177
178 let values = PrimitiveArray::new(values, values_validity).into_array();
179 Ok(unsafe { DictArray::new_unchecked(codes, values).set_all_values_referenced(true) })
181 }};
182}
183
184pub fn dictionary_encode(
190 array: ArrayView<'_, Primitive>,
191 stats: &FloatStats,
192) -> VortexResult<DictArray> {
193 match stats.erased() {
194 FloatErasedStats::F16(typed) => typed_encode!(array, stats, typed, f16),
195 FloatErasedStats::F32(typed) => typed_encode!(array, stats, typed, f32),
196 FloatErasedStats::F64(typed) => typed_encode!(array, stats, typed, f64),
197 }
198}
199
200struct DictEncoder;
202
203trait Encode<T, I> {
205 fn encode(distinct: &[T], values: &[T]) -> Buffer<I>;
207}
208
209macro_rules! impl_encode {
211 ($typ:ty, $utyp:ty) => { impl_encode!($typ, $utyp, u8, u16, u32); };
212 ($typ:ty, $utyp:ty, $($ityp:ty),+) => {
213 $(
214 impl Encode<$typ, $ityp> for DictEncoder {
215 #[expect(clippy::cast_possible_truncation)]
216 fn encode(distinct: &[$typ], values: &[$typ]) -> Buffer<$ityp> {
217 let mut codes =
218 vortex_utils::aliases::hash_map::HashMap::<$utyp, $ityp>::with_capacity(
219 distinct.len(),
220 );
221 for (code, &value) in distinct.iter().enumerate() {
222 codes.insert(value.to_bits(), code as $ityp);
223 }
224
225 let mut output = vortex_buffer::BufferMut::with_capacity(values.len());
226 for value in values {
227 output.push(codes.get(&value.to_bits()).copied().unwrap_or_default());
229 }
230
231 output.freeze()
232 }
233 }
234 )*
235 };
236}
237
238impl_encode!(f16, u16);
239impl_encode!(f32, u32);
240impl_encode!(f64, u64);
241
242#[cfg(test)]
243mod tests {
244 use vortex_array::IntoArray;
245 use vortex_array::ToCanonical;
246 use vortex_array::arrays::BoolArray;
247 use vortex_array::arrays::PrimitiveArray;
248 use vortex_array::arrays::dict::DictArraySlotsExt;
249 use vortex_array::assert_arrays_eq;
250 use vortex_array::validity::Validity;
251 use vortex_buffer::buffer;
252
253 use super::dictionary_encode;
254 use crate::stats::FloatStats;
255 use crate::stats::GenerateStatsOptions;
256
257 #[test]
258 fn test_float_dict_encode() {
259 let values = buffer![1f32, 2f32, 2f32, 0f32, 1f32];
260 let validity =
261 Validity::Array(BoolArray::from_iter([true, true, true, false, true]).into_array());
262 let array = PrimitiveArray::new(values, validity);
263
264 let stats = FloatStats::generate_opts(
265 &array,
266 GenerateStatsOptions {
267 count_distinct_values: true,
268 },
269 );
270 let dict_array = dictionary_encode(array.as_view(), &stats).unwrap();
271 assert_eq!(dict_array.values().len(), 2);
272 assert_eq!(dict_array.codes().len(), 5);
273
274 let expected = PrimitiveArray::new(
275 buffer![1f32, 2f32, 2f32, 1f32, 1f32],
276 Validity::Array(BoolArray::from_iter([true, true, true, false, true]).into_array()),
277 )
278 .into_array();
279 let undict = dict_array.as_array().to_primitive().into_array();
280 assert_arrays_eq!(undict, expected);
281 }
282}