vortex_compressor/builtins/dict/
float.rs1use vortex_array::ArrayRef;
10use vortex_array::Canonical;
11use vortex_array::IntoArray;
12use vortex_array::arrays::DictArray;
13use vortex_array::arrays::PrimitiveArray;
14use vortex_array::arrays::dict::DictArrayExt;
15use vortex_array::arrays::dict::DictArraySlotsExt;
16use vortex_array::arrays::primitive::PrimitiveArrayExt;
17use vortex_array::dtype::half::f16;
18use vortex_array::validity::Validity;
19use vortex_buffer::Buffer;
20use vortex_error::VortexExpect;
21use vortex_error::VortexResult;
22
23use crate::CascadingCompressor;
24use crate::builtins::FloatDictScheme;
25use crate::builtins::IntDictScheme;
26use crate::builtins::is_float_primitive;
27use crate::ctx::CompressorContext;
28use crate::estimate::CompressionEstimate;
29use crate::scheme::ChildSelection;
30use crate::scheme::DescendantExclusion;
31use crate::scheme::Scheme;
32use crate::scheme::SchemeExt;
33use crate::stats::ArrayAndStats;
34use crate::stats::FloatErasedStats;
35use crate::stats::FloatStats;
36use crate::stats::GenerateStatsOptions;
37
38impl Scheme for FloatDictScheme {
39 fn scheme_name(&self) -> &'static str {
40 "vortex.float.dict"
41 }
42
43 fn matches(&self, canonical: &Canonical) -> bool {
44 is_float_primitive(canonical)
45 }
46
47 fn stats_options(&self) -> GenerateStatsOptions {
48 GenerateStatsOptions {
49 count_distinct_values: true,
50 }
51 }
52
53 fn num_children(&self) -> usize {
55 2
56 }
57
58 fn descendant_exclusions(&self) -> Vec<DescendantExclusion> {
67 vec![
68 DescendantExclusion {
69 excluded: IntDictScheme.id(),
70 children: ChildSelection::One(1),
71 },
72 DescendantExclusion {
73 excluded: IntDictScheme.id(),
74 children: ChildSelection::One(0),
75 },
76 ]
77 }
78
79 fn expected_compression_ratio(
80 &self,
81 data: &mut ArrayAndStats,
82 _ctx: CompressorContext,
83 ) -> CompressionEstimate {
84 let stats = data.float_stats();
85
86 if stats.value_count() == 0 {
87 return CompressionEstimate::Skip;
88 }
89
90 let distinct_values_count = stats.distinct_count().vortex_expect(
91 "this must be present since `DictScheme` declared that we need distinct values",
92 );
93
94 if distinct_values_count > stats.value_count() / 2 {
96 return CompressionEstimate::Skip;
97 }
98
99 CompressionEstimate::Sample
101 }
102
103 fn compress(
104 &self,
105 compressor: &CascadingCompressor,
106 data: &mut ArrayAndStats,
107 ctx: CompressorContext,
108 ) -> VortexResult<ArrayRef> {
109 let stats = data.float_stats().clone();
111 let dict = dictionary_encode(data.array_as_primitive(), &stats)?;
112
113 let has_all_values_referenced = dict.has_all_values_referenced();
114
115 let compressed_values = compressor.compress_child(dict.values(), &ctx, self.id(), 0)?;
117
118 let narrowed_codes = dict
120 .codes()
121 .clone()
122 .execute::<PrimitiveArray>(&mut compressor.execution_ctx())?
123 .narrow()?
124 .into_array();
125 let compressed_codes = compressor.compress_child(&narrowed_codes, &ctx, self.id(), 1)?;
126
127 unsafe {
129 Ok(
130 DictArray::new_unchecked(compressed_codes, compressed_values)
131 .set_all_values_referenced(has_all_values_referenced)
132 .into_array(),
133 )
134 }
135 }
136}
137
138macro_rules! typed_encode {
140 ($source_array:ident, $stats:ident, $typed:ident, $typ:ty) => {{
141 let distinct = $typed.distinct().vortex_expect(
142 "this must be present since `DictScheme` declared that we need distinct values",
143 );
144
145 let values_validity = match $source_array.validity()? {
146 Validity::NonNullable => Validity::NonNullable,
147 _ => Validity::AllValid,
148 };
149 let codes_validity = $source_array.validity()?;
150
151 let values: Buffer<$typ> = distinct.distinct_values().iter().map(|x| x.0).collect();
152
153 let max_code = values.len();
154 let codes = if max_code <= u8::MAX as usize {
155 let buf = <DictEncoder as Encode<$typ, u8>>::encode(
156 &values,
157 $source_array.as_slice::<$typ>(),
158 );
159 PrimitiveArray::new(buf, codes_validity).into_array()
160 } else if max_code <= u16::MAX as usize {
161 let buf = <DictEncoder as Encode<$typ, u16>>::encode(
162 &values,
163 $source_array.as_slice::<$typ>(),
164 );
165 PrimitiveArray::new(buf, codes_validity).into_array()
166 } else {
167 let buf = <DictEncoder as Encode<$typ, u32>>::encode(
168 &values,
169 $source_array.as_slice::<$typ>(),
170 );
171 PrimitiveArray::new(buf, codes_validity).into_array()
172 };
173
174 let values = PrimitiveArray::new(values, values_validity).into_array();
175 Ok(unsafe { DictArray::new_unchecked(codes, values).set_all_values_referenced(true) })
177 }};
178}
179
180pub fn dictionary_encode(array: PrimitiveArray, stats: &FloatStats) -> VortexResult<DictArray> {
186 match stats.erased() {
187 FloatErasedStats::F16(typed) => typed_encode!(array, stats, typed, f16),
188 FloatErasedStats::F32(typed) => typed_encode!(array, stats, typed, f32),
189 FloatErasedStats::F64(typed) => typed_encode!(array, stats, typed, f64),
190 }
191}
192
193struct DictEncoder;
195
196trait Encode<T, I> {
198 fn encode(distinct: &[T], values: &[T]) -> Buffer<I>;
200}
201
202macro_rules! impl_encode {
204 ($typ:ty, $utyp:ty) => { impl_encode!($typ, $utyp, u8, u16, u32); };
205 ($typ:ty, $utyp:ty, $($ityp:ty),+) => {
206 $(
207 impl Encode<$typ, $ityp> for DictEncoder {
208 #[allow(clippy::cast_possible_truncation)]
209 fn encode(distinct: &[$typ], values: &[$typ]) -> Buffer<$ityp> {
210 let mut codes =
211 vortex_utils::aliases::hash_map::HashMap::<$utyp, $ityp>::with_capacity(
212 distinct.len(),
213 );
214 for (code, &value) in distinct.iter().enumerate() {
215 codes.insert(value.to_bits(), code as $ityp);
216 }
217
218 let mut output = vortex_buffer::BufferMut::with_capacity(values.len());
219 for value in values {
220 output.push(codes.get(&value.to_bits()).copied().unwrap_or_default());
222 }
223
224 output.freeze()
225 }
226 }
227 )*
228 };
229}
230
231impl_encode!(f16, u16);
232impl_encode!(f32, u32);
233impl_encode!(f64, u64);
234
235#[cfg(test)]
236mod tests {
237 use vortex_array::IntoArray;
238 use vortex_array::ToCanonical;
239 use vortex_array::arrays::BoolArray;
240 use vortex_array::arrays::PrimitiveArray;
241 use vortex_array::arrays::dict::DictArraySlotsExt;
242 use vortex_array::assert_arrays_eq;
243 use vortex_array::validity::Validity;
244 use vortex_buffer::buffer;
245
246 use super::dictionary_encode;
247 use crate::stats::FloatStats;
248 use crate::stats::GenerateStatsOptions;
249
250 #[test]
251 fn test_float_dict_encode() {
252 let values = buffer![1f32, 2f32, 2f32, 0f32, 1f32];
253 let validity =
254 Validity::Array(BoolArray::from_iter([true, true, true, false, true]).into_array());
255 let array = PrimitiveArray::new(values, validity);
256
257 let stats = FloatStats::generate_opts(
258 &array,
259 GenerateStatsOptions {
260 count_distinct_values: true,
261 },
262 );
263 let dict_array = dictionary_encode(array, &stats).unwrap();
264 assert_eq!(dict_array.values().len(), 2);
265 assert_eq!(dict_array.codes().len(), 5);
266
267 let expected = PrimitiveArray::new(
268 buffer![1f32, 2f32, 2f32, 1f32, 1f32],
269 Validity::Array(BoolArray::from_iter([true, true, true, false, true]).into_array()),
270 )
271 .into_array();
272 let undict = dict_array.as_array().to_primitive().into_array();
273 assert_arrays_eq!(undict, expected);
274 }
275}