vortex_compressor/stats/
float.rs1use std::hash::Hash;
7
8use itertools::Itertools;
9use num_traits::Float;
10use rustc_hash::FxBuildHasher;
11use vortex_array::arrays::PrimitiveArray;
12use vortex_array::arrays::primitive::NativeValue;
13use vortex_array::dtype::NativePType;
14use vortex_array::dtype::PType;
15use vortex_array::dtype::half::f16;
16use vortex_error::VortexExpect;
17use vortex_error::VortexResult;
18use vortex_error::vortex_err;
19use vortex_error::vortex_panic;
20use vortex_mask::AllOr;
21use vortex_utils::aliases::hash_set::HashSet;
22
23use super::GenerateStatsOptions;
24
25#[derive(Debug, Clone)]
27pub struct DistinctInfo<T> {
28 distinct_values: HashSet<NativeValue<T>, FxBuildHasher>,
30 distinct_count: u32,
32}
33
34impl<T> DistinctInfo<T> {
35 pub fn distinct_values(&self) -> &HashSet<NativeValue<T>, FxBuildHasher> {
37 &self.distinct_values
38 }
39}
40
41#[derive(Debug, Clone)]
43pub struct TypedStats<T> {
44 distinct: Option<DistinctInfo<T>>,
46}
47
48impl<T> TypedStats<T> {
49 pub fn distinct(&self) -> Option<&DistinctInfo<T>> {
51 self.distinct.as_ref()
52 }
53}
54
55#[derive(Debug, Clone)]
57pub enum ErasedStats {
58 F16(TypedStats<f16>),
60 F32(TypedStats<f32>),
62 F64(TypedStats<f64>),
64}
65
66impl ErasedStats {
67 fn distinct_count(&self) -> Option<u32> {
69 match self {
70 ErasedStats::F16(x) => x.distinct.as_ref().map(|d| d.distinct_count),
71 ErasedStats::F32(x) => x.distinct.as_ref().map(|d| d.distinct_count),
72 ErasedStats::F64(x) => x.distinct.as_ref().map(|d| d.distinct_count),
73 }
74 }
75}
76
77macro_rules! impl_from_typed {
79 ($T:ty, $variant:path) => {
80 impl From<TypedStats<$T>> for ErasedStats {
81 fn from(typed: TypedStats<$T>) -> Self {
82 $variant(typed)
83 }
84 }
85 };
86}
87
88impl_from_typed!(f16, ErasedStats::F16);
89impl_from_typed!(f32, ErasedStats::F32);
90impl_from_typed!(f64, ErasedStats::F64);
91
92#[derive(Debug, Clone)]
94pub struct FloatStats {
95 null_count: u32,
97 value_count: u32,
99 average_run_length: u32,
101 erased: ErasedStats,
103}
104
105impl FloatStats {
106 fn generate_opts_fallible(
108 input: &PrimitiveArray,
109 opts: GenerateStatsOptions,
110 ) -> VortexResult<Self> {
111 match input.ptype() {
112 PType::F16 => typed_float_stats::<f16>(input, opts.count_distinct_values),
113 PType::F32 => typed_float_stats::<f32>(input, opts.count_distinct_values),
114 PType::F64 => typed_float_stats::<f64>(input, opts.count_distinct_values),
115 _ => vortex_panic!("cannot generate FloatStats from ptype {}", input.ptype()),
116 }
117 }
118
119 pub fn distinct_count(&self) -> Option<u32> {
121 self.erased.distinct_count()
122 }
123}
124
125impl FloatStats {
126 pub fn generate(input: &PrimitiveArray) -> Self {
128 Self::generate_opts(input, GenerateStatsOptions::default())
129 }
130
131 pub fn generate_opts(input: &PrimitiveArray, opts: GenerateStatsOptions) -> Self {
133 Self::generate_opts_fallible(input, opts)
134 .vortex_expect("FloatStats::generate_opts should not fail")
135 }
136
137 pub fn null_count(&self) -> u32 {
139 self.null_count
140 }
141
142 pub fn value_count(&self) -> u32 {
144 self.value_count
145 }
146
147 pub fn average_run_length(&self) -> u32 {
149 self.average_run_length
150 }
151
152 pub fn erased(&self) -> &ErasedStats {
154 &self.erased
155 }
156}
157
158fn typed_float_stats<T: NativePType + Float>(
160 array: &PrimitiveArray,
161 count_distinct_values: bool,
162) -> VortexResult<FloatStats>
163where
164 NativeValue<T>: Hash + Eq,
165 TypedStats<T>: Into<ErasedStats>,
166{
167 if array.is_empty() {
169 return Ok(FloatStats {
170 null_count: 0,
171 value_count: 0,
172 average_run_length: 0,
173 erased: TypedStats { distinct: None }.into(),
174 });
175 }
176
177 if array.all_invalid()? {
178 return Ok(FloatStats {
179 null_count: u32::try_from(array.len())?,
180 value_count: 0,
181 average_run_length: 0,
182 erased: TypedStats { distinct: None }.into(),
183 });
184 }
185
186 let null_count = array
187 .statistics()
188 .compute_null_count()
189 .ok_or_else(|| vortex_err!("Failed to compute null_count"))?;
190 let value_count = array.len() - null_count;
191
192 let mut distinct_values = if count_distinct_values {
195 HashSet::with_capacity_and_hasher(array.len() / 2, FxBuildHasher)
196 } else {
197 HashSet::with_hasher(FxBuildHasher)
198 };
199
200 let validity = array.validity_mask()?;
201
202 let mut runs = 1;
203 let head_idx = validity
204 .first()
205 .vortex_expect("All null masks have been handled before");
206 let buff = array.to_buffer::<T>();
207 let mut prev = buff[head_idx];
208
209 let first_valid_buff = buff.slice(head_idx..array.len());
210 match validity.bit_buffer() {
211 AllOr::All => {
212 for value in first_valid_buff {
213 if count_distinct_values {
214 distinct_values.insert(NativeValue(value));
215 }
216
217 if value != prev {
218 prev = value;
219 runs += 1;
220 }
221 }
222 }
223 AllOr::None => unreachable!("All invalid arrays have been handled earlier"),
224 AllOr::Some(v) => {
225 for (&value, valid) in first_valid_buff
226 .iter()
227 .zip_eq(v.slice(head_idx..array.len()).iter())
228 {
229 if valid {
230 if count_distinct_values {
231 distinct_values.insert(NativeValue(value));
232 }
233
234 if value != prev {
235 prev = value;
236 runs += 1;
237 }
238 }
239 }
240 }
241 }
242
243 let null_count = u32::try_from(null_count)?;
244 let value_count = u32::try_from(value_count)?;
245
246 let distinct = count_distinct_values.then(|| DistinctInfo {
247 distinct_count: u32::try_from(distinct_values.len())
248 .vortex_expect("more than u32::MAX distinct values"),
249 distinct_values,
250 });
251
252 Ok(FloatStats {
253 null_count,
254 value_count,
255 average_run_length: value_count / runs,
256 erased: TypedStats { distinct }.into(),
257 })
258}
259
260#[cfg(test)]
261mod tests {
262 use vortex_array::IntoArray;
263 use vortex_array::ToCanonical;
264 use vortex_array::arrays::PrimitiveArray;
265 use vortex_array::validity::Validity;
266 use vortex_buffer::buffer;
267
268 use super::FloatStats;
269
270 #[test]
271 fn test_float_stats() {
272 let floats = buffer![0.0f32, 1.0f32, 2.0f32].into_array();
273 let floats = floats.to_primitive();
274
275 let stats = FloatStats::generate_opts(
276 &floats,
277 crate::stats::GenerateStatsOptions {
278 count_distinct_values: true,
279 },
280 );
281
282 assert_eq!(stats.value_count, 3);
283 assert_eq!(stats.null_count, 0);
284 assert_eq!(stats.average_run_length, 1);
285 assert_eq!(stats.distinct_count().unwrap(), 3);
286 }
287
288 #[test]
289 fn test_float_stats_leading_nulls() {
290 let floats = PrimitiveArray::new(
291 buffer![0.0f32, 1.0f32, 2.0f32],
292 Validity::from_iter([false, true, true]),
293 );
294
295 let stats = FloatStats::generate_opts(
296 &floats,
297 crate::stats::GenerateStatsOptions {
298 count_distinct_values: true,
299 },
300 );
301
302 assert_eq!(stats.value_count, 2);
303 assert_eq!(stats.null_count, 1);
304 assert_eq!(stats.average_run_length, 1);
305 assert_eq!(stats.distinct_count().unwrap(), 2);
306 }
307}