1use std::hash::Hash;
7
8use itertools::Itertools;
9use num_traits::Float;
10use rustc_hash::FxBuildHasher;
11use vortex_array::ExecutionCtx;
12use vortex_array::arrays::PrimitiveArray;
13use vortex_array::arrays::primitive::NativeValue;
14use vortex_array::dtype::NativePType;
15use vortex_array::dtype::PType;
16use vortex_array::dtype::half::f16;
17use vortex_error::VortexExpect;
18use vortex_error::VortexResult;
19use vortex_error::vortex_err;
20use vortex_error::vortex_panic;
21use vortex_mask::AllOr;
22use vortex_utils::aliases::hash_set::HashSet;
23
24use super::GenerateStatsOptions;
25
26#[derive(Debug, Clone)]
28pub struct DistinctInfo<T> {
29 distinct_values: HashSet<NativeValue<T>, FxBuildHasher>,
31 distinct_count: u32,
33}
34
35impl<T> DistinctInfo<T> {
36 pub fn distinct_values(&self) -> &HashSet<NativeValue<T>, FxBuildHasher> {
38 &self.distinct_values
39 }
40}
41
42#[derive(Debug, Clone)]
44pub struct TypedStats<T> {
45 distinct: Option<DistinctInfo<T>>,
47}
48
49impl<T> TypedStats<T> {
50 pub fn distinct(&self) -> Option<&DistinctInfo<T>> {
52 self.distinct.as_ref()
53 }
54}
55
56#[derive(Debug, Clone)]
58pub enum ErasedStats {
59 F16(TypedStats<f16>),
61 F32(TypedStats<f32>),
63 F64(TypedStats<f64>),
65}
66
67impl ErasedStats {
68 fn distinct_count(&self) -> Option<u32> {
70 match self {
71 ErasedStats::F16(x) => x.distinct.as_ref().map(|d| d.distinct_count),
72 ErasedStats::F32(x) => x.distinct.as_ref().map(|d| d.distinct_count),
73 ErasedStats::F64(x) => x.distinct.as_ref().map(|d| d.distinct_count),
74 }
75 }
76}
77
78macro_rules! impl_from_typed {
80 ($T:ty, $variant:path) => {
81 impl From<TypedStats<$T>> for ErasedStats {
82 fn from(typed: TypedStats<$T>) -> Self {
83 $variant(typed)
84 }
85 }
86 };
87}
88
89impl_from_typed!(f16, ErasedStats::F16);
90impl_from_typed!(f32, ErasedStats::F32);
91impl_from_typed!(f64, ErasedStats::F64);
92
93#[derive(Debug, Clone)]
95pub struct FloatStats {
96 null_count: u32,
98 value_count: u32,
100 average_run_length: u32,
102 erased: ErasedStats,
104}
105
106impl FloatStats {
107 fn generate_opts_fallible(
109 input: &PrimitiveArray,
110 opts: GenerateStatsOptions,
111 ctx: &mut ExecutionCtx,
112 ) -> VortexResult<Self> {
113 match input.ptype() {
114 PType::F16 => typed_float_stats::<f16>(input, opts.count_distinct_values, ctx),
115 PType::F32 => typed_float_stats::<f32>(input, opts.count_distinct_values, ctx),
116 PType::F64 => typed_float_stats::<f64>(input, opts.count_distinct_values, ctx),
117 _ => vortex_panic!("cannot generate FloatStats from ptype {}", input.ptype()),
118 }
119 }
120
121 pub fn distinct_count(&self) -> Option<u32> {
123 self.erased.distinct_count()
124 }
125}
126
127impl FloatStats {
128 pub fn generate(input: &PrimitiveArray, ctx: &mut ExecutionCtx) -> Self {
130 Self::generate_opts(input, GenerateStatsOptions::default(), ctx)
131 }
132
133 pub fn generate_opts(
135 input: &PrimitiveArray,
136 opts: GenerateStatsOptions,
137 ctx: &mut ExecutionCtx,
138 ) -> Self {
139 Self::generate_opts_fallible(input, opts, ctx)
140 .vortex_expect("FloatStats::generate_opts should not fail")
141 }
142
143 pub fn null_count(&self) -> u32 {
145 self.null_count
146 }
147
148 pub fn value_count(&self) -> u32 {
150 self.value_count
151 }
152
153 pub fn average_run_length(&self) -> u32 {
155 self.average_run_length
156 }
157
158 pub fn erased(&self) -> &ErasedStats {
160 &self.erased
161 }
162}
163
164fn typed_float_stats<T: NativePType + Float>(
166 array: &PrimitiveArray,
167 count_distinct_values: bool,
168 ctx: &mut ExecutionCtx,
169) -> VortexResult<FloatStats>
170where
171 NativeValue<T>: Hash + Eq,
172 TypedStats<T>: Into<ErasedStats>,
173{
174 if array.is_empty() {
176 return Ok(FloatStats {
177 null_count: 0,
178 value_count: 0,
179 average_run_length: 0,
180 erased: TypedStats { distinct: None }.into(),
181 });
182 }
183
184 if array.all_invalid(ctx)? {
185 return Ok(FloatStats {
186 null_count: u32::try_from(array.len())?,
187 value_count: 0,
188 average_run_length: 0,
189 erased: TypedStats {
190 distinct: Some(DistinctInfo {
191 distinct_values: HashSet::with_capacity_and_hasher(0, FxBuildHasher),
192 distinct_count: 0,
193 }),
194 }
195 .into(),
196 });
197 }
198
199 let null_count = array
200 .statistics()
201 .compute_null_count(ctx)
202 .ok_or_else(|| vortex_err!("Failed to compute null_count"))?;
203 let value_count = array.len() - null_count;
204
205 let mut distinct_values = if count_distinct_values {
208 HashSet::with_capacity_and_hasher(array.len() / 2, FxBuildHasher)
209 } else {
210 HashSet::with_hasher(FxBuildHasher)
211 };
212
213 let validity = array
214 .as_ref()
215 .validity()?
216 .execute_mask(array.as_ref().len(), ctx)?;
217
218 let mut runs = 1;
219 let head_idx = validity
220 .first()
221 .vortex_expect("All null masks have been handled before");
222 let buff = array.to_buffer::<T>();
223 let mut prev = buff[head_idx];
224
225 let first_valid_buff = buff.slice(head_idx..array.len());
226 match validity.bit_buffer() {
227 AllOr::All => {
228 for value in first_valid_buff {
229 if count_distinct_values {
230 distinct_values.insert(NativeValue(value));
231 }
232
233 if value != prev {
234 prev = value;
235 runs += 1;
236 }
237 }
238 }
239 AllOr::None => unreachable!("All invalid arrays have been handled earlier"),
240 AllOr::Some(v) => {
241 for (&value, valid) in first_valid_buff
242 .iter()
243 .zip_eq(v.slice(head_idx..array.len()).iter())
244 {
245 if valid {
246 if count_distinct_values {
247 distinct_values.insert(NativeValue(value));
248 }
249
250 if value != prev {
251 prev = value;
252 runs += 1;
253 }
254 }
255 }
256 }
257 }
258
259 let null_count = u32::try_from(null_count)?;
260 let value_count = u32::try_from(value_count)?;
261
262 let distinct = count_distinct_values.then(|| DistinctInfo {
263 distinct_count: u32::try_from(distinct_values.len())
264 .vortex_expect("more than u32::MAX distinct values"),
265 distinct_values,
266 });
267
268 Ok(FloatStats {
269 null_count,
270 value_count,
271 average_run_length: value_count / runs,
272 erased: TypedStats { distinct }.into(),
273 })
274}
275
276#[cfg(test)]
277mod tests {
278 use vortex_array::IntoArray;
279 use vortex_array::LEGACY_SESSION;
280 use vortex_array::VortexSessionExecute;
281 use vortex_array::arrays::PrimitiveArray;
282 use vortex_array::validity::Validity;
283 use vortex_buffer::buffer;
284 use vortex_error::VortexResult;
285
286 use super::FloatStats;
287
288 #[test]
289 fn test_float_stats() -> VortexResult<()> {
290 let mut ctx = LEGACY_SESSION.create_execution_ctx();
291 let floats = buffer![0.0f32, 1.0f32, 2.0f32].into_array();
292 let floats = floats.execute::<PrimitiveArray>(&mut ctx)?;
293
294 let stats = FloatStats::generate_opts(
295 &floats,
296 crate::stats::GenerateStatsOptions {
297 count_distinct_values: true,
298 },
299 &mut ctx,
300 );
301
302 assert_eq!(stats.value_count, 3);
303 assert_eq!(stats.null_count, 0);
304 assert_eq!(stats.average_run_length, 1);
305 assert_eq!(stats.distinct_count().unwrap(), 3);
306 Ok(())
307 }
308
309 #[test]
310 fn test_float_stats_leading_nulls() {
311 let mut ctx = LEGACY_SESSION.create_execution_ctx();
312 let floats = PrimitiveArray::new(
313 buffer![0.0f32, 1.0f32, 2.0f32],
314 Validity::from_iter([false, true, true]),
315 );
316
317 let stats = FloatStats::generate_opts(
318 &floats,
319 crate::stats::GenerateStatsOptions {
320 count_distinct_values: true,
321 },
322 &mut ctx,
323 );
324
325 assert_eq!(stats.value_count, 2);
326 assert_eq!(stats.null_count, 1);
327 assert_eq!(stats.average_run_length, 1);
328 assert_eq!(stats.distinct_count().unwrap(), 2);
329 }
330}