1use std::hash::Hash;
7
8use num_traits::PrimInt;
9use rustc_hash::FxBuildHasher;
10use vortex_array::LEGACY_SESSION;
11use vortex_array::VortexSessionExecute;
12use vortex_array::arrays::PrimitiveArray;
13use vortex_array::arrays::primitive::NativeValue;
14use vortex_array::dtype::IntegerPType;
15use vortex_array::expr::stats::Stat;
16use vortex_array::match_each_integer_ptype;
17use vortex_array::scalar::PValue;
18use vortex_array::scalar::Scalar;
19use vortex_buffer::BitBuffer;
20use vortex_error::VortexError;
21use vortex_error::VortexExpect;
22use vortex_error::VortexResult;
23use vortex_mask::AllOr;
24use vortex_utils::aliases::hash_map::HashMap;
25
26use super::GenerateStatsOptions;
27
28#[derive(Debug, Clone)]
30pub struct DistinctInfo<T> {
31 distinct_values: HashMap<NativeValue<T>, u32, FxBuildHasher>,
33 distinct_count: u32,
35 most_frequent_value: T,
37 top_frequency: u32,
39}
40
41impl<T> DistinctInfo<T> {
42 pub fn distinct_values(&self) -> &HashMap<NativeValue<T>, u32, FxBuildHasher> {
44 &self.distinct_values
45 }
46}
47
48#[derive(Debug, Clone)]
50pub struct TypedStats<T> {
51 min: T,
53 max: T,
55 distinct: Option<DistinctInfo<T>>,
57}
58
59impl<T> TypedStats<T> {
60 pub fn distinct(&self) -> Option<&DistinctInfo<T>> {
62 self.distinct.as_ref()
63 }
64}
65
66impl<T> TypedStats<T> {
67 fn distinct_count(&self) -> Option<u32> {
69 Some(self.distinct.as_ref()?.distinct_count)
70 }
71
72 fn most_frequent_value_and_count(&self) -> Option<(&T, u32)> {
74 let distinct = self.distinct.as_ref()?;
75 Some((&distinct.most_frequent_value, distinct.top_frequency))
76 }
77}
78
79#[derive(Clone, Debug)]
84pub enum ErasedStats {
85 U8(TypedStats<u8>),
87 U16(TypedStats<u16>),
89 U32(TypedStats<u32>),
91 U64(TypedStats<u64>),
93 I8(TypedStats<i8>),
95 I16(TypedStats<i16>),
97 I32(TypedStats<i32>),
99 I64(TypedStats<i64>),
101}
102
103impl ErasedStats {
104 pub fn min_is_zero(&self) -> bool {
106 match &self {
107 ErasedStats::U8(x) => x.min == 0,
108 ErasedStats::U16(x) => x.min == 0,
109 ErasedStats::U32(x) => x.min == 0,
110 ErasedStats::U64(x) => x.min == 0,
111 ErasedStats::I8(x) => x.min == 0,
112 ErasedStats::I16(x) => x.min == 0,
113 ErasedStats::I32(x) => x.min == 0,
114 ErasedStats::I64(x) => x.min == 0,
115 }
116 }
117
118 pub fn min_is_negative(&self) -> bool {
120 match &self {
121 ErasedStats::U8(_)
122 | ErasedStats::U16(_)
123 | ErasedStats::U32(_)
124 | ErasedStats::U64(_) => false,
125 ErasedStats::I8(x) => x.min < 0,
126 ErasedStats::I16(x) => x.min < 0,
127 ErasedStats::I32(x) => x.min < 0,
128 ErasedStats::I64(x) => x.min < 0,
129 }
130 }
131
132 pub fn max_minus_min(&self) -> u64 {
134 match &self {
135 ErasedStats::U8(x) => (x.max - x.min) as u64,
136 ErasedStats::U16(x) => (x.max - x.min) as u64,
137 ErasedStats::U32(x) => (x.max - x.min) as u64,
138 ErasedStats::U64(x) => x.max - x.min,
139 ErasedStats::I8(x) => (x.max as i16 - x.min as i16) as u64,
140 ErasedStats::I16(x) => (x.max as i32 - x.min as i32) as u64,
141 ErasedStats::I32(x) => (x.max as i64 - x.min as i64) as u64,
142 ErasedStats::I64(x) => u64::try_from(x.max as i128 - x.min as i128)
143 .vortex_expect("max minus min result bigger than u64"),
144 }
145 }
146
147 pub fn max_ilog2(&self) -> Option<u32> {
156 match &self {
157 ErasedStats::U8(x) => x.max.checked_ilog2(),
158 ErasedStats::U16(x) => x.max.checked_ilog2(),
159 ErasedStats::U32(x) => x.max.checked_ilog2(),
160 ErasedStats::U64(x) => x.max.checked_ilog2(),
161 ErasedStats::I8(x) => (x.max as u8).checked_ilog2(),
163 ErasedStats::I16(x) => (x.max as u16).checked_ilog2(),
164 ErasedStats::I32(x) => (x.max as u32).checked_ilog2(),
165 ErasedStats::I64(x) => (x.max as u64).checked_ilog2(),
166 }
167 }
168
169 pub fn distinct_count(&self) -> Option<u32> {
171 match &self {
172 ErasedStats::U8(x) => x.distinct_count(),
173 ErasedStats::U16(x) => x.distinct_count(),
174 ErasedStats::U32(x) => x.distinct_count(),
175 ErasedStats::U64(x) => x.distinct_count(),
176 ErasedStats::I8(x) => x.distinct_count(),
177 ErasedStats::I16(x) => x.distinct_count(),
178 ErasedStats::I32(x) => x.distinct_count(),
179 ErasedStats::I64(x) => x.distinct_count(),
180 }
181 }
182
183 pub fn most_frequent_value_and_count(&self) -> Option<(PValue, u32)> {
185 match &self {
186 ErasedStats::U8(x) => {
187 let (top_value, count) = x.most_frequent_value_and_count()?;
188 Some(((*top_value).into(), count))
189 }
190 ErasedStats::U16(x) => {
191 let (top_value, count) = x.most_frequent_value_and_count()?;
192 Some(((*top_value).into(), count))
193 }
194 ErasedStats::U32(x) => {
195 let (top_value, count) = x.most_frequent_value_and_count()?;
196 Some(((*top_value).into(), count))
197 }
198 ErasedStats::U64(x) => {
199 let (top_value, count) = x.most_frequent_value_and_count()?;
200 Some(((*top_value).into(), count))
201 }
202 ErasedStats::I8(x) => {
203 let (top_value, count) = x.most_frequent_value_and_count()?;
204 Some(((*top_value).into(), count))
205 }
206 ErasedStats::I16(x) => {
207 let (top_value, count) = x.most_frequent_value_and_count()?;
208 Some(((*top_value).into(), count))
209 }
210 ErasedStats::I32(x) => {
211 let (top_value, count) = x.most_frequent_value_and_count()?;
212 Some(((*top_value).into(), count))
213 }
214 ErasedStats::I64(x) => {
215 let (top_value, count) = x.most_frequent_value_and_count()?;
216 Some(((*top_value).into(), count))
217 }
218 }
219 }
220}
221
222macro_rules! impl_from_typed {
224 ($T:ty, $variant:path) => {
225 impl From<TypedStats<$T>> for ErasedStats {
226 fn from(typed: TypedStats<$T>) -> Self {
227 $variant(typed)
228 }
229 }
230 };
231}
232
233impl_from_typed!(u8, ErasedStats::U8);
234impl_from_typed!(u16, ErasedStats::U16);
235impl_from_typed!(u32, ErasedStats::U32);
236impl_from_typed!(u64, ErasedStats::U64);
237impl_from_typed!(i8, ErasedStats::I8);
238impl_from_typed!(i16, ErasedStats::I16);
239impl_from_typed!(i32, ErasedStats::I32);
240impl_from_typed!(i64, ErasedStats::I64);
241
242#[derive(Clone, Debug)]
244pub struct IntegerStats {
245 null_count: u32,
247 value_count: u32,
249 average_run_length: u32,
251 erased: ErasedStats,
253}
254
255impl IntegerStats {
256 fn generate_opts_fallible(
258 input: &PrimitiveArray,
259 opts: GenerateStatsOptions,
260 ) -> VortexResult<Self> {
261 match_each_integer_ptype!(input.ptype(), |T| {
262 typed_int_stats::<T>(input, opts.count_distinct_values)
263 })
264 }
265
266 pub fn distinct_count(&self) -> Option<u32> {
268 self.erased.distinct_count()
269 }
270
271 pub fn most_frequent_value_and_count(&self) -> Option<(PValue, u32)> {
273 self.erased.most_frequent_value_and_count()
274 }
275}
276
277impl IntegerStats {
278 pub fn generate(input: &PrimitiveArray) -> Self {
280 Self::generate_opts(input, GenerateStatsOptions::default())
281 }
282
283 pub fn generate_opts(input: &PrimitiveArray, opts: GenerateStatsOptions) -> Self {
285 Self::generate_opts_fallible(input, opts)
286 .vortex_expect("IntegerStats::generate_opts should not fail")
287 }
288
289 pub fn null_count(&self) -> u32 {
291 self.null_count
292 }
293
294 pub fn value_count(&self) -> u32 {
296 self.value_count
297 }
298
299 pub fn average_run_length(&self) -> u32 {
301 self.average_run_length
302 }
303
304 pub fn erased(&self) -> &ErasedStats {
306 &self.erased
307 }
308}
309
310fn typed_int_stats<T>(
312 array: &PrimitiveArray,
313 count_distinct_values: bool,
314) -> VortexResult<IntegerStats>
315where
316 T: IntegerPType + PrimInt + for<'a> TryFrom<&'a Scalar, Error = VortexError>,
317 TypedStats<T>: Into<ErasedStats>,
318 NativeValue<T>: Eq + Hash,
319{
320 if array.is_empty() {
322 return Ok(IntegerStats {
323 null_count: 0,
324 value_count: 0,
325 average_run_length: 0,
326 erased: TypedStats {
327 min: T::max_value(),
328 max: T::min_value(),
329 distinct: None,
330 }
331 .into(),
332 });
333 }
334
335 let mut ctx = LEGACY_SESSION.create_execution_ctx();
336 if array.all_invalid(&mut ctx)? {
337 return Ok(IntegerStats {
338 null_count: u32::try_from(array.len())?,
339 value_count: 0,
340 average_run_length: 0,
341 erased: TypedStats {
342 min: T::max_value(),
343 max: T::min_value(),
344 distinct: Some(DistinctInfo {
345 distinct_values: HashMap::with_capacity_and_hasher(0, FxBuildHasher),
346 distinct_count: 0,
347 most_frequent_value: T::zero(),
348 top_frequency: 0,
349 }),
350 }
351 .into(),
352 });
353 }
354
355 let validity = array.as_ref().validity()?.to_mask(
356 array.as_ref().len(),
357 &mut LEGACY_SESSION.create_execution_ctx(),
358 )?;
359 let null_count = validity.false_count();
360 let value_count = validity.true_count();
361
362 let head_idx = validity
364 .first()
365 .vortex_expect("All null masks have been handled before");
366 let buffer = array.to_buffer::<T>();
367 let head = buffer[head_idx];
368
369 let mut loop_state = LoopState {
370 distinct_values: if count_distinct_values {
371 HashMap::with_capacity_and_hasher(array.len() / 2, FxBuildHasher)
372 } else {
373 HashMap::with_hasher(FxBuildHasher)
374 },
375 prev: head,
376 runs: 1,
377 };
378
379 let sliced = buffer.slice(head_idx..array.len());
380 let mut chunks = sliced.as_slice().chunks_exact(64);
381 match validity.bit_buffer() {
382 AllOr::All => {
383 for chunk in &mut chunks {
384 inner_loop_nonnull(
385 chunk.try_into().ok().vortex_expect("chunk size must be 64"),
386 count_distinct_values,
387 &mut loop_state,
388 )
389 }
390 let remainder = chunks.remainder();
391 inner_loop_naive(
392 remainder,
393 count_distinct_values,
394 &BitBuffer::new_set(remainder.len()),
395 &mut loop_state,
396 );
397 }
398 AllOr::None => unreachable!("All invalid arrays have been handled before"),
399 AllOr::Some(v) => {
400 let mask = v.slice(head_idx..array.len());
401 let mut offset = 0;
402 for chunk in &mut chunks {
403 let validity = mask.slice(offset..(offset + 64));
404 offset += 64;
405
406 match validity.true_count() {
407 0 => continue,
409 64 => inner_loop_nonnull(
411 chunk.try_into().ok().vortex_expect("chunk size must be 64"),
412 count_distinct_values,
413 &mut loop_state,
414 ),
415 _ => inner_loop_nullable(
417 chunk.try_into().ok().vortex_expect("chunk size must be 64"),
418 count_distinct_values,
419 &validity,
420 &mut loop_state,
421 ),
422 }
423 }
424 let remainder = chunks.remainder();
426 inner_loop_naive(
427 remainder,
428 count_distinct_values,
429 &mask.slice(offset..(offset + remainder.len())),
430 &mut loop_state,
431 );
432 }
433 }
434
435 let runs = loop_state.runs;
436
437 let array_ref = array.as_ref();
438 let min = array_ref
439 .statistics()
440 .compute_as::<T>(Stat::Min, &mut ctx)
441 .vortex_expect("min should be computed");
442
443 let max = array_ref
444 .statistics()
445 .compute_as::<T>(Stat::Max, &mut ctx)
446 .vortex_expect("max should be computed");
447
448 let distinct = count_distinct_values.then(|| {
449 let (&top_value, &top_count) = loop_state
450 .distinct_values
451 .iter()
452 .max_by_key(|&(_, &count)| count)
453 .vortex_expect("we know this is non-empty");
454
455 DistinctInfo {
456 distinct_count: u32::try_from(loop_state.distinct_values.len())
457 .vortex_expect("there are more than `u32::MAX` distinct values"),
458 most_frequent_value: top_value.0,
459 top_frequency: top_count,
460 distinct_values: loop_state.distinct_values,
461 }
462 });
463
464 let typed = TypedStats { min, max, distinct };
465
466 let null_count = u32::try_from(null_count)?;
467 let value_count = u32::try_from(value_count)?;
468
469 Ok(IntegerStats {
470 null_count,
471 value_count,
472 average_run_length: value_count / runs,
473 erased: typed.into(),
474 })
475}
476
477struct LoopState<T> {
479 prev: T,
481 runs: u32,
483 distinct_values: HashMap<NativeValue<T>, u32, FxBuildHasher>,
485}
486
487#[inline(always)]
489fn inner_loop_nonnull<T: IntegerPType>(
490 values: &[T; 64],
491 count_distinct_values: bool,
492 state: &mut LoopState<T>,
493) where
494 NativeValue<T>: Eq + Hash,
495{
496 for &value in values {
497 if count_distinct_values {
498 *state.distinct_values.entry(NativeValue(value)).or_insert(0) += 1;
499 }
500
501 if value != state.prev {
502 state.prev = value;
503 state.runs += 1;
504 }
505 }
506}
507
508#[inline(always)]
510fn inner_loop_nullable<T: IntegerPType>(
511 values: &[T; 64],
512 count_distinct_values: bool,
513 is_valid: &BitBuffer,
514 state: &mut LoopState<T>,
515) where
516 NativeValue<T>: Eq + Hash,
517{
518 for (idx, &value) in values.iter().enumerate() {
519 if is_valid.value(idx) {
520 if count_distinct_values {
521 *state.distinct_values.entry(NativeValue(value)).or_insert(0) += 1;
522 }
523
524 if value != state.prev {
525 state.prev = value;
526 state.runs += 1;
527 }
528 }
529 }
530}
531
532#[inline(always)]
534fn inner_loop_naive<T: IntegerPType>(
535 values: &[T],
536 count_distinct_values: bool,
537 is_valid: &BitBuffer,
538 state: &mut LoopState<T>,
539) where
540 NativeValue<T>: Eq + Hash,
541{
542 for (idx, &value) in values.iter().enumerate() {
543 if is_valid.value(idx) {
544 if count_distinct_values {
545 *state.distinct_values.entry(NativeValue(value)).or_insert(0) += 1;
546 }
547
548 if value != state.prev {
549 state.prev = value;
550 state.runs += 1;
551 }
552 }
553 }
554}
555
556#[cfg(test)]
557mod tests {
558 use std::iter;
559
560 use vortex_array::arrays::PrimitiveArray;
561 use vortex_array::validity::Validity;
562 use vortex_buffer::BitBuffer;
563 use vortex_buffer::Buffer;
564 use vortex_buffer::buffer;
565 use vortex_error::VortexResult;
566
567 use super::IntegerStats;
568 use super::typed_int_stats;
569
570 #[test]
571 fn test_naive_count_distinct_values() -> VortexResult<()> {
572 let array = PrimitiveArray::new(buffer![217u8, 0], Validity::NonNullable);
573 let stats = typed_int_stats::<u8>(&array, true)?;
574 assert_eq!(stats.distinct_count().unwrap(), 2);
575 Ok(())
576 }
577
578 #[test]
579 fn test_naive_count_distinct_values_nullable() -> VortexResult<()> {
580 let array = PrimitiveArray::new(
581 buffer![217u8, 0],
582 Validity::from(BitBuffer::from(vec![true, false])),
583 );
584 let stats = typed_int_stats::<u8>(&array, true)?;
585 assert_eq!(stats.distinct_count().unwrap(), 1);
586 Ok(())
587 }
588
589 #[test]
590 fn test_count_distinct_values() -> VortexResult<()> {
591 let array = PrimitiveArray::new((0..128u8).collect::<Buffer<u8>>(), Validity::NonNullable);
592 let stats = typed_int_stats::<u8>(&array, true)?;
593 assert_eq!(stats.distinct_count().unwrap(), 128);
594 Ok(())
595 }
596
597 #[test]
598 fn test_count_distinct_values_nullable() -> VortexResult<()> {
599 let array = PrimitiveArray::new(
600 (0..128u8).collect::<Buffer<u8>>(),
601 Validity::from(BitBuffer::from_iter(
602 iter::repeat_n(vec![true, false], 64).flatten(),
603 )),
604 );
605 let stats = typed_int_stats::<u8>(&array, true)?;
606 assert_eq!(stats.distinct_count().unwrap(), 64);
607 Ok(())
608 }
609
610 #[test]
611 fn test_integer_stats_leading_nulls() {
612 let ints = PrimitiveArray::new(buffer![0, 1, 2], Validity::from_iter([false, true, true]));
613
614 let stats = IntegerStats::generate_opts(
615 &ints,
616 crate::stats::GenerateStatsOptions {
617 count_distinct_values: true,
618 },
619 );
620
621 assert_eq!(stats.value_count, 2);
622 assert_eq!(stats.null_count, 1);
623 assert_eq!(stats.average_run_length, 1);
624 assert_eq!(stats.distinct_count().unwrap(), 2);
625 }
626}