1use std::hash::Hash;
7
8use num_traits::PrimInt;
9use rustc_hash::FxBuildHasher;
10use vortex_array::ExecutionCtx;
11use vortex_array::arrays::PrimitiveArray;
12use vortex_array::arrays::primitive::NativeValue;
13use vortex_array::dtype::IntegerPType;
14use vortex_array::expr::stats::Stat;
15use vortex_array::match_each_integer_ptype;
16use vortex_array::scalar::PValue;
17use vortex_array::scalar::Scalar;
18use vortex_buffer::BitBuffer;
19use vortex_error::VortexError;
20use vortex_error::VortexExpect;
21use vortex_error::VortexResult;
22use vortex_mask::AllOr;
23use vortex_utils::aliases::hash_map::HashMap;
24
25use super::GenerateStatsOptions;
26
27#[derive(Debug, Clone)]
29pub struct DistinctInfo<T> {
30 distinct_values: HashMap<NativeValue<T>, u32, FxBuildHasher>,
32 distinct_count: u32,
34 most_frequent_value: T,
36 top_frequency: u32,
38}
39
40impl<T> DistinctInfo<T> {
41 pub fn distinct_values(&self) -> &HashMap<NativeValue<T>, u32, FxBuildHasher> {
43 &self.distinct_values
44 }
45}
46
47#[derive(Debug, Clone)]
49pub struct TypedStats<T> {
50 min: T,
52 max: T,
54 distinct: Option<DistinctInfo<T>>,
56}
57
58impl<T> TypedStats<T> {
59 pub fn distinct(&self) -> Option<&DistinctInfo<T>> {
61 self.distinct.as_ref()
62 }
63}
64
65impl<T> TypedStats<T> {
66 fn distinct_count(&self) -> Option<u32> {
68 Some(self.distinct.as_ref()?.distinct_count)
69 }
70
71 fn most_frequent_value_and_count(&self) -> Option<(&T, u32)> {
73 let distinct = self.distinct.as_ref()?;
74 Some((&distinct.most_frequent_value, distinct.top_frequency))
75 }
76}
77
78#[derive(Clone, Debug)]
83pub enum ErasedStats {
84 U8(TypedStats<u8>),
86 U16(TypedStats<u16>),
88 U32(TypedStats<u32>),
90 U64(TypedStats<u64>),
92 I8(TypedStats<i8>),
94 I16(TypedStats<i16>),
96 I32(TypedStats<i32>),
98 I64(TypedStats<i64>),
100}
101
102impl ErasedStats {
103 pub fn min_is_zero(&self) -> bool {
105 match &self {
106 ErasedStats::U8(x) => x.min == 0,
107 ErasedStats::U16(x) => x.min == 0,
108 ErasedStats::U32(x) => x.min == 0,
109 ErasedStats::U64(x) => x.min == 0,
110 ErasedStats::I8(x) => x.min == 0,
111 ErasedStats::I16(x) => x.min == 0,
112 ErasedStats::I32(x) => x.min == 0,
113 ErasedStats::I64(x) => x.min == 0,
114 }
115 }
116
117 pub fn min_is_negative(&self) -> bool {
119 match &self {
120 ErasedStats::U8(_)
121 | ErasedStats::U16(_)
122 | ErasedStats::U32(_)
123 | ErasedStats::U64(_) => false,
124 ErasedStats::I8(x) => x.min < 0,
125 ErasedStats::I16(x) => x.min < 0,
126 ErasedStats::I32(x) => x.min < 0,
127 ErasedStats::I64(x) => x.min < 0,
128 }
129 }
130
131 pub fn max_minus_min(&self) -> u64 {
133 match &self {
134 ErasedStats::U8(x) => (x.max - x.min) as u64,
135 ErasedStats::U16(x) => (x.max - x.min) as u64,
136 ErasedStats::U32(x) => (x.max - x.min) as u64,
137 ErasedStats::U64(x) => x.max - x.min,
138 ErasedStats::I8(x) => (x.max as i16 - x.min as i16) as u64,
139 ErasedStats::I16(x) => (x.max as i32 - x.min as i32) as u64,
140 ErasedStats::I32(x) => (x.max as i64 - x.min as i64) as u64,
141 ErasedStats::I64(x) => u64::try_from(x.max as i128 - x.min as i128)
142 .vortex_expect("max minus min result bigger than u64"),
143 }
144 }
145
146 pub fn max_ilog2(&self) -> Option<u32> {
155 match &self {
156 ErasedStats::U8(x) => x.max.checked_ilog2(),
157 ErasedStats::U16(x) => x.max.checked_ilog2(),
158 ErasedStats::U32(x) => x.max.checked_ilog2(),
159 ErasedStats::U64(x) => x.max.checked_ilog2(),
160 ErasedStats::I8(x) => (x.max as u8).checked_ilog2(),
162 ErasedStats::I16(x) => (x.max as u16).checked_ilog2(),
163 ErasedStats::I32(x) => (x.max as u32).checked_ilog2(),
164 ErasedStats::I64(x) => (x.max as u64).checked_ilog2(),
165 }
166 }
167
168 pub fn distinct_count(&self) -> Option<u32> {
170 match &self {
171 ErasedStats::U8(x) => x.distinct_count(),
172 ErasedStats::U16(x) => x.distinct_count(),
173 ErasedStats::U32(x) => x.distinct_count(),
174 ErasedStats::U64(x) => x.distinct_count(),
175 ErasedStats::I8(x) => x.distinct_count(),
176 ErasedStats::I16(x) => x.distinct_count(),
177 ErasedStats::I32(x) => x.distinct_count(),
178 ErasedStats::I64(x) => x.distinct_count(),
179 }
180 }
181
182 pub fn most_frequent_value_and_count(&self) -> Option<(PValue, u32)> {
184 match &self {
185 ErasedStats::U8(x) => {
186 let (top_value, count) = x.most_frequent_value_and_count()?;
187 Some(((*top_value).into(), count))
188 }
189 ErasedStats::U16(x) => {
190 let (top_value, count) = x.most_frequent_value_and_count()?;
191 Some(((*top_value).into(), count))
192 }
193 ErasedStats::U32(x) => {
194 let (top_value, count) = x.most_frequent_value_and_count()?;
195 Some(((*top_value).into(), count))
196 }
197 ErasedStats::U64(x) => {
198 let (top_value, count) = x.most_frequent_value_and_count()?;
199 Some(((*top_value).into(), count))
200 }
201 ErasedStats::I8(x) => {
202 let (top_value, count) = x.most_frequent_value_and_count()?;
203 Some(((*top_value).into(), count))
204 }
205 ErasedStats::I16(x) => {
206 let (top_value, count) = x.most_frequent_value_and_count()?;
207 Some(((*top_value).into(), count))
208 }
209 ErasedStats::I32(x) => {
210 let (top_value, count) = x.most_frequent_value_and_count()?;
211 Some(((*top_value).into(), count))
212 }
213 ErasedStats::I64(x) => {
214 let (top_value, count) = x.most_frequent_value_and_count()?;
215 Some(((*top_value).into(), count))
216 }
217 }
218 }
219}
220
221macro_rules! impl_from_typed {
223 ($T:ty, $variant:path) => {
224 impl From<TypedStats<$T>> for ErasedStats {
225 fn from(typed: TypedStats<$T>) -> Self {
226 $variant(typed)
227 }
228 }
229 };
230}
231
232impl_from_typed!(u8, ErasedStats::U8);
233impl_from_typed!(u16, ErasedStats::U16);
234impl_from_typed!(u32, ErasedStats::U32);
235impl_from_typed!(u64, ErasedStats::U64);
236impl_from_typed!(i8, ErasedStats::I8);
237impl_from_typed!(i16, ErasedStats::I16);
238impl_from_typed!(i32, ErasedStats::I32);
239impl_from_typed!(i64, ErasedStats::I64);
240
241#[derive(Clone, Debug)]
243pub struct IntegerStats {
244 null_count: u32,
246 value_count: u32,
248 average_run_length: u32,
250 erased: ErasedStats,
252}
253
254impl IntegerStats {
255 fn generate_opts_fallible(
257 input: &PrimitiveArray,
258 opts: GenerateStatsOptions,
259 ctx: &mut ExecutionCtx,
260 ) -> VortexResult<Self> {
261 match_each_integer_ptype!(input.ptype(), |T| {
262 typed_int_stats::<T>(input, opts.count_distinct_values, ctx)
263 })
264 }
265
266 pub fn distinct_count(&self) -> Option<u32> {
268 self.erased.distinct_count()
269 }
270
271 pub fn most_frequent_value_and_count(&self) -> Option<(PValue, u32)> {
273 self.erased.most_frequent_value_and_count()
274 }
275}
276
277impl IntegerStats {
278 pub fn generate(input: &PrimitiveArray, ctx: &mut ExecutionCtx) -> Self {
280 Self::generate_opts(input, GenerateStatsOptions::default(), ctx)
281 }
282
283 pub fn generate_opts(
285 input: &PrimitiveArray,
286 opts: GenerateStatsOptions,
287 ctx: &mut ExecutionCtx,
288 ) -> Self {
289 Self::generate_opts_fallible(input, opts, ctx)
290 .vortex_expect("IntegerStats::generate_opts should not fail")
291 }
292
293 pub fn null_count(&self) -> u32 {
295 self.null_count
296 }
297
298 pub fn value_count(&self) -> u32 {
300 self.value_count
301 }
302
303 pub fn average_run_length(&self) -> u32 {
305 self.average_run_length
306 }
307
308 pub fn erased(&self) -> &ErasedStats {
310 &self.erased
311 }
312}
313
314fn typed_int_stats<T>(
316 array: &PrimitiveArray,
317 count_distinct_values: bool,
318 ctx: &mut ExecutionCtx,
319) -> VortexResult<IntegerStats>
320where
321 T: IntegerPType + PrimInt + for<'a> TryFrom<&'a Scalar, Error = VortexError>,
322 TypedStats<T>: Into<ErasedStats>,
323 NativeValue<T>: Eq + Hash,
324{
325 if array.is_empty() {
327 return Ok(IntegerStats {
328 null_count: 0,
329 value_count: 0,
330 average_run_length: 0,
331 erased: TypedStats {
332 min: T::max_value(),
333 max: T::min_value(),
334 distinct: None,
335 }
336 .into(),
337 });
338 }
339
340 if array.all_invalid(ctx)? {
341 return Ok(IntegerStats {
342 null_count: u32::try_from(array.len())?,
343 value_count: 0,
344 average_run_length: 0,
345 erased: TypedStats {
346 min: T::max_value(),
347 max: T::min_value(),
348 distinct: Some(DistinctInfo {
349 distinct_values: HashMap::with_capacity_and_hasher(0, FxBuildHasher),
350 distinct_count: 0,
351 most_frequent_value: T::zero(),
352 top_frequency: 0,
353 }),
354 }
355 .into(),
356 });
357 }
358
359 let validity = array
360 .as_ref()
361 .validity()?
362 .execute_mask(array.as_ref().len(), ctx)?;
363 let null_count = validity.false_count();
364 let value_count = validity.true_count();
365
366 let head_idx = validity
368 .first()
369 .vortex_expect("All null masks have been handled before");
370 let buffer = array.to_buffer::<T>();
371 let head = buffer[head_idx];
372
373 let mut loop_state = LoopState {
374 distinct_values: if count_distinct_values {
375 HashMap::with_capacity_and_hasher(array.len() / 2, FxBuildHasher)
376 } else {
377 HashMap::with_hasher(FxBuildHasher)
378 },
379 prev: head,
380 runs: 1,
381 };
382
383 let sliced = buffer.slice(head_idx..array.len());
384 let mut chunks = sliced.as_slice().chunks_exact(64);
385 match validity.bit_buffer() {
386 AllOr::All => {
387 for chunk in &mut chunks {
388 inner_loop_nonnull(
389 chunk.try_into().ok().vortex_expect("chunk size must be 64"),
390 count_distinct_values,
391 &mut loop_state,
392 )
393 }
394 let remainder = chunks.remainder();
395 inner_loop_naive(
396 remainder,
397 count_distinct_values,
398 &BitBuffer::new_set(remainder.len()),
399 &mut loop_state,
400 );
401 }
402 AllOr::None => unreachable!("All invalid arrays have been handled before"),
403 AllOr::Some(v) => {
404 let mask = v.slice(head_idx..array.len());
405 let mut offset = 0;
406 for chunk in &mut chunks {
407 let validity = mask.slice(offset..(offset + 64));
408 offset += 64;
409
410 match validity.true_count() {
411 0 => continue,
413 64 => inner_loop_nonnull(
415 chunk.try_into().ok().vortex_expect("chunk size must be 64"),
416 count_distinct_values,
417 &mut loop_state,
418 ),
419 _ => inner_loop_nullable(
421 chunk.try_into().ok().vortex_expect("chunk size must be 64"),
422 count_distinct_values,
423 &validity,
424 &mut loop_state,
425 ),
426 }
427 }
428 let remainder = chunks.remainder();
430 inner_loop_naive(
431 remainder,
432 count_distinct_values,
433 &mask.slice(offset..(offset + remainder.len())),
434 &mut loop_state,
435 );
436 }
437 }
438
439 let runs = loop_state.runs;
440
441 let array_ref = array.as_ref();
442 let min = array_ref
443 .statistics()
444 .compute_as::<T>(Stat::Min, ctx)
445 .vortex_expect("min should be computed");
446
447 let max = array_ref
448 .statistics()
449 .compute_as::<T>(Stat::Max, ctx)
450 .vortex_expect("max should be computed");
451
452 let distinct = count_distinct_values.then(|| {
453 let (&top_value, &top_count) = loop_state
454 .distinct_values
455 .iter()
456 .max_by_key(|&(_, &count)| count)
457 .vortex_expect("we know this is non-empty");
458
459 DistinctInfo {
460 distinct_count: u32::try_from(loop_state.distinct_values.len())
461 .vortex_expect("there are more than `u32::MAX` distinct values"),
462 most_frequent_value: top_value.0,
463 top_frequency: top_count,
464 distinct_values: loop_state.distinct_values,
465 }
466 });
467
468 let typed = TypedStats { min, max, distinct };
469
470 let null_count = u32::try_from(null_count)?;
471 let value_count = u32::try_from(value_count)?;
472
473 Ok(IntegerStats {
474 null_count,
475 value_count,
476 average_run_length: value_count / runs,
477 erased: typed.into(),
478 })
479}
480
481struct LoopState<T> {
483 prev: T,
485 runs: u32,
487 distinct_values: HashMap<NativeValue<T>, u32, FxBuildHasher>,
489}
490
491#[inline(always)]
493fn inner_loop_nonnull<T: IntegerPType>(
494 values: &[T; 64],
495 count_distinct_values: bool,
496 state: &mut LoopState<T>,
497) where
498 NativeValue<T>: Eq + Hash,
499{
500 for &value in values {
501 if count_distinct_values {
502 *state.distinct_values.entry(NativeValue(value)).or_insert(0) += 1;
503 }
504
505 if value != state.prev {
506 state.prev = value;
507 state.runs += 1;
508 }
509 }
510}
511
512#[inline(always)]
514fn inner_loop_nullable<T: IntegerPType>(
515 values: &[T; 64],
516 count_distinct_values: bool,
517 is_valid: &BitBuffer,
518 state: &mut LoopState<T>,
519) where
520 NativeValue<T>: Eq + Hash,
521{
522 for (idx, &value) in values.iter().enumerate() {
523 if is_valid.value(idx) {
524 if count_distinct_values {
525 *state.distinct_values.entry(NativeValue(value)).or_insert(0) += 1;
526 }
527
528 if value != state.prev {
529 state.prev = value;
530 state.runs += 1;
531 }
532 }
533 }
534}
535
536#[inline(always)]
538fn inner_loop_naive<T: IntegerPType>(
539 values: &[T],
540 count_distinct_values: bool,
541 is_valid: &BitBuffer,
542 state: &mut LoopState<T>,
543) where
544 NativeValue<T>: Eq + Hash,
545{
546 for (idx, &value) in values.iter().enumerate() {
547 if is_valid.value(idx) {
548 if count_distinct_values {
549 *state.distinct_values.entry(NativeValue(value)).or_insert(0) += 1;
550 }
551
552 if value != state.prev {
553 state.prev = value;
554 state.runs += 1;
555 }
556 }
557 }
558}
559
560#[cfg(test)]
561mod tests {
562 use std::iter;
563
564 use vortex_array::LEGACY_SESSION;
565 use vortex_array::VortexSessionExecute;
566 use vortex_array::arrays::PrimitiveArray;
567 use vortex_array::validity::Validity;
568 use vortex_buffer::BitBuffer;
569 use vortex_buffer::Buffer;
570 use vortex_buffer::buffer;
571 use vortex_error::VortexResult;
572
573 use super::IntegerStats;
574 use super::typed_int_stats;
575
576 #[test]
577 fn test_naive_count_distinct_values() -> VortexResult<()> {
578 let mut ctx = LEGACY_SESSION.create_execution_ctx();
579 let array = PrimitiveArray::new(buffer![217u8, 0], Validity::NonNullable);
580 let stats = typed_int_stats::<u8>(&array, true, &mut ctx)?;
581 assert_eq!(stats.distinct_count().unwrap(), 2);
582 Ok(())
583 }
584
585 #[test]
586 fn test_naive_count_distinct_values_nullable() -> VortexResult<()> {
587 let mut ctx = LEGACY_SESSION.create_execution_ctx();
588 let array = PrimitiveArray::new(
589 buffer![217u8, 0],
590 Validity::from(BitBuffer::from(vec![true, false])),
591 );
592 let stats = typed_int_stats::<u8>(&array, true, &mut ctx)?;
593 assert_eq!(stats.distinct_count().unwrap(), 1);
594 Ok(())
595 }
596
597 #[test]
598 fn test_count_distinct_values() -> VortexResult<()> {
599 let mut ctx = LEGACY_SESSION.create_execution_ctx();
600 let array = PrimitiveArray::new((0..128u8).collect::<Buffer<u8>>(), Validity::NonNullable);
601 let stats = typed_int_stats::<u8>(&array, true, &mut ctx)?;
602 assert_eq!(stats.distinct_count().unwrap(), 128);
603 Ok(())
604 }
605
606 #[test]
607 fn test_count_distinct_values_nullable() -> VortexResult<()> {
608 let mut ctx = LEGACY_SESSION.create_execution_ctx();
609 let array = PrimitiveArray::new(
610 (0..128u8).collect::<Buffer<u8>>(),
611 Validity::from(BitBuffer::from_iter(
612 iter::repeat_n(vec![true, false], 64).flatten(),
613 )),
614 );
615 let stats = typed_int_stats::<u8>(&array, true, &mut ctx)?;
616 assert_eq!(stats.distinct_count().unwrap(), 64);
617 Ok(())
618 }
619
620 #[test]
621 fn test_integer_stats_leading_nulls() {
622 let mut ctx = LEGACY_SESSION.create_execution_ctx();
623 let ints = PrimitiveArray::new(buffer![0, 1, 2], Validity::from_iter([false, true, true]));
624
625 let stats = IntegerStats::generate_opts(
626 &ints,
627 crate::stats::GenerateStatsOptions {
628 count_distinct_values: true,
629 },
630 &mut ctx,
631 );
632
633 assert_eq!(stats.value_count, 2);
634 assert_eq!(stats.null_count, 1);
635 assert_eq!(stats.average_run_length, 1);
636 assert_eq!(stats.distinct_count().unwrap(), 2);
637 }
638}