1use std::hash::Hash;
7
8use num_traits::PrimInt;
9use rustc_hash::FxBuildHasher;
10use vortex_array::arrays::PrimitiveArray;
11use vortex_array::arrays::primitive::NativeValue;
12use vortex_array::dtype::IntegerPType;
13use vortex_array::expr::stats::Stat;
14use vortex_array::match_each_integer_ptype;
15use vortex_array::scalar::PValue;
16use vortex_array::scalar::Scalar;
17use vortex_buffer::BitBuffer;
18use vortex_error::VortexError;
19use vortex_error::VortexExpect;
20use vortex_error::VortexResult;
21use vortex_mask::AllOr;
22use vortex_utils::aliases::hash_map::HashMap;
23
24use super::GenerateStatsOptions;
25
26#[derive(Debug, Clone)]
28pub struct DistinctInfo<T> {
29 distinct_values: HashMap<NativeValue<T>, u32, FxBuildHasher>,
31 distinct_count: u32,
33 most_frequent_value: T,
35 top_frequency: u32,
37}
38
39impl<T> DistinctInfo<T> {
40 pub fn distinct_values(&self) -> &HashMap<NativeValue<T>, u32, FxBuildHasher> {
42 &self.distinct_values
43 }
44}
45
46#[derive(Debug, Clone)]
48pub struct TypedStats<T> {
49 min: T,
51 max: T,
53 distinct: Option<DistinctInfo<T>>,
55}
56
57impl<T> TypedStats<T> {
58 pub fn distinct(&self) -> Option<&DistinctInfo<T>> {
60 self.distinct.as_ref()
61 }
62}
63
64impl<T> TypedStats<T> {
65 fn distinct_count(&self) -> Option<u32> {
67 Some(self.distinct.as_ref()?.distinct_count)
68 }
69
70 fn most_frequent_value_and_count(&self) -> Option<(&T, u32)> {
72 let distinct = self.distinct.as_ref()?;
73 Some((&distinct.most_frequent_value, distinct.top_frequency))
74 }
75}
76
77#[derive(Clone, Debug)]
82pub enum ErasedStats {
83 U8(TypedStats<u8>),
85 U16(TypedStats<u16>),
87 U32(TypedStats<u32>),
89 U64(TypedStats<u64>),
91 I8(TypedStats<i8>),
93 I16(TypedStats<i16>),
95 I32(TypedStats<i32>),
97 I64(TypedStats<i64>),
99}
100
101impl ErasedStats {
102 pub fn min_is_zero(&self) -> bool {
104 match &self {
105 ErasedStats::U8(x) => x.min == 0,
106 ErasedStats::U16(x) => x.min == 0,
107 ErasedStats::U32(x) => x.min == 0,
108 ErasedStats::U64(x) => x.min == 0,
109 ErasedStats::I8(x) => x.min == 0,
110 ErasedStats::I16(x) => x.min == 0,
111 ErasedStats::I32(x) => x.min == 0,
112 ErasedStats::I64(x) => x.min == 0,
113 }
114 }
115
116 pub fn min_is_negative(&self) -> bool {
118 match &self {
119 ErasedStats::U8(_)
120 | ErasedStats::U16(_)
121 | ErasedStats::U32(_)
122 | ErasedStats::U64(_) => false,
123 ErasedStats::I8(x) => x.min < 0,
124 ErasedStats::I16(x) => x.min < 0,
125 ErasedStats::I32(x) => x.min < 0,
126 ErasedStats::I64(x) => x.min < 0,
127 }
128 }
129
130 pub fn max_minus_min(&self) -> u64 {
132 match &self {
133 ErasedStats::U8(x) => (x.max - x.min) as u64,
134 ErasedStats::U16(x) => (x.max - x.min) as u64,
135 ErasedStats::U32(x) => (x.max - x.min) as u64,
136 ErasedStats::U64(x) => x.max - x.min,
137 ErasedStats::I8(x) => (x.max as i16 - x.min as i16) as u64,
138 ErasedStats::I16(x) => (x.max as i32 - x.min as i32) as u64,
139 ErasedStats::I32(x) => (x.max as i64 - x.min as i64) as u64,
140 ErasedStats::I64(x) => u64::try_from(x.max as i128 - x.min as i128)
141 .vortex_expect("max minus min result bigger than u64"),
142 }
143 }
144
145 pub fn max_ilog2(&self) -> Option<u32> {
154 match &self {
155 ErasedStats::U8(x) => x.max.checked_ilog2(),
156 ErasedStats::U16(x) => x.max.checked_ilog2(),
157 ErasedStats::U32(x) => x.max.checked_ilog2(),
158 ErasedStats::U64(x) => x.max.checked_ilog2(),
159 ErasedStats::I8(x) => (x.max as u8).checked_ilog2(),
161 ErasedStats::I16(x) => (x.max as u16).checked_ilog2(),
162 ErasedStats::I32(x) => (x.max as u32).checked_ilog2(),
163 ErasedStats::I64(x) => (x.max as u64).checked_ilog2(),
164 }
165 }
166
167 pub fn distinct_count(&self) -> Option<u32> {
169 match &self {
170 ErasedStats::U8(x) => x.distinct_count(),
171 ErasedStats::U16(x) => x.distinct_count(),
172 ErasedStats::U32(x) => x.distinct_count(),
173 ErasedStats::U64(x) => x.distinct_count(),
174 ErasedStats::I8(x) => x.distinct_count(),
175 ErasedStats::I16(x) => x.distinct_count(),
176 ErasedStats::I32(x) => x.distinct_count(),
177 ErasedStats::I64(x) => x.distinct_count(),
178 }
179 }
180
181 pub fn most_frequent_value_and_count(&self) -> Option<(PValue, u32)> {
183 match &self {
184 ErasedStats::U8(x) => {
185 let (top_value, count) = x.most_frequent_value_and_count()?;
186 Some(((*top_value).into(), count))
187 }
188 ErasedStats::U16(x) => {
189 let (top_value, count) = x.most_frequent_value_and_count()?;
190 Some(((*top_value).into(), count))
191 }
192 ErasedStats::U32(x) => {
193 let (top_value, count) = x.most_frequent_value_and_count()?;
194 Some(((*top_value).into(), count))
195 }
196 ErasedStats::U64(x) => {
197 let (top_value, count) = x.most_frequent_value_and_count()?;
198 Some(((*top_value).into(), count))
199 }
200 ErasedStats::I8(x) => {
201 let (top_value, count) = x.most_frequent_value_and_count()?;
202 Some(((*top_value).into(), count))
203 }
204 ErasedStats::I16(x) => {
205 let (top_value, count) = x.most_frequent_value_and_count()?;
206 Some(((*top_value).into(), count))
207 }
208 ErasedStats::I32(x) => {
209 let (top_value, count) = x.most_frequent_value_and_count()?;
210 Some(((*top_value).into(), count))
211 }
212 ErasedStats::I64(x) => {
213 let (top_value, count) = x.most_frequent_value_and_count()?;
214 Some(((*top_value).into(), count))
215 }
216 }
217 }
218}
219
220macro_rules! impl_from_typed {
222 ($T:ty, $variant:path) => {
223 impl From<TypedStats<$T>> for ErasedStats {
224 fn from(typed: TypedStats<$T>) -> Self {
225 $variant(typed)
226 }
227 }
228 };
229}
230
231impl_from_typed!(u8, ErasedStats::U8);
232impl_from_typed!(u16, ErasedStats::U16);
233impl_from_typed!(u32, ErasedStats::U32);
234impl_from_typed!(u64, ErasedStats::U64);
235impl_from_typed!(i8, ErasedStats::I8);
236impl_from_typed!(i16, ErasedStats::I16);
237impl_from_typed!(i32, ErasedStats::I32);
238impl_from_typed!(i64, ErasedStats::I64);
239
240#[derive(Clone, Debug)]
242pub struct IntegerStats {
243 null_count: u32,
245 value_count: u32,
247 average_run_length: u32,
249 erased: ErasedStats,
251}
252
253impl IntegerStats {
254 fn generate_opts_fallible(
256 input: &PrimitiveArray,
257 opts: GenerateStatsOptions,
258 ) -> VortexResult<Self> {
259 match_each_integer_ptype!(input.ptype(), |T| {
260 typed_int_stats::<T>(input, opts.count_distinct_values)
261 })
262 }
263
264 pub fn distinct_count(&self) -> Option<u32> {
266 self.erased.distinct_count()
267 }
268
269 pub fn most_frequent_value_and_count(&self) -> Option<(PValue, u32)> {
271 self.erased.most_frequent_value_and_count()
272 }
273}
274
275impl IntegerStats {
276 pub fn generate(input: &PrimitiveArray) -> Self {
278 Self::generate_opts(input, GenerateStatsOptions::default())
279 }
280
281 pub fn generate_opts(input: &PrimitiveArray, opts: GenerateStatsOptions) -> Self {
283 Self::generate_opts_fallible(input, opts)
284 .vortex_expect("IntegerStats::generate_opts should not fail")
285 }
286
287 pub fn null_count(&self) -> u32 {
289 self.null_count
290 }
291
292 pub fn value_count(&self) -> u32 {
294 self.value_count
295 }
296
297 pub fn average_run_length(&self) -> u32 {
299 self.average_run_length
300 }
301
302 pub fn erased(&self) -> &ErasedStats {
304 &self.erased
305 }
306}
307
308fn typed_int_stats<T>(
310 array: &PrimitiveArray,
311 count_distinct_values: bool,
312) -> VortexResult<IntegerStats>
313where
314 T: IntegerPType + PrimInt + for<'a> TryFrom<&'a Scalar, Error = VortexError>,
315 TypedStats<T>: Into<ErasedStats>,
316 NativeValue<T>: Eq + Hash,
317{
318 if array.is_empty() {
320 return Ok(IntegerStats {
321 null_count: 0,
322 value_count: 0,
323 average_run_length: 0,
324 erased: TypedStats {
325 min: T::max_value(),
326 max: T::min_value(),
327 distinct: None,
328 }
329 .into(),
330 });
331 }
332
333 if array.all_invalid()? {
334 return Ok(IntegerStats {
335 null_count: u32::try_from(array.len())?,
336 value_count: 0,
337 average_run_length: 0,
338 erased: TypedStats {
339 min: T::max_value(),
340 max: T::min_value(),
341 distinct: None,
342 }
343 .into(),
344 });
345 }
346
347 let validity = array.validity_mask()?;
348 let null_count = validity.false_count();
349 let value_count = validity.true_count();
350
351 let head_idx = validity
353 .first()
354 .vortex_expect("All null masks have been handled before");
355 let buffer = array.to_buffer::<T>();
356 let head = buffer[head_idx];
357
358 let mut loop_state = LoopState {
359 distinct_values: if count_distinct_values {
360 HashMap::with_capacity_and_hasher(array.len() / 2, FxBuildHasher)
361 } else {
362 HashMap::with_hasher(FxBuildHasher)
363 },
364 prev: head,
365 runs: 1,
366 };
367
368 let sliced = buffer.slice(head_idx..array.len());
369 let mut chunks = sliced.as_slice().chunks_exact(64);
370 match validity.bit_buffer() {
371 AllOr::All => {
372 for chunk in &mut chunks {
373 inner_loop_nonnull(
374 chunk.try_into().ok().vortex_expect("chunk size must be 64"),
375 count_distinct_values,
376 &mut loop_state,
377 )
378 }
379 let remainder = chunks.remainder();
380 inner_loop_naive(
381 remainder,
382 count_distinct_values,
383 &BitBuffer::new_set(remainder.len()),
384 &mut loop_state,
385 );
386 }
387 AllOr::None => unreachable!("All invalid arrays have been handled before"),
388 AllOr::Some(v) => {
389 let mask = v.slice(head_idx..array.len());
390 let mut offset = 0;
391 for chunk in &mut chunks {
392 let validity = mask.slice(offset..(offset + 64));
393 offset += 64;
394
395 match validity.true_count() {
396 0 => continue,
398 64 => inner_loop_nonnull(
400 chunk.try_into().ok().vortex_expect("chunk size must be 64"),
401 count_distinct_values,
402 &mut loop_state,
403 ),
404 _ => inner_loop_nullable(
406 chunk.try_into().ok().vortex_expect("chunk size must be 64"),
407 count_distinct_values,
408 &validity,
409 &mut loop_state,
410 ),
411 }
412 }
413 let remainder = chunks.remainder();
415 inner_loop_naive(
416 remainder,
417 count_distinct_values,
418 &mask.slice(offset..(offset + remainder.len())),
419 &mut loop_state,
420 );
421 }
422 }
423
424 let runs = loop_state.runs;
425
426 let array_ref = array.as_ref();
427 let min = array_ref
428 .statistics()
429 .compute_as::<T>(Stat::Min)
430 .vortex_expect("min should be computed");
431
432 let max = array_ref
433 .statistics()
434 .compute_as::<T>(Stat::Max)
435 .vortex_expect("max should be computed");
436
437 let distinct = count_distinct_values.then(|| {
438 let (&top_value, &top_count) = loop_state
439 .distinct_values
440 .iter()
441 .max_by_key(|&(_, &count)| count)
442 .vortex_expect("we know this is non-empty");
443
444 DistinctInfo {
445 distinct_count: u32::try_from(loop_state.distinct_values.len())
446 .vortex_expect("there are more than `u32::MAX` distinct values"),
447 most_frequent_value: top_value.0,
448 top_frequency: top_count,
449 distinct_values: loop_state.distinct_values,
450 }
451 });
452
453 let typed = TypedStats { min, max, distinct };
454
455 let null_count = u32::try_from(null_count)?;
456 let value_count = u32::try_from(value_count)?;
457
458 Ok(IntegerStats {
459 null_count,
460 value_count,
461 average_run_length: value_count / runs,
462 erased: typed.into(),
463 })
464}
465
466struct LoopState<T> {
468 prev: T,
470 runs: u32,
472 distinct_values: HashMap<NativeValue<T>, u32, FxBuildHasher>,
474}
475
476#[inline(always)]
478fn inner_loop_nonnull<T: IntegerPType>(
479 values: &[T; 64],
480 count_distinct_values: bool,
481 state: &mut LoopState<T>,
482) where
483 NativeValue<T>: Eq + Hash,
484{
485 for &value in values {
486 if count_distinct_values {
487 *state.distinct_values.entry(NativeValue(value)).or_insert(0) += 1;
488 }
489
490 if value != state.prev {
491 state.prev = value;
492 state.runs += 1;
493 }
494 }
495}
496
497#[inline(always)]
499fn inner_loop_nullable<T: IntegerPType>(
500 values: &[T; 64],
501 count_distinct_values: bool,
502 is_valid: &BitBuffer,
503 state: &mut LoopState<T>,
504) where
505 NativeValue<T>: Eq + Hash,
506{
507 for (idx, &value) in values.iter().enumerate() {
508 if is_valid.value(idx) {
509 if count_distinct_values {
510 *state.distinct_values.entry(NativeValue(value)).or_insert(0) += 1;
511 }
512
513 if value != state.prev {
514 state.prev = value;
515 state.runs += 1;
516 }
517 }
518 }
519}
520
521#[inline(always)]
523fn inner_loop_naive<T: IntegerPType>(
524 values: &[T],
525 count_distinct_values: bool,
526 is_valid: &BitBuffer,
527 state: &mut LoopState<T>,
528) where
529 NativeValue<T>: Eq + Hash,
530{
531 for (idx, &value) in values.iter().enumerate() {
532 if is_valid.value(idx) {
533 if count_distinct_values {
534 *state.distinct_values.entry(NativeValue(value)).or_insert(0) += 1;
535 }
536
537 if value != state.prev {
538 state.prev = value;
539 state.runs += 1;
540 }
541 }
542 }
543}
544
545#[cfg(test)]
546mod tests {
547 use std::iter;
548
549 use vortex_array::arrays::PrimitiveArray;
550 use vortex_array::validity::Validity;
551 use vortex_buffer::BitBuffer;
552 use vortex_buffer::Buffer;
553 use vortex_buffer::buffer;
554 use vortex_error::VortexResult;
555
556 use super::IntegerStats;
557 use super::typed_int_stats;
558
559 #[test]
560 fn test_naive_count_distinct_values() -> VortexResult<()> {
561 let array = PrimitiveArray::new(buffer![217u8, 0], Validity::NonNullable);
562 let stats = typed_int_stats::<u8>(&array, true)?;
563 assert_eq!(stats.distinct_count().unwrap(), 2);
564 Ok(())
565 }
566
567 #[test]
568 fn test_naive_count_distinct_values_nullable() -> VortexResult<()> {
569 let array = PrimitiveArray::new(
570 buffer![217u8, 0],
571 Validity::from(BitBuffer::from(vec![true, false])),
572 );
573 let stats = typed_int_stats::<u8>(&array, true)?;
574 assert_eq!(stats.distinct_count().unwrap(), 1);
575 Ok(())
576 }
577
578 #[test]
579 fn test_count_distinct_values() -> VortexResult<()> {
580 let array = PrimitiveArray::new((0..128u8).collect::<Buffer<u8>>(), Validity::NonNullable);
581 let stats = typed_int_stats::<u8>(&array, true)?;
582 assert_eq!(stats.distinct_count().unwrap(), 128);
583 Ok(())
584 }
585
586 #[test]
587 fn test_count_distinct_values_nullable() -> VortexResult<()> {
588 let array = PrimitiveArray::new(
589 (0..128u8).collect::<Buffer<u8>>(),
590 Validity::from(BitBuffer::from_iter(
591 iter::repeat_n(vec![true, false], 64).flatten(),
592 )),
593 );
594 let stats = typed_int_stats::<u8>(&array, true)?;
595 assert_eq!(stats.distinct_count().unwrap(), 64);
596 Ok(())
597 }
598
599 #[test]
600 fn test_integer_stats_leading_nulls() {
601 let ints = PrimitiveArray::new(buffer![0, 1, 2], Validity::from_iter([false, true, true]));
602
603 let stats = IntegerStats::generate_opts(
604 &ints,
605 crate::stats::GenerateStatsOptions {
606 count_distinct_values: true,
607 },
608 );
609
610 assert_eq!(stats.value_count, 2);
611 assert_eq!(stats.null_count, 1);
612 assert_eq!(stats.average_run_length, 1);
613 assert_eq!(stats.distinct_count().unwrap(), 2);
614 }
615}