1use std::hash::Hash;
5
6use num_traits::PrimInt;
7use rustc_hash::FxBuildHasher;
8use vortex_array::IntoArray;
9use vortex_array::ToCanonical;
10use vortex_array::arrays::Primitive;
11use vortex_array::arrays::PrimitiveArray;
12use vortex_array::arrays::primitive::NativeValue;
13use vortex_array::dtype::IntegerPType;
14use vortex_array::expr::stats::Stat;
15use vortex_array::match_each_integer_ptype;
16use vortex_array::scalar::PValue;
17use vortex_array::scalar::Scalar;
18use vortex_buffer::BitBuffer;
19use vortex_error::VortexError;
20use vortex_error::VortexExpect;
21use vortex_error::VortexResult;
22use vortex_mask::AllOr;
23use vortex_utils::aliases::hash_map::HashMap;
24
25use crate::CompressorStats;
26use crate::GenerateStatsOptions;
27use crate::compressor::rle::RLEStats;
28use crate::sample::sample;
29
30#[derive(Clone, Debug)]
31pub struct TypedStats<T> {
32 pub min: T,
33 pub max: T,
34 pub top_value: T,
35 pub top_count: u32,
36 pub distinct_values: HashMap<NativeValue<T>, u32, FxBuildHasher>,
37}
38
39#[derive(Clone, Debug)]
44pub enum ErasedStats {
45 U8(TypedStats<u8>),
46 U16(TypedStats<u16>),
47 U32(TypedStats<u32>),
48 U64(TypedStats<u64>),
49 I8(TypedStats<i8>),
50 I16(TypedStats<i16>),
51 I32(TypedStats<i32>),
52 I64(TypedStats<i64>),
53}
54
55impl ErasedStats {
56 pub fn min_is_zero(&self) -> bool {
57 match &self {
58 ErasedStats::U8(x) => x.min == 0,
59 ErasedStats::U16(x) => x.min == 0,
60 ErasedStats::U32(x) => x.min == 0,
61 ErasedStats::U64(x) => x.min == 0,
62 ErasedStats::I8(x) => x.min == 0,
63 ErasedStats::I16(x) => x.min == 0,
64 ErasedStats::I32(x) => x.min == 0,
65 ErasedStats::I64(x) => x.min == 0,
66 }
67 }
68
69 pub fn min_is_negative(&self) -> bool {
70 match &self {
71 ErasedStats::U8(_)
72 | ErasedStats::U16(_)
73 | ErasedStats::U32(_)
74 | ErasedStats::U64(_) => false,
75 ErasedStats::I8(x) => x.min < 0,
76 ErasedStats::I16(x) => x.min < 0,
77 ErasedStats::I32(x) => x.min < 0,
78 ErasedStats::I64(x) => x.min < 0,
79 }
80 }
81
82 pub fn max_minus_min(&self) -> u64 {
84 match &self {
85 ErasedStats::U8(x) => (x.max - x.min) as u64,
86 ErasedStats::U16(x) => (x.max - x.min) as u64,
87 ErasedStats::U32(x) => (x.max - x.min) as u64,
88 ErasedStats::U64(x) => x.max - x.min,
89 ErasedStats::I8(x) => (x.max as i16 - x.min as i16) as u64,
90 ErasedStats::I16(x) => (x.max as i32 - x.min as i32) as u64,
91 ErasedStats::I32(x) => (x.max as i64 - x.min as i64) as u64,
92 ErasedStats::I64(x) => u64::try_from(x.max as i128 - x.min as i128)
93 .vortex_expect("max minus min result bigger than u64"),
94 }
95 }
96
97 pub fn max_ilog2(&self) -> Option<u32> {
106 match &self {
107 ErasedStats::U8(x) => x.max.checked_ilog2(),
108 ErasedStats::U16(x) => x.max.checked_ilog2(),
109 ErasedStats::U32(x) => x.max.checked_ilog2(),
110 ErasedStats::U64(x) => x.max.checked_ilog2(),
111 ErasedStats::I8(x) => (x.max as u8).checked_ilog2(),
113 ErasedStats::I16(x) => (x.max as u16).checked_ilog2(),
114 ErasedStats::I32(x) => (x.max as u32).checked_ilog2(),
115 ErasedStats::I64(x) => (x.max as u64).checked_ilog2(),
116 }
117 }
118
119 pub fn top_value_and_count(&self) -> (PValue, u32) {
121 match &self {
122 ErasedStats::U8(x) => (x.top_value.into(), x.top_count),
123 ErasedStats::U16(x) => (x.top_value.into(), x.top_count),
124 ErasedStats::U32(x) => (x.top_value.into(), x.top_count),
125 ErasedStats::U64(x) => (x.top_value.into(), x.top_count),
126 ErasedStats::I8(x) => (x.top_value.into(), x.top_count),
127 ErasedStats::I16(x) => (x.top_value.into(), x.top_count),
128 ErasedStats::I32(x) => (x.top_value.into(), x.top_count),
129 ErasedStats::I64(x) => (x.top_value.into(), x.top_count),
130 }
131 }
132}
133
134macro_rules! impl_from_typed {
135 ($T:ty, $variant:path) => {
136 impl From<TypedStats<$T>> for ErasedStats {
137 fn from(typed: TypedStats<$T>) -> Self {
138 $variant(typed)
139 }
140 }
141 };
142}
143
144impl_from_typed!(u8, ErasedStats::U8);
145impl_from_typed!(u16, ErasedStats::U16);
146impl_from_typed!(u32, ErasedStats::U32);
147impl_from_typed!(u64, ErasedStats::U64);
148impl_from_typed!(i8, ErasedStats::I8);
149impl_from_typed!(i16, ErasedStats::I16);
150impl_from_typed!(i32, ErasedStats::I32);
151impl_from_typed!(i64, ErasedStats::I64);
152
153#[derive(Clone, Debug)]
155pub struct IntegerStats {
156 pub(super) src: PrimitiveArray,
157 pub(super) null_count: u32,
159 pub(super) value_count: u32,
161 pub(super) average_run_length: u32,
162 pub(super) distinct_values_count: u32,
163 pub(crate) typed: ErasedStats,
164}
165
166impl IntegerStats {
167 fn generate_opts_fallible(
168 input: &PrimitiveArray,
169 opts: GenerateStatsOptions,
170 ) -> VortexResult<Self> {
171 match_each_integer_ptype!(input.ptype(), |T| {
172 typed_int_stats::<T>(input, opts.count_distinct_values)
173 })
174 }
175}
176
177impl CompressorStats for IntegerStats {
178 type ArrayVTable = Primitive;
179
180 fn generate_opts(input: &PrimitiveArray, opts: GenerateStatsOptions) -> Self {
181 Self::generate_opts_fallible(input, opts)
182 .vortex_expect("IntegerStats::generate_opts should not fail")
183 }
184
185 fn source(&self) -> &PrimitiveArray {
186 &self.src
187 }
188
189 fn sample_opts(&self, sample_size: u32, sample_count: u32, opts: GenerateStatsOptions) -> Self {
190 let sampled =
191 sample(&self.src.clone().into_array(), sample_size, sample_count).to_primitive();
192
193 Self::generate_opts(&sampled, opts)
194 }
195}
196
197impl RLEStats for IntegerStats {
198 fn value_count(&self) -> u32 {
199 self.value_count
200 }
201
202 fn average_run_length(&self) -> u32 {
203 self.average_run_length
204 }
205
206 fn source(&self) -> &PrimitiveArray {
207 &self.src
208 }
209}
210
211fn typed_int_stats<T>(
212 array: &PrimitiveArray,
213 count_distinct_values: bool,
214) -> VortexResult<IntegerStats>
215where
216 T: IntegerPType + PrimInt + for<'a> TryFrom<&'a Scalar, Error = VortexError>,
217 TypedStats<T>: Into<ErasedStats>,
218 NativeValue<T>: Eq + Hash,
219{
220 if array.is_empty() {
222 return Ok(IntegerStats {
223 src: array.clone(),
224 null_count: 0,
225 value_count: 0,
226 average_run_length: 0,
227 distinct_values_count: 0,
228 typed: TypedStats {
229 min: T::max_value(),
230 max: T::min_value(),
231 top_value: T::default(),
232 top_count: 0,
233 distinct_values: HashMap::with_hasher(FxBuildHasher),
234 }
235 .into(),
236 });
237 } else if array.all_invalid()? {
238 return Ok(IntegerStats {
239 src: array.clone(),
240 null_count: u32::try_from(array.len())?,
241 value_count: 0,
242 average_run_length: 0,
243 distinct_values_count: 0,
244 typed: TypedStats {
245 min: T::max_value(),
246 max: T::min_value(),
247 top_value: T::default(),
248 top_count: 0,
249 distinct_values: HashMap::with_hasher(FxBuildHasher),
250 }
251 .into(),
252 });
253 }
254
255 let validity = array.validity_mask()?;
256 let null_count = validity.false_count();
257 let value_count = validity.true_count();
258
259 let head_idx = validity
261 .first()
262 .vortex_expect("All null masks have been handled before");
263 let buffer = array.to_buffer::<T>();
264 let head = buffer[head_idx];
265
266 let mut loop_state = LoopState {
267 distinct_values: if count_distinct_values {
268 HashMap::with_capacity_and_hasher(array.len() / 2, FxBuildHasher)
269 } else {
270 HashMap::with_hasher(FxBuildHasher)
271 },
272 prev: head,
273 runs: 1,
274 };
275
276 let sliced = buffer.slice(head_idx..array.len());
277 let mut chunks = sliced.as_slice().chunks_exact(64);
278 match validity.bit_buffer() {
279 AllOr::All => {
280 for chunk in &mut chunks {
281 inner_loop_nonnull(
282 chunk.try_into().ok().vortex_expect("chunk size must be 64"),
283 count_distinct_values,
284 &mut loop_state,
285 )
286 }
287 let remainder = chunks.remainder();
288 inner_loop_naive(
289 remainder,
290 count_distinct_values,
291 &BitBuffer::new_set(remainder.len()),
292 &mut loop_state,
293 );
294 }
295 AllOr::None => unreachable!("All invalid arrays have been handled before"),
296 AllOr::Some(v) => {
297 let mask = v.slice(head_idx..array.len());
298 let mut offset = 0;
299 for chunk in &mut chunks {
300 let validity = mask.slice(offset..(offset + 64));
301 offset += 64;
302
303 match validity.true_count() {
304 0 => continue,
306 64 => inner_loop_nonnull(
308 chunk.try_into().ok().vortex_expect("chunk size must be 64"),
309 count_distinct_values,
310 &mut loop_state,
311 ),
312 _ => inner_loop_nullable(
314 chunk.try_into().ok().vortex_expect("chunk size must be 64"),
315 count_distinct_values,
316 &validity,
317 &mut loop_state,
318 ),
319 }
320 }
321 let remainder = chunks.remainder();
323 inner_loop_naive(
324 remainder,
325 count_distinct_values,
326 &mask.slice(offset..(offset + remainder.len())),
327 &mut loop_state,
328 );
329 }
330 }
331
332 let (top_value, top_count) = if count_distinct_values {
333 let (&top_value, &top_count) = loop_state
334 .distinct_values
335 .iter()
336 .max_by_key(|&(_, &count)| count)
337 .vortex_expect("non-empty");
338 (top_value.0, top_count)
339 } else {
340 (T::default(), 0)
341 };
342
343 let runs = loop_state.runs;
344 let distinct_values_count = if count_distinct_values {
345 u32::try_from(loop_state.distinct_values.len())?
346 } else {
347 u32::MAX
348 };
349
350 let min = array
351 .statistics()
352 .compute_as::<T>(Stat::Min)
353 .vortex_expect("min should be computed");
354
355 let max = array
356 .statistics()
357 .compute_as::<T>(Stat::Max)
358 .vortex_expect("max should be computed");
359
360 let typed = TypedStats {
361 min,
362 max,
363 distinct_values: loop_state.distinct_values,
364 top_value,
365 top_count,
366 };
367
368 let null_count = u32::try_from(null_count)?;
369 let value_count = u32::try_from(value_count)?;
370
371 Ok(IntegerStats {
372 src: array.clone(),
373 null_count,
374 value_count,
375 average_run_length: value_count / runs,
376 distinct_values_count,
377 typed: typed.into(),
378 })
379}
380
381struct LoopState<T> {
382 prev: T,
383 runs: u32,
384 distinct_values: HashMap<NativeValue<T>, u32, FxBuildHasher>,
385}
386
387#[inline(always)]
388fn inner_loop_nonnull<T: IntegerPType>(
389 values: &[T; 64],
390 count_distinct_values: bool,
391 state: &mut LoopState<T>,
392) where
393 NativeValue<T>: Eq + Hash,
394{
395 for &value in values {
396 if count_distinct_values {
397 *state.distinct_values.entry(NativeValue(value)).or_insert(0) += 1;
398 }
399
400 if value != state.prev {
401 state.prev = value;
402 state.runs += 1;
403 }
404 }
405}
406
407#[inline(always)]
408fn inner_loop_nullable<T: IntegerPType>(
409 values: &[T; 64],
410 count_distinct_values: bool,
411 is_valid: &BitBuffer,
412 state: &mut LoopState<T>,
413) where
414 NativeValue<T>: Eq + Hash,
415{
416 for (idx, &value) in values.iter().enumerate() {
417 if is_valid.value(idx) {
418 if count_distinct_values {
419 *state.distinct_values.entry(NativeValue(value)).or_insert(0) += 1;
420 }
421
422 if value != state.prev {
423 state.prev = value;
424 state.runs += 1;
425 }
426 }
427 }
428}
429
430#[inline(always)]
431fn inner_loop_naive<T: IntegerPType>(
432 values: &[T],
433 count_distinct_values: bool,
434 is_valid: &BitBuffer,
435 state: &mut LoopState<T>,
436) where
437 NativeValue<T>: Eq + Hash,
438{
439 for (idx, &value) in values.iter().enumerate() {
440 if is_valid.value(idx) {
441 if count_distinct_values {
442 *state.distinct_values.entry(NativeValue(value)).or_insert(0) += 1;
443 }
444
445 if value != state.prev {
446 state.prev = value;
447 state.runs += 1;
448 }
449 }
450 }
451}
452
453#[cfg(test)]
454mod tests {
455 use std::iter;
456
457 use vortex_array::arrays::PrimitiveArray;
458 use vortex_array::validity::Validity;
459 use vortex_buffer::BitBuffer;
460 use vortex_buffer::Buffer;
461 use vortex_buffer::buffer;
462 use vortex_error::VortexResult;
463
464 use super::IntegerStats;
465 use super::typed_int_stats;
466 use crate::CompressorStats;
467
468 #[test]
469 fn test_naive_count_distinct_values() -> VortexResult<()> {
470 let array = PrimitiveArray::new(buffer![217u8, 0], Validity::NonNullable);
471 let stats = typed_int_stats::<u8>(&array, true)?;
472 assert_eq!(stats.distinct_values_count, 2);
473 Ok(())
474 }
475
476 #[test]
477 fn test_naive_count_distinct_values_nullable() -> VortexResult<()> {
478 let array = PrimitiveArray::new(
479 buffer![217u8, 0],
480 Validity::from(BitBuffer::from(vec![true, false])),
481 );
482 let stats = typed_int_stats::<u8>(&array, true)?;
483 assert_eq!(stats.distinct_values_count, 1);
484 Ok(())
485 }
486
487 #[test]
488 fn test_count_distinct_values() -> VortexResult<()> {
489 let array = PrimitiveArray::new((0..128u8).collect::<Buffer<u8>>(), Validity::NonNullable);
490 let stats = typed_int_stats::<u8>(&array, true)?;
491 assert_eq!(stats.distinct_values_count, 128);
492 Ok(())
493 }
494
495 #[test]
496 fn test_count_distinct_values_nullable() -> VortexResult<()> {
497 let array = PrimitiveArray::new(
498 (0..128u8).collect::<Buffer<u8>>(),
499 Validity::from(BitBuffer::from_iter(
500 iter::repeat_n(vec![true, false], 64).flatten(),
501 )),
502 );
503 let stats = typed_int_stats::<u8>(&array, true)?;
504 assert_eq!(stats.distinct_values_count, 64);
505 Ok(())
506 }
507
508 #[test]
509 fn test_integer_stats_leading_nulls() {
510 let ints = PrimitiveArray::new(buffer![0, 1, 2], Validity::from_iter([false, true, true]));
511
512 let stats = IntegerStats::generate(&ints);
513
514 assert_eq!(stats.value_count, 2);
515 assert_eq!(stats.null_count, 1);
516 assert_eq!(stats.average_run_length, 1);
517 assert_eq!(stats.distinct_values_count, 2);
518 }
519}