1use std::hash::Hash;
5
6use num_traits::PrimInt;
7use rustc_hash::FxBuildHasher;
8use vortex_array::ToCanonical;
9use vortex_array::arrays::NativeValue;
10use vortex_array::arrays::PrimitiveArray;
11use vortex_array::arrays::PrimitiveVTable;
12use vortex_array::dtype::IntegerPType;
13use vortex_array::expr::stats::Stat;
14use vortex_array::match_each_integer_ptype;
15use vortex_array::scalar::PValue;
16use vortex_array::scalar::Scalar;
17use vortex_buffer::BitBuffer;
18use vortex_error::VortexError;
19use vortex_error::VortexExpect;
20use vortex_error::VortexResult;
21use vortex_mask::AllOr;
22use vortex_utils::aliases::hash_map::HashMap;
23
24use crate::CompressorStats;
25use crate::GenerateStatsOptions;
26use crate::compressor::rle::RLEStats;
27use crate::sample::sample;
28
29#[derive(Clone, Debug)]
30pub struct TypedStats<T> {
31 pub min: T,
32 pub max: T,
33 pub top_value: T,
34 pub top_count: u32,
35 pub distinct_values: HashMap<NativeValue<T>, u32, FxBuildHasher>,
36}
37
38#[derive(Clone, Debug)]
43pub enum ErasedStats {
44 U8(TypedStats<u8>),
45 U16(TypedStats<u16>),
46 U32(TypedStats<u32>),
47 U64(TypedStats<u64>),
48 I8(TypedStats<i8>),
49 I16(TypedStats<i16>),
50 I32(TypedStats<i32>),
51 I64(TypedStats<i64>),
52}
53
54impl ErasedStats {
55 pub fn min_is_zero(&self) -> bool {
56 match &self {
57 ErasedStats::U8(x) => x.min == 0,
58 ErasedStats::U16(x) => x.min == 0,
59 ErasedStats::U32(x) => x.min == 0,
60 ErasedStats::U64(x) => x.min == 0,
61 ErasedStats::I8(x) => x.min == 0,
62 ErasedStats::I16(x) => x.min == 0,
63 ErasedStats::I32(x) => x.min == 0,
64 ErasedStats::I64(x) => x.min == 0,
65 }
66 }
67
68 pub fn min_is_negative(&self) -> bool {
69 match &self {
70 ErasedStats::U8(_)
71 | ErasedStats::U16(_)
72 | ErasedStats::U32(_)
73 | ErasedStats::U64(_) => false,
74 ErasedStats::I8(x) => x.min < 0,
75 ErasedStats::I16(x) => x.min < 0,
76 ErasedStats::I32(x) => x.min < 0,
77 ErasedStats::I64(x) => x.min < 0,
78 }
79 }
80
81 pub fn max_minus_min(&self) -> u64 {
83 match &self {
84 ErasedStats::U8(x) => (x.max - x.min) as u64,
85 ErasedStats::U16(x) => (x.max - x.min) as u64,
86 ErasedStats::U32(x) => (x.max - x.min) as u64,
87 ErasedStats::U64(x) => x.max - x.min,
88 ErasedStats::I8(x) => (x.max as i16 - x.min as i16) as u64,
89 ErasedStats::I16(x) => (x.max as i32 - x.min as i32) as u64,
90 ErasedStats::I32(x) => (x.max as i64 - x.min as i64) as u64,
91 ErasedStats::I64(x) => u64::try_from(x.max as i128 - x.min as i128)
92 .vortex_expect("max minus min result bigger than u64"),
93 }
94 }
95
96 pub fn max_ilog2(&self) -> Option<u32> {
105 match &self {
106 ErasedStats::U8(x) => x.max.checked_ilog2(),
107 ErasedStats::U16(x) => x.max.checked_ilog2(),
108 ErasedStats::U32(x) => x.max.checked_ilog2(),
109 ErasedStats::U64(x) => x.max.checked_ilog2(),
110 ErasedStats::I8(x) => (x.max as u8).checked_ilog2(),
112 ErasedStats::I16(x) => (x.max as u16).checked_ilog2(),
113 ErasedStats::I32(x) => (x.max as u32).checked_ilog2(),
114 ErasedStats::I64(x) => (x.max as u64).checked_ilog2(),
115 }
116 }
117
118 pub fn top_value_and_count(&self) -> (PValue, u32) {
120 match &self {
121 ErasedStats::U8(x) => (x.top_value.into(), x.top_count),
122 ErasedStats::U16(x) => (x.top_value.into(), x.top_count),
123 ErasedStats::U32(x) => (x.top_value.into(), x.top_count),
124 ErasedStats::U64(x) => (x.top_value.into(), x.top_count),
125 ErasedStats::I8(x) => (x.top_value.into(), x.top_count),
126 ErasedStats::I16(x) => (x.top_value.into(), x.top_count),
127 ErasedStats::I32(x) => (x.top_value.into(), x.top_count),
128 ErasedStats::I64(x) => (x.top_value.into(), x.top_count),
129 }
130 }
131}
132
133macro_rules! impl_from_typed {
134 ($T:ty, $variant:path) => {
135 impl From<TypedStats<$T>> for ErasedStats {
136 fn from(typed: TypedStats<$T>) -> Self {
137 $variant(typed)
138 }
139 }
140 };
141}
142
143impl_from_typed!(u8, ErasedStats::U8);
144impl_from_typed!(u16, ErasedStats::U16);
145impl_from_typed!(u32, ErasedStats::U32);
146impl_from_typed!(u64, ErasedStats::U64);
147impl_from_typed!(i8, ErasedStats::I8);
148impl_from_typed!(i16, ErasedStats::I16);
149impl_from_typed!(i32, ErasedStats::I32);
150impl_from_typed!(i64, ErasedStats::I64);
151
152#[derive(Clone, Debug)]
154pub struct IntegerStats {
155 pub(super) src: PrimitiveArray,
156 pub(super) null_count: u32,
158 pub(super) value_count: u32,
160 pub(super) average_run_length: u32,
161 pub(super) distinct_values_count: u32,
162 pub(crate) typed: ErasedStats,
163}
164
165impl IntegerStats {
166 fn generate_opts_fallible(
167 input: &PrimitiveArray,
168 opts: GenerateStatsOptions,
169 ) -> VortexResult<Self> {
170 match_each_integer_ptype!(input.ptype(), |T| {
171 typed_int_stats::<T>(input, opts.count_distinct_values)
172 })
173 }
174}
175
176impl CompressorStats for IntegerStats {
177 type ArrayVTable = PrimitiveVTable;
178
179 fn generate_opts(input: &PrimitiveArray, opts: GenerateStatsOptions) -> Self {
180 Self::generate_opts_fallible(input, opts)
181 .vortex_expect("IntegerStats::generate_opts should not fail")
182 }
183
184 fn source(&self) -> &PrimitiveArray {
185 &self.src
186 }
187
188 fn sample_opts(&self, sample_size: u32, sample_count: u32, opts: GenerateStatsOptions) -> Self {
189 let sampled = sample(self.src.as_ref(), sample_size, sample_count).to_primitive();
190
191 Self::generate_opts(&sampled, opts)
192 }
193}
194
195impl RLEStats for IntegerStats {
196 fn value_count(&self) -> u32 {
197 self.value_count
198 }
199
200 fn average_run_length(&self) -> u32 {
201 self.average_run_length
202 }
203
204 fn source(&self) -> &PrimitiveArray {
205 &self.src
206 }
207}
208
209fn typed_int_stats<T>(
210 array: &PrimitiveArray,
211 count_distinct_values: bool,
212) -> VortexResult<IntegerStats>
213where
214 T: IntegerPType + PrimInt + for<'a> TryFrom<&'a Scalar, Error = VortexError>,
215 TypedStats<T>: Into<ErasedStats>,
216 NativeValue<T>: Eq + Hash,
217{
218 if array.is_empty() {
220 return Ok(IntegerStats {
221 src: array.clone(),
222 null_count: 0,
223 value_count: 0,
224 average_run_length: 0,
225 distinct_values_count: 0,
226 typed: TypedStats {
227 min: T::max_value(),
228 max: T::min_value(),
229 top_value: T::default(),
230 top_count: 0,
231 distinct_values: HashMap::with_hasher(FxBuildHasher),
232 }
233 .into(),
234 });
235 } else if array.all_invalid()? {
236 return Ok(IntegerStats {
237 src: array.clone(),
238 null_count: u32::try_from(array.len())?,
239 value_count: 0,
240 average_run_length: 0,
241 distinct_values_count: 0,
242 typed: TypedStats {
243 min: T::max_value(),
244 max: T::min_value(),
245 top_value: T::default(),
246 top_count: 0,
247 distinct_values: HashMap::with_hasher(FxBuildHasher),
248 }
249 .into(),
250 });
251 }
252
253 let validity = array.validity_mask()?;
254 let null_count = validity.false_count();
255 let value_count = validity.true_count();
256
257 let head_idx = validity
259 .first()
260 .vortex_expect("All null masks have been handled before");
261 let buffer = array.to_buffer::<T>();
262 let head = buffer[head_idx];
263
264 let mut loop_state = LoopState {
265 distinct_values: if count_distinct_values {
266 HashMap::with_capacity_and_hasher(array.len() / 2, FxBuildHasher)
267 } else {
268 HashMap::with_hasher(FxBuildHasher)
269 },
270 prev: head,
271 runs: 1,
272 };
273
274 let sliced = buffer.slice(head_idx..array.len());
275 let mut chunks = sliced.as_slice().chunks_exact(64);
276 match validity.bit_buffer() {
277 AllOr::All => {
278 for chunk in &mut chunks {
279 inner_loop_nonnull(
280 chunk.try_into().ok().vortex_expect("chunk size must be 64"),
281 count_distinct_values,
282 &mut loop_state,
283 )
284 }
285 let remainder = chunks.remainder();
286 inner_loop_naive(
287 remainder,
288 count_distinct_values,
289 &BitBuffer::new_set(remainder.len()),
290 &mut loop_state,
291 );
292 }
293 AllOr::None => unreachable!("All invalid arrays have been handled before"),
294 AllOr::Some(v) => {
295 let mask = v.slice(head_idx..array.len());
296 let mut offset = 0;
297 for chunk in &mut chunks {
298 let validity = mask.slice(offset..(offset + 64));
299 offset += 64;
300
301 match validity.true_count() {
302 0 => continue,
304 64 => inner_loop_nonnull(
306 chunk.try_into().ok().vortex_expect("chunk size must be 64"),
307 count_distinct_values,
308 &mut loop_state,
309 ),
310 _ => inner_loop_nullable(
312 chunk.try_into().ok().vortex_expect("chunk size must be 64"),
313 count_distinct_values,
314 &validity,
315 &mut loop_state,
316 ),
317 }
318 }
319 let remainder = chunks.remainder();
321 inner_loop_naive(
322 remainder,
323 count_distinct_values,
324 &mask.slice(offset..(offset + remainder.len())),
325 &mut loop_state,
326 );
327 }
328 }
329
330 let (top_value, top_count) = if count_distinct_values {
331 let (&top_value, &top_count) = loop_state
332 .distinct_values
333 .iter()
334 .max_by_key(|&(_, &count)| count)
335 .vortex_expect("non-empty");
336 (top_value.0, top_count)
337 } else {
338 (T::default(), 0)
339 };
340
341 let runs = loop_state.runs;
342 let distinct_values_count = if count_distinct_values {
343 u32::try_from(loop_state.distinct_values.len())?
344 } else {
345 u32::MAX
346 };
347
348 let min = array
349 .statistics()
350 .compute_as::<T>(Stat::Min)
351 .vortex_expect("min should be computed");
352
353 let max = array
354 .statistics()
355 .compute_as::<T>(Stat::Max)
356 .vortex_expect("max should be computed");
357
358 let typed = TypedStats {
359 min,
360 max,
361 distinct_values: loop_state.distinct_values,
362 top_value,
363 top_count,
364 };
365
366 let null_count = u32::try_from(null_count)?;
367 let value_count = u32::try_from(value_count)?;
368
369 Ok(IntegerStats {
370 src: array.clone(),
371 null_count,
372 value_count,
373 average_run_length: value_count / runs,
374 distinct_values_count,
375 typed: typed.into(),
376 })
377}
378
379struct LoopState<T> {
380 prev: T,
381 runs: u32,
382 distinct_values: HashMap<NativeValue<T>, u32, FxBuildHasher>,
383}
384
385#[inline(always)]
386fn inner_loop_nonnull<T: IntegerPType>(
387 values: &[T; 64],
388 count_distinct_values: bool,
389 state: &mut LoopState<T>,
390) where
391 NativeValue<T>: Eq + Hash,
392{
393 for &value in values {
394 if count_distinct_values {
395 *state.distinct_values.entry(NativeValue(value)).or_insert(0) += 1;
396 }
397
398 if value != state.prev {
399 state.prev = value;
400 state.runs += 1;
401 }
402 }
403}
404
405#[inline(always)]
406fn inner_loop_nullable<T: IntegerPType>(
407 values: &[T; 64],
408 count_distinct_values: bool,
409 is_valid: &BitBuffer,
410 state: &mut LoopState<T>,
411) where
412 NativeValue<T>: Eq + Hash,
413{
414 for (idx, &value) in values.iter().enumerate() {
415 if is_valid.value(idx) {
416 if count_distinct_values {
417 *state.distinct_values.entry(NativeValue(value)).or_insert(0) += 1;
418 }
419
420 if value != state.prev {
421 state.prev = value;
422 state.runs += 1;
423 }
424 }
425 }
426}
427
428#[inline(always)]
429fn inner_loop_naive<T: IntegerPType>(
430 values: &[T],
431 count_distinct_values: bool,
432 is_valid: &BitBuffer,
433 state: &mut LoopState<T>,
434) where
435 NativeValue<T>: Eq + Hash,
436{
437 for (idx, &value) in values.iter().enumerate() {
438 if is_valid.value(idx) {
439 if count_distinct_values {
440 *state.distinct_values.entry(NativeValue(value)).or_insert(0) += 1;
441 }
442
443 if value != state.prev {
444 state.prev = value;
445 state.runs += 1;
446 }
447 }
448 }
449}
450
451#[cfg(test)]
452mod tests {
453 use std::iter;
454
455 use vortex_array::arrays::PrimitiveArray;
456 use vortex_array::validity::Validity;
457 use vortex_buffer::BitBuffer;
458 use vortex_buffer::Buffer;
459 use vortex_buffer::buffer;
460 use vortex_error::VortexResult;
461
462 use super::IntegerStats;
463 use super::typed_int_stats;
464 use crate::CompressorStats;
465
466 #[test]
467 fn test_naive_count_distinct_values() -> VortexResult<()> {
468 let array = PrimitiveArray::new(buffer![217u8, 0], Validity::NonNullable);
469 let stats = typed_int_stats::<u8>(&array, true)?;
470 assert_eq!(stats.distinct_values_count, 2);
471 Ok(())
472 }
473
474 #[test]
475 fn test_naive_count_distinct_values_nullable() -> VortexResult<()> {
476 let array = PrimitiveArray::new(
477 buffer![217u8, 0],
478 Validity::from(BitBuffer::from(vec![true, false])),
479 );
480 let stats = typed_int_stats::<u8>(&array, true)?;
481 assert_eq!(stats.distinct_values_count, 1);
482 Ok(())
483 }
484
485 #[test]
486 fn test_count_distinct_values() -> VortexResult<()> {
487 let array = PrimitiveArray::new((0..128u8).collect::<Buffer<u8>>(), Validity::NonNullable);
488 let stats = typed_int_stats::<u8>(&array, true)?;
489 assert_eq!(stats.distinct_values_count, 128);
490 Ok(())
491 }
492
493 #[test]
494 fn test_count_distinct_values_nullable() -> VortexResult<()> {
495 let array = PrimitiveArray::new(
496 (0..128u8).collect::<Buffer<u8>>(),
497 Validity::from(BitBuffer::from_iter(
498 iter::repeat_n(vec![true, false], 64).flatten(),
499 )),
500 );
501 let stats = typed_int_stats::<u8>(&array, true)?;
502 assert_eq!(stats.distinct_values_count, 64);
503 Ok(())
504 }
505
506 #[test]
507 fn test_integer_stats_leading_nulls() {
508 let ints = PrimitiveArray::new(buffer![0, 1, 2], Validity::from_iter([false, true, true]));
509
510 let stats = IntegerStats::generate(&ints);
511
512 assert_eq!(stats.value_count, 2);
513 assert_eq!(stats.null_count, 1);
514 assert_eq!(stats.average_run_length, 1);
515 assert_eq!(stats.distinct_values_count, 2);
516 }
517}