1use std::hash::Hash;
5
6use num_traits::PrimInt;
7use rustc_hash::FxBuildHasher;
8use vortex_array::ToCanonical;
9use vortex_array::arrays::NativeValue;
10use vortex_array::arrays::PrimitiveArray;
11use vortex_array::arrays::PrimitiveVTable;
12use vortex_array::expr::stats::Stat;
13use vortex_buffer::BitBuffer;
14use vortex_dtype::IntegerPType;
15use vortex_dtype::match_each_integer_ptype;
16use vortex_error::VortexError;
17use vortex_error::VortexExpect;
18use vortex_error::VortexUnwrap;
19use vortex_mask::AllOr;
20use vortex_scalar::PValue;
21use vortex_scalar::Scalar;
22use vortex_utils::aliases::hash_map::HashMap;
23
24use crate::CompressorStats;
25use crate::GenerateStatsOptions;
26use crate::rle::RLEStats;
27use crate::sample::sample;
28
29#[derive(Clone, Debug)]
30pub struct TypedStats<T> {
31 pub min: T,
32 pub max: T,
33 pub top_value: T,
34 pub top_count: u32,
35 pub distinct_values: HashMap<NativeValue<T>, u32, FxBuildHasher>,
36}
37
38#[derive(Clone, Debug)]
43pub enum ErasedStats {
44 U8(TypedStats<u8>),
45 U16(TypedStats<u16>),
46 U32(TypedStats<u32>),
47 U64(TypedStats<u64>),
48 I8(TypedStats<i8>),
49 I16(TypedStats<i16>),
50 I32(TypedStats<i32>),
51 I64(TypedStats<i64>),
52}
53
54impl ErasedStats {
55 pub fn min_is_zero(&self) -> bool {
56 match &self {
57 ErasedStats::U8(x) => x.min == 0,
58 ErasedStats::U16(x) => x.min == 0,
59 ErasedStats::U32(x) => x.min == 0,
60 ErasedStats::U64(x) => x.min == 0,
61 ErasedStats::I8(x) => x.min == 0,
62 ErasedStats::I16(x) => x.min == 0,
63 ErasedStats::I32(x) => x.min == 0,
64 ErasedStats::I64(x) => x.min == 0,
65 }
66 }
67
68 pub fn min_is_negative(&self) -> bool {
69 match &self {
70 ErasedStats::U8(_)
71 | ErasedStats::U16(_)
72 | ErasedStats::U32(_)
73 | ErasedStats::U64(_) => false,
74 ErasedStats::I8(x) => x.min < 0,
75 ErasedStats::I16(x) => x.min < 0,
76 ErasedStats::I32(x) => x.min < 0,
77 ErasedStats::I64(x) => x.min < 0,
78 }
79 }
80
81 pub fn max_minus_min(&self) -> u64 {
83 match &self {
84 ErasedStats::U8(x) => (x.max - x.min) as u64,
85 ErasedStats::U16(x) => (x.max - x.min) as u64,
86 ErasedStats::U32(x) => (x.max - x.min) as u64,
87 ErasedStats::U64(x) => x.max - x.min,
88 ErasedStats::I8(x) => (x.max as i16 - x.min as i16) as u64,
89 ErasedStats::I16(x) => (x.max as i32 - x.min as i32) as u64,
90 ErasedStats::I32(x) => (x.max as i64 - x.min as i64) as u64,
91 ErasedStats::I64(x) => u64::try_from(x.max as i128 - x.min as i128)
92 .vortex_expect("max minus min result bigger than u64"),
93 }
94 }
95
96 pub fn top_value_and_count(&self) -> (PValue, u32) {
98 match &self {
99 ErasedStats::U8(x) => (x.top_value.into(), x.top_count),
100 ErasedStats::U16(x) => (x.top_value.into(), x.top_count),
101 ErasedStats::U32(x) => (x.top_value.into(), x.top_count),
102 ErasedStats::U64(x) => (x.top_value.into(), x.top_count),
103 ErasedStats::I8(x) => (x.top_value.into(), x.top_count),
104 ErasedStats::I16(x) => (x.top_value.into(), x.top_count),
105 ErasedStats::I32(x) => (x.top_value.into(), x.top_count),
106 ErasedStats::I64(x) => (x.top_value.into(), x.top_count),
107 }
108 }
109}
110
111macro_rules! impl_from_typed {
112 ($T:ty, $variant:path) => {
113 impl From<TypedStats<$T>> for ErasedStats {
114 fn from(typed: TypedStats<$T>) -> Self {
115 $variant(typed)
116 }
117 }
118 };
119}
120
121impl_from_typed!(u8, ErasedStats::U8);
122impl_from_typed!(u16, ErasedStats::U16);
123impl_from_typed!(u32, ErasedStats::U32);
124impl_from_typed!(u64, ErasedStats::U64);
125impl_from_typed!(i8, ErasedStats::I8);
126impl_from_typed!(i16, ErasedStats::I16);
127impl_from_typed!(i32, ErasedStats::I32);
128impl_from_typed!(i64, ErasedStats::I64);
129
130#[derive(Clone, Debug)]
132pub struct IntegerStats {
133 pub(super) src: PrimitiveArray,
134 pub(super) null_count: u32,
136 pub(super) value_count: u32,
138 pub(super) average_run_length: u32,
139 pub(super) distinct_values_count: u32,
140 pub(crate) typed: ErasedStats,
141}
142
143impl CompressorStats for IntegerStats {
144 type ArrayVTable = PrimitiveVTable;
145
146 fn generate_opts(input: &PrimitiveArray, opts: GenerateStatsOptions) -> Self {
147 match_each_integer_ptype!(input.ptype(), |T| {
148 typed_int_stats::<T>(input, opts.count_distinct_values)
149 })
150 }
151
152 fn source(&self) -> &PrimitiveArray {
153 &self.src
154 }
155
156 fn sample_opts(&self, sample_size: u32, sample_count: u32, opts: GenerateStatsOptions) -> Self {
157 let sampled = sample(self.src.as_ref(), sample_size, sample_count).to_primitive();
158
159 Self::generate_opts(&sampled, opts)
160 }
161}
162
163impl RLEStats for IntegerStats {
164 fn value_count(&self) -> u32 {
165 self.value_count
166 }
167
168 fn average_run_length(&self) -> u32 {
169 self.average_run_length
170 }
171
172 fn source(&self) -> &PrimitiveArray {
173 &self.src
174 }
175}
176
177fn typed_int_stats<T>(array: &PrimitiveArray, count_distinct_values: bool) -> IntegerStats
178where
179 T: IntegerPType + PrimInt + for<'a> TryFrom<&'a Scalar, Error = VortexError>,
180 TypedStats<T>: Into<ErasedStats>,
181 NativeValue<T>: Eq + Hash,
182{
183 if array.is_empty() {
185 return IntegerStats {
186 src: array.clone(),
187 null_count: 0,
188 value_count: 0,
189 average_run_length: 0,
190 distinct_values_count: 0,
191 typed: TypedStats {
192 min: T::max_value(),
193 max: T::min_value(),
194 top_value: T::default(),
195 top_count: 0,
196 distinct_values: HashMap::with_hasher(FxBuildHasher),
197 }
198 .into(),
199 };
200 } else if array.all_invalid() {
201 return IntegerStats {
202 src: array.clone(),
203 null_count: array.len().try_into().vortex_expect("null_count"),
204 value_count: 0,
205 average_run_length: 0,
206 distinct_values_count: 0,
207 typed: TypedStats {
208 min: T::max_value(),
209 max: T::min_value(),
210 top_value: T::default(),
211 top_count: 0,
212 distinct_values: HashMap::with_hasher(FxBuildHasher),
213 }
214 .into(),
215 };
216 }
217
218 let validity = array.validity_mask();
219 let null_count = validity.false_count();
220 let value_count = validity.true_count();
221
222 let head_idx = validity
224 .first()
225 .vortex_expect("All null masks have been handled before");
226 let buffer = array.buffer::<T>();
227 let head = buffer[head_idx];
228
229 let mut loop_state = LoopState {
230 distinct_values: if count_distinct_values {
231 HashMap::with_capacity_and_hasher(array.len() / 2, FxBuildHasher)
232 } else {
233 HashMap::with_hasher(FxBuildHasher)
234 },
235 prev: head,
236 runs: 1,
237 };
238
239 let sliced = buffer.slice(head_idx..array.len());
240 let mut chunks = sliced.as_slice().chunks_exact(64);
241 match validity.bit_buffer() {
242 AllOr::All => {
243 for chunk in &mut chunks {
244 inner_loop_nonnull(
245 chunk.try_into().vortex_unwrap(),
246 count_distinct_values,
247 &mut loop_state,
248 )
249 }
250 let remainder = chunks.remainder();
251 inner_loop_naive(
252 remainder,
253 count_distinct_values,
254 &BitBuffer::new_set(remainder.len()),
255 &mut loop_state,
256 );
257 }
258 AllOr::None => unreachable!("All invalid arrays have been handled before"),
259 AllOr::Some(v) => {
260 let mask = v.slice(head_idx..array.len());
261 let mut offset = 0;
262 for chunk in &mut chunks {
263 let validity = mask.slice(offset..(offset + 64));
264 offset += 64;
265
266 match validity.true_count() {
267 0 => continue,
269 64 => inner_loop_nonnull(
271 chunk.try_into().vortex_unwrap(),
272 count_distinct_values,
273 &mut loop_state,
274 ),
275 _ => inner_loop_nullable(
277 chunk.try_into().vortex_unwrap(),
278 count_distinct_values,
279 &validity,
280 &mut loop_state,
281 ),
282 }
283 }
284 let remainder = chunks.remainder();
286 inner_loop_naive(
287 remainder,
288 count_distinct_values,
289 &mask.slice(offset..(offset + remainder.len())),
290 &mut loop_state,
291 );
292 }
293 }
294
295 let (top_value, top_count) = if count_distinct_values {
296 let (&top_value, &top_count) = loop_state
297 .distinct_values
298 .iter()
299 .max_by_key(|&(_, &count)| count)
300 .vortex_expect("non-empty");
301 (top_value.0, top_count)
302 } else {
303 (T::default(), 0)
304 };
305
306 let runs = loop_state.runs;
307 let distinct_values_count = if count_distinct_values {
308 loop_state.distinct_values.len().try_into().vortex_unwrap()
309 } else {
310 u32::MAX
311 };
312
313 let min = array
314 .statistics()
315 .compute_as::<T>(Stat::Min)
316 .vortex_expect("min should be computed");
317
318 let max = array
319 .statistics()
320 .compute_as::<T>(Stat::Max)
321 .vortex_expect("max should be computed");
322
323 let typed = TypedStats {
324 min,
325 max,
326 distinct_values: loop_state.distinct_values,
327 top_value,
328 top_count,
329 };
330
331 let null_count = null_count
332 .try_into()
333 .vortex_expect("null_count must fit in u32");
334 let value_count = value_count
335 .try_into()
336 .vortex_expect("value_count must fit in u32");
337
338 IntegerStats {
339 src: array.clone(),
340 null_count,
341 value_count,
342 average_run_length: value_count / runs,
343 distinct_values_count,
344 typed: typed.into(),
345 }
346}
347
348struct LoopState<T> {
349 prev: T,
350 runs: u32,
351 distinct_values: HashMap<NativeValue<T>, u32, FxBuildHasher>,
352}
353
354#[inline(always)]
355fn inner_loop_nonnull<T: IntegerPType>(
356 values: &[T; 64],
357 count_distinct_values: bool,
358 state: &mut LoopState<T>,
359) where
360 NativeValue<T>: Eq + Hash,
361{
362 for &value in values {
363 if count_distinct_values {
364 *state.distinct_values.entry(NativeValue(value)).or_insert(0) += 1;
365 }
366
367 if value != state.prev {
368 state.prev = value;
369 state.runs += 1;
370 }
371 }
372}
373
374#[inline(always)]
375fn inner_loop_nullable<T: IntegerPType>(
376 values: &[T; 64],
377 count_distinct_values: bool,
378 is_valid: &BitBuffer,
379 state: &mut LoopState<T>,
380) where
381 NativeValue<T>: Eq + Hash,
382{
383 for (idx, &value) in values.iter().enumerate() {
384 if is_valid.value(idx) {
385 if count_distinct_values {
386 *state.distinct_values.entry(NativeValue(value)).or_insert(0) += 1;
387 }
388
389 if value != state.prev {
390 state.prev = value;
391 state.runs += 1;
392 }
393 }
394 }
395}
396
397#[inline(always)]
398fn inner_loop_naive<T: IntegerPType>(
399 values: &[T],
400 count_distinct_values: bool,
401 is_valid: &BitBuffer,
402 state: &mut LoopState<T>,
403) where
404 NativeValue<T>: Eq + Hash,
405{
406 for (idx, &value) in values.iter().enumerate() {
407 if is_valid.value(idx) {
408 if count_distinct_values {
409 *state.distinct_values.entry(NativeValue(value)).or_insert(0) += 1;
410 }
411
412 if value != state.prev {
413 state.prev = value;
414 state.runs += 1;
415 }
416 }
417 }
418}
419
420#[cfg(test)]
421mod tests {
422 use std::iter;
423
424 use vortex_array::arrays::PrimitiveArray;
425 use vortex_array::validity::Validity;
426 use vortex_buffer::BitBuffer;
427 use vortex_buffer::Buffer;
428 use vortex_buffer::buffer;
429
430 use crate::CompressorStats;
431 use crate::integer::IntegerStats;
432 use crate::integer::stats::typed_int_stats;
433
434 #[test]
435 fn test_naive_count_distinct_values() {
436 let array = PrimitiveArray::new(buffer![217u8, 0], Validity::NonNullable);
437 let stats = typed_int_stats::<u8>(&array, true);
438 assert_eq!(stats.distinct_values_count, 2);
439 }
440
441 #[test]
442 fn test_naive_count_distinct_values_nullable() {
443 let array = PrimitiveArray::new(
444 buffer![217u8, 0],
445 Validity::from(BitBuffer::from(vec![true, false])),
446 );
447 let stats = typed_int_stats::<u8>(&array, true);
448 assert_eq!(stats.distinct_values_count, 1);
449 }
450
451 #[test]
452 fn test_count_distinct_values() {
453 let array = PrimitiveArray::new((0..128u8).collect::<Buffer<u8>>(), Validity::NonNullable);
454 let stats = typed_int_stats::<u8>(&array, true);
455 assert_eq!(stats.distinct_values_count, 128);
456 }
457
458 #[test]
459 fn test_count_distinct_values_nullable() {
460 let array = PrimitiveArray::new(
461 (0..128u8).collect::<Buffer<u8>>(),
462 Validity::from(BitBuffer::from_iter(
463 iter::repeat_n(vec![true, false], 64).flatten(),
464 )),
465 );
466 let stats = typed_int_stats::<u8>(&array, true);
467 assert_eq!(stats.distinct_values_count, 64);
468 }
469
470 #[test]
471 fn test_integer_stats_leading_nulls() {
472 let ints = PrimitiveArray::new(buffer![0, 1, 2], Validity::from_iter([false, true, true]));
473
474 let stats = IntegerStats::generate(&ints);
475
476 assert_eq!(stats.value_count, 2);
477 assert_eq!(stats.null_count, 1);
478 assert_eq!(stats.average_run_length, 1);
479 assert_eq!(stats.distinct_values_count, 2);
480 }
481}