1use std::hash::Hash;
5
6use arrow_buffer::BooleanBuffer;
7use num_traits::PrimInt;
8use rustc_hash::FxBuildHasher;
9use vortex_array::ToCanonical;
10use vortex_array::arrays::{NativeValue, PrimitiveArray, PrimitiveVTable};
11use vortex_array::stats::Stat;
12use vortex_dtype::{NativePType, match_each_integer_ptype};
13use vortex_error::{VortexError, VortexExpect, VortexUnwrap};
14use vortex_mask::AllOr;
15use vortex_scalar::{PValue, Scalar};
16use vortex_utils::aliases::hash_map::HashMap;
17
18use crate::sample::sample;
19use crate::{CompressorStats, GenerateStatsOptions};
20
21#[derive(Clone, Debug)]
22pub struct TypedStats<T> {
23 pub min: T,
24 pub max: T,
25 pub top_value: T,
26 pub top_count: u32,
27 pub distinct_values: HashMap<NativeValue<T>, u32, FxBuildHasher>,
28}
29
30#[derive(Clone, Debug)]
35pub enum ErasedStats {
36 U8(TypedStats<u8>),
37 U16(TypedStats<u16>),
38 U32(TypedStats<u32>),
39 U64(TypedStats<u64>),
40 I8(TypedStats<i8>),
41 I16(TypedStats<i16>),
42 I32(TypedStats<i32>),
43 I64(TypedStats<i64>),
44}
45
46impl ErasedStats {
47 pub fn min_is_zero(&self) -> bool {
48 match &self {
49 ErasedStats::U8(x) => x.min == 0,
50 ErasedStats::U16(x) => x.min == 0,
51 ErasedStats::U32(x) => x.min == 0,
52 ErasedStats::U64(x) => x.min == 0,
53 ErasedStats::I8(x) => x.min == 0,
54 ErasedStats::I16(x) => x.min == 0,
55 ErasedStats::I32(x) => x.min == 0,
56 ErasedStats::I64(x) => x.min == 0,
57 }
58 }
59
60 pub fn min_is_negative(&self) -> bool {
61 match &self {
62 ErasedStats::U8(_)
63 | ErasedStats::U16(_)
64 | ErasedStats::U32(_)
65 | ErasedStats::U64(_) => false,
66 ErasedStats::I8(x) => x.min < 0,
67 ErasedStats::I16(x) => x.min < 0,
68 ErasedStats::I32(x) => x.min < 0,
69 ErasedStats::I64(x) => x.min < 0,
70 }
71 }
72
73 pub fn max_minus_min(&self) -> u64 {
75 match &self {
76 ErasedStats::U8(x) => (x.max - x.min) as u64,
77 ErasedStats::U16(x) => (x.max - x.min) as u64,
78 ErasedStats::U32(x) => (x.max - x.min) as u64,
79 ErasedStats::U64(x) => x.max - x.min,
80 ErasedStats::I8(x) => (x.max as i16 - x.min as i16) as u64,
81 ErasedStats::I16(x) => (x.max as i32 - x.min as i32) as u64,
82 ErasedStats::I32(x) => (x.max as i64 - x.min as i64) as u64,
83 ErasedStats::I64(x) => u64::try_from(x.max as i128 - x.min as i128)
84 .vortex_expect("max minus min result bigger than u64"),
85 }
86 }
87
88 pub fn top_value_and_count(&self) -> (PValue, u32) {
90 match &self {
91 ErasedStats::U8(x) => (x.top_value.into(), x.top_count),
92 ErasedStats::U16(x) => (x.top_value.into(), x.top_count),
93 ErasedStats::U32(x) => (x.top_value.into(), x.top_count),
94 ErasedStats::U64(x) => (x.top_value.into(), x.top_count),
95 ErasedStats::I8(x) => (x.top_value.into(), x.top_count),
96 ErasedStats::I16(x) => (x.top_value.into(), x.top_count),
97 ErasedStats::I32(x) => (x.top_value.into(), x.top_count),
98 ErasedStats::I64(x) => (x.top_value.into(), x.top_count),
99 }
100 }
101}
102
103macro_rules! impl_from_typed {
104 ($T:ty, $variant:path) => {
105 impl From<TypedStats<$T>> for ErasedStats {
106 fn from(typed: TypedStats<$T>) -> Self {
107 $variant(typed)
108 }
109 }
110 };
111}
112
113impl_from_typed!(u8, ErasedStats::U8);
114impl_from_typed!(u16, ErasedStats::U16);
115impl_from_typed!(u32, ErasedStats::U32);
116impl_from_typed!(u64, ErasedStats::U64);
117impl_from_typed!(i8, ErasedStats::I8);
118impl_from_typed!(i16, ErasedStats::I16);
119impl_from_typed!(i32, ErasedStats::I32);
120impl_from_typed!(i64, ErasedStats::I64);
121
122#[derive(Clone, Debug)]
123pub struct IntegerStats {
124 pub(super) src: PrimitiveArray,
125 pub(super) null_count: u32,
127 pub(super) value_count: u32,
129 pub(super) average_run_length: u32,
130 pub(super) distinct_values_count: u32,
131 pub(crate) typed: ErasedStats,
132}
133
134impl CompressorStats for IntegerStats {
135 type ArrayVTable = PrimitiveVTable;
136
137 fn generate_opts(input: &PrimitiveArray, opts: GenerateStatsOptions) -> Self {
138 match_each_integer_ptype!(input.ptype(), |T| {
139 typed_int_stats::<T>(input, opts.count_distinct_values)
140 })
141 }
142
143 fn source(&self) -> &PrimitiveArray {
144 &self.src
145 }
146
147 fn sample_opts(&self, sample_size: u32, sample_count: u32, opts: GenerateStatsOptions) -> Self {
148 let sampled = sample(self.src.as_ref(), sample_size, sample_count).to_primitive();
149
150 Self::generate_opts(&sampled, opts)
151 }
152}
153
154fn typed_int_stats<T>(array: &PrimitiveArray, count_distinct_values: bool) -> IntegerStats
155where
156 T: NativePType + PrimInt + for<'a> TryFrom<&'a Scalar, Error = VortexError>,
157 TypedStats<T>: Into<ErasedStats>,
158 NativeValue<T>: Eq + Hash,
159{
160 if array.is_empty() {
162 return IntegerStats {
163 src: array.clone(),
164 null_count: 0,
165 value_count: 0,
166 average_run_length: 0,
167 distinct_values_count: 0,
168 typed: TypedStats {
169 min: T::max_value(),
170 max: T::min_value(),
171 top_value: T::default(),
172 top_count: 0,
173 distinct_values: HashMap::with_hasher(FxBuildHasher),
174 }
175 .into(),
176 };
177 } else if array.all_invalid() {
178 return IntegerStats {
179 src: array.clone(),
180 null_count: array.len().try_into().vortex_expect("null_count"),
181 value_count: 0,
182 average_run_length: 0,
183 distinct_values_count: 0,
184 typed: TypedStats {
185 min: T::max_value(),
186 max: T::min_value(),
187 top_value: T::default(),
188 top_count: 0,
189 distinct_values: HashMap::with_hasher(FxBuildHasher),
190 }
191 .into(),
192 };
193 }
194
195 let validity = array.validity_mask();
196 let null_count = validity.false_count();
197 let value_count = validity.true_count();
198
199 let head_idx = validity
201 .first()
202 .vortex_expect("All null masks have been handled before");
203 let buffer = array.buffer::<T>();
204 let head = buffer[head_idx];
205
206 let mut loop_state = LoopState {
207 distinct_values: if count_distinct_values {
208 HashMap::with_capacity_and_hasher(array.len() / 2, FxBuildHasher)
209 } else {
210 HashMap::with_hasher(FxBuildHasher)
211 },
212 prev: head,
213 runs: 1,
214 };
215
216 let sliced = buffer.slice(head_idx..array.len());
217 let mut chunks = sliced.as_slice().chunks_exact(64);
218 match validity.boolean_buffer() {
219 AllOr::All => {
220 for chunk in &mut chunks {
221 inner_loop_nonnull(
222 chunk.try_into().vortex_unwrap(),
223 count_distinct_values,
224 &mut loop_state,
225 )
226 }
227 let remainder = chunks.remainder();
228 inner_loop_naive(
229 remainder,
230 count_distinct_values,
231 &BooleanBuffer::new_set(remainder.len()),
232 &mut loop_state,
233 );
234 }
235 AllOr::None => unreachable!("All invalid arrays have been handled before"),
236 AllOr::Some(v) => {
237 let mask = v.slice(head_idx, array.len() - head_idx);
238 let mut offset = 0;
239 for chunk in &mut chunks {
240 let validity = mask.slice(offset, 64);
241 offset += 64;
242
243 match validity.count_set_bits() {
244 0 => continue,
246 64 => inner_loop_nonnull(
248 chunk.try_into().vortex_unwrap(),
249 count_distinct_values,
250 &mut loop_state,
251 ),
252 _ => inner_loop_nullable(
254 chunk.try_into().vortex_unwrap(),
255 count_distinct_values,
256 &validity,
257 &mut loop_state,
258 ),
259 }
260 }
261 let remainder = chunks.remainder();
263 inner_loop_naive(
264 remainder,
265 count_distinct_values,
266 &mask.slice(offset, remainder.len()),
267 &mut loop_state,
268 );
269 }
270 }
271
272 let (top_value, top_count) = if count_distinct_values {
273 let (&top_value, &top_count) = loop_state
274 .distinct_values
275 .iter()
276 .max_by_key(|&(_, &count)| count)
277 .vortex_expect("non-empty");
278 (top_value.0, top_count)
279 } else {
280 (T::default(), 0)
281 };
282
283 let runs = loop_state.runs;
284 let distinct_values_count = if count_distinct_values {
285 loop_state.distinct_values.len().try_into().vortex_unwrap()
286 } else {
287 u32::MAX
288 };
289
290 let min = array
291 .statistics()
292 .compute_as::<T>(Stat::Min)
293 .vortex_expect("min should be computed");
294
295 let max = array
296 .statistics()
297 .compute_as::<T>(Stat::Max)
298 .vortex_expect("max should be computed");
299
300 let typed = TypedStats {
301 min,
302 max,
303 distinct_values: loop_state.distinct_values,
304 top_value,
305 top_count,
306 };
307
308 let null_count = null_count
309 .try_into()
310 .vortex_expect("null_count must fit in u32");
311 let value_count = value_count
312 .try_into()
313 .vortex_expect("value_count must fit in u32");
314
315 IntegerStats {
316 src: array.clone(),
317 null_count,
318 value_count,
319 average_run_length: value_count / runs,
320 distinct_values_count,
321 typed: typed.into(),
322 }
323}
324
325struct LoopState<T> {
326 prev: T,
327 runs: u32,
328 distinct_values: HashMap<NativeValue<T>, u32, FxBuildHasher>,
329}
330
331#[inline(always)]
332fn inner_loop_nonnull<T: NativePType>(
333 values: &[T; 64],
334 count_distinct_values: bool,
335 state: &mut LoopState<T>,
336) where
337 NativeValue<T>: Eq + Hash,
338{
339 for &value in values {
340 if count_distinct_values {
341 *state.distinct_values.entry(NativeValue(value)).or_insert(0) += 1;
342 }
343
344 if value != state.prev {
345 state.prev = value;
346 state.runs += 1;
347 }
348 }
349}
350
351#[inline(always)]
352fn inner_loop_nullable<T: NativePType>(
353 values: &[T; 64],
354 count_distinct_values: bool,
355 is_valid: &BooleanBuffer,
356 state: &mut LoopState<T>,
357) where
358 NativeValue<T>: Eq + Hash,
359{
360 for (idx, &value) in values.iter().enumerate() {
361 if is_valid.value(idx) {
362 if count_distinct_values {
363 *state.distinct_values.entry(NativeValue(value)).or_insert(0) += 1;
364 }
365
366 if value != state.prev {
367 state.prev = value;
368 state.runs += 1;
369 }
370 }
371 }
372}
373
374#[inline(always)]
375fn inner_loop_naive<T: NativePType>(
376 values: &[T],
377 count_distinct_values: bool,
378 is_valid: &BooleanBuffer,
379 state: &mut LoopState<T>,
380) where
381 NativeValue<T>: Eq + Hash,
382{
383 for (idx, &value) in values.iter().enumerate() {
384 if is_valid.value(idx) {
385 if count_distinct_values {
386 *state.distinct_values.entry(NativeValue(value)).or_insert(0) += 1;
387 }
388
389 if value != state.prev {
390 state.prev = value;
391 state.runs += 1;
392 }
393 }
394 }
395}
396
397#[cfg(test)]
398mod tests {
399 use std::iter;
400
401 use arrow_buffer::BooleanBuffer;
402 use vortex_array::arrays::PrimitiveArray;
403 use vortex_array::validity::Validity;
404 use vortex_buffer::{Buffer, buffer};
405
406 use crate::CompressorStats;
407 use crate::integer::IntegerStats;
408 use crate::integer::stats::typed_int_stats;
409
410 #[test]
411 fn test_naive_count_distinct_values() {
412 let array = PrimitiveArray::new(buffer![217u8, 0], Validity::NonNullable);
413 let stats = typed_int_stats::<u8>(&array, true);
414 assert_eq!(stats.distinct_values_count, 2);
415 }
416
417 #[test]
418 fn test_naive_count_distinct_values_nullable() {
419 let array = PrimitiveArray::new(
420 buffer![217u8, 0],
421 Validity::from(BooleanBuffer::from(vec![true, false])),
422 );
423 let stats = typed_int_stats::<u8>(&array, true);
424 assert_eq!(stats.distinct_values_count, 1);
425 }
426
427 #[test]
428 fn test_count_distinct_values() {
429 let array = PrimitiveArray::new((0..128u8).collect::<Buffer<u8>>(), Validity::NonNullable);
430 let stats = typed_int_stats::<u8>(&array, true);
431 assert_eq!(stats.distinct_values_count, 128);
432 }
433
434 #[test]
435 fn test_count_distinct_values_nullable() {
436 let array = PrimitiveArray::new(
437 (0..128u8).collect::<Buffer<u8>>(),
438 Validity::from(BooleanBuffer::from_iter(
439 iter::repeat_n(vec![true, false], 64).flatten(),
440 )),
441 );
442 let stats = typed_int_stats::<u8>(&array, true);
443 assert_eq!(stats.distinct_values_count, 64);
444 }
445
446 #[test]
447 fn test_integer_stats_leading_nulls() {
448 let ints = PrimitiveArray::new(buffer![0, 1, 2], Validity::from_iter([false, true, true]));
449
450 let stats = IntegerStats::generate(&ints);
451
452 assert_eq!(stats.value_count, 2);
453 assert_eq!(stats.null_count, 1);
454 assert_eq!(stats.average_run_length, 1);
455 assert_eq!(stats.distinct_values_count, 2);
456 }
457}