1use std::hash::Hash;
2
3use arrow_buffer::BooleanBuffer;
4use num_traits::PrimInt;
5use rustc_hash::FxBuildHasher;
6use vortex_array::aliases::hash_map::HashMap;
7use vortex_array::arrays::{NativeValue, PrimitiveArray};
8use vortex_array::stats::Stat;
9use vortex_array::variants::PrimitiveArrayTrait;
10use vortex_array::{Array, ToCanonical};
11use vortex_dtype::{NativePType, match_each_integer_ptype};
12use vortex_error::{VortexError, VortexExpect, VortexUnwrap};
13use vortex_mask::AllOr;
14use vortex_scalar::{PValue, ScalarValue};
15
16use crate::sample::sample;
17use crate::{CompressorStats, GenerateStatsOptions};
18
19#[derive(Clone, Debug)]
20pub struct TypedStats<T> {
21 pub min: T,
22 pub max: T,
23 pub top_value: T,
24 pub top_count: u32,
25 pub distinct_values: HashMap<NativeValue<T>, u32, FxBuildHasher>,
26}
27
28#[derive(Clone, Debug)]
33pub enum ErasedStats {
34 U8(TypedStats<u8>),
35 U16(TypedStats<u16>),
36 U32(TypedStats<u32>),
37 U64(TypedStats<u64>),
38 I8(TypedStats<i8>),
39 I16(TypedStats<i16>),
40 I32(TypedStats<i32>),
41 I64(TypedStats<i64>),
42}
43
44impl ErasedStats {
45 pub fn min_is_zero(&self) -> bool {
46 match &self {
47 ErasedStats::U8(x) => x.min == 0,
48 ErasedStats::U16(x) => x.min == 0,
49 ErasedStats::U32(x) => x.min == 0,
50 ErasedStats::U64(x) => x.min == 0,
51 ErasedStats::I8(x) => x.min == 0,
52 ErasedStats::I16(x) => x.min == 0,
53 ErasedStats::I32(x) => x.min == 0,
54 ErasedStats::I64(x) => x.min == 0,
55 }
56 }
57
58 pub fn min_is_negative(&self) -> bool {
59 match &self {
60 ErasedStats::U8(_)
61 | ErasedStats::U16(_)
62 | ErasedStats::U32(_)
63 | ErasedStats::U64(_) => false,
64 ErasedStats::I8(x) => x.min < 0,
65 ErasedStats::I16(x) => x.min < 0,
66 ErasedStats::I32(x) => x.min < 0,
67 ErasedStats::I64(x) => x.min < 0,
68 }
69 }
70
71 pub fn max_minus_min(&self) -> u64 {
73 match &self {
74 ErasedStats::U8(x) => (x.max - x.min) as u64,
75 ErasedStats::U16(x) => (x.max - x.min) as u64,
76 ErasedStats::U32(x) => (x.max - x.min) as u64,
77 ErasedStats::U64(x) => x.max - x.min,
78 ErasedStats::I8(x) => (x.max as i16 - x.min as i16) as u64,
79 ErasedStats::I16(x) => (x.max as i32 - x.min as i32) as u64,
80 ErasedStats::I32(x) => (x.max as i64 - x.min as i64) as u64,
81 ErasedStats::I64(x) => u64::try_from(x.max as i128 - x.min as i128)
82 .vortex_expect("max minus min result bigger than u64"),
83 }
84 }
85
86 pub fn top_value_and_count(&self) -> (PValue, u32) {
88 match &self {
89 ErasedStats::U8(x) => (x.top_value.into(), x.top_count),
90 ErasedStats::U16(x) => (x.top_value.into(), x.top_count),
91 ErasedStats::U32(x) => (x.top_value.into(), x.top_count),
92 ErasedStats::U64(x) => (x.top_value.into(), x.top_count),
93 ErasedStats::I8(x) => (x.top_value.into(), x.top_count),
94 ErasedStats::I16(x) => (x.top_value.into(), x.top_count),
95 ErasedStats::I32(x) => (x.top_value.into(), x.top_count),
96 ErasedStats::I64(x) => (x.top_value.into(), x.top_count),
97 }
98 }
99}
100
101macro_rules! impl_from_typed {
102 ($T:ty, $variant:path) => {
103 impl From<TypedStats<$T>> for ErasedStats {
104 fn from(typed: TypedStats<$T>) -> Self {
105 $variant(typed)
106 }
107 }
108 };
109}
110
111impl_from_typed!(u8, ErasedStats::U8);
112impl_from_typed!(u16, ErasedStats::U16);
113impl_from_typed!(u32, ErasedStats::U32);
114impl_from_typed!(u64, ErasedStats::U64);
115impl_from_typed!(i8, ErasedStats::I8);
116impl_from_typed!(i16, ErasedStats::I16);
117impl_from_typed!(i32, ErasedStats::I32);
118impl_from_typed!(i64, ErasedStats::I64);
119
120#[derive(Clone, Debug)]
121pub struct IntegerStats {
122 pub(super) src: PrimitiveArray,
123 pub(super) null_count: u32,
125 pub(super) value_count: u32,
127 pub(super) average_run_length: u32,
128 pub(super) distinct_values_count: u32,
129 pub(crate) typed: ErasedStats,
130}
131
132impl CompressorStats for IntegerStats {
133 type ArrayType = PrimitiveArray;
134
135 fn generate_opts(input: &PrimitiveArray, opts: GenerateStatsOptions) -> Self {
136 match_each_integer_ptype!(input.ptype(), |$T| {
137 typed_int_stats::<$T>(input, opts.count_distinct_values)
138 })
139 }
140
141 fn source(&self) -> &PrimitiveArray {
142 &self.src
143 }
144
145 fn sample_opts(&self, sample_size: u32, sample_count: u32, opts: GenerateStatsOptions) -> Self {
146 let sampled = sample(self.src.clone(), sample_size, sample_count)
147 .to_primitive()
148 .vortex_expect("primitive");
149
150 Self::generate_opts(&sampled, opts)
151 }
152}
153
154fn typed_int_stats<T>(array: &PrimitiveArray, count_distinct_values: bool) -> IntegerStats
155where
156 T: NativePType + PrimInt + for<'a> TryFrom<&'a ScalarValue, Error = VortexError>,
157 TypedStats<T>: Into<ErasedStats>,
158 NativeValue<T>: Eq + Hash,
159{
160 if array.is_empty() {
162 return IntegerStats {
163 src: array.clone(),
164 null_count: 0,
165 value_count: 0,
166 average_run_length: 0,
167 distinct_values_count: 0,
168 typed: TypedStats {
169 min: T::max_value(),
170 max: T::min_value(),
171 top_value: T::default(),
172 top_count: 0,
173 distinct_values: HashMap::with_hasher(FxBuildHasher),
174 }
175 .into(),
176 };
177 } else if array.all_invalid().vortex_expect("all_invalid") {
178 return IntegerStats {
179 src: array.clone(),
180 null_count: array.len().try_into().vortex_expect("null_count"),
181 value_count: 0,
182 average_run_length: 0,
183 distinct_values_count: 0,
184 typed: TypedStats {
185 min: T::max_value(),
186 max: T::min_value(),
187 top_value: T::default(),
188 top_count: 0,
189 distinct_values: HashMap::with_hasher(FxBuildHasher),
190 }
191 .into(),
192 };
193 }
194
195 let validity = array.validity_mask().vortex_expect("logical_validity");
196 let null_count = validity.false_count();
197 let value_count = validity.true_count();
198
199 let head_idx = validity
201 .first()
202 .vortex_expect("All null masks have been handled before");
203 let buffer = array.buffer::<T>();
204 let head = buffer[head_idx];
205
206 let mut loop_state = LoopState {
207 distinct_values: if count_distinct_values {
208 HashMap::with_capacity_and_hasher(array.len() / 2, FxBuildHasher)
209 } else {
210 HashMap::with_hasher(FxBuildHasher)
211 },
212 prev: head,
213 runs: 1,
214 };
215
216 let sliced = buffer.slice(head_idx..array.len());
217 let mut chunks = sliced.as_slice().array_chunks::<64>();
218 match validity.boolean_buffer() {
219 AllOr::All => {
220 for chunk in &mut chunks {
221 inner_loop_nonnull(chunk, count_distinct_values, &mut loop_state)
222 }
223 let remainder = chunks.remainder();
224 inner_loop_naive(
225 remainder,
226 count_distinct_values,
227 &BooleanBuffer::new_set(remainder.len()),
228 &mut loop_state,
229 );
230 }
231 AllOr::None => unreachable!("All invalid arrays have been handled before"),
232 AllOr::Some(v) => {
233 let mask = v.slice(head_idx, array.len() - head_idx);
234 let mut offset = 0;
235 for chunk in &mut chunks {
236 let validity = mask.slice(offset, 64);
237 offset += 64;
238
239 match validity.count_set_bits() {
240 0 => continue,
242 64 => inner_loop_nonnull(chunk, count_distinct_values, &mut loop_state),
244 _ => inner_loop_nullable(
246 chunk,
247 count_distinct_values,
248 &validity,
249 &mut loop_state,
250 ),
251 }
252 }
253 let remainder = chunks.remainder();
255 inner_loop_naive(
256 remainder,
257 count_distinct_values,
258 &mask.slice(offset, remainder.len()),
259 &mut loop_state,
260 );
261 }
262 }
263
264 let (top_value, top_count) = if count_distinct_values {
265 let (&top_value, &top_count) = loop_state
266 .distinct_values
267 .iter()
268 .max_by_key(|&(_, &count)| count)
269 .vortex_expect("non-empty");
270 (top_value.0, top_count)
271 } else {
272 (T::default(), 0)
273 };
274
275 let runs = loop_state.runs;
276 let distinct_values_count = if count_distinct_values {
277 loop_state.distinct_values.len().try_into().vortex_unwrap()
278 } else {
279 u32::MAX
280 };
281
282 let min = array
283 .statistics()
284 .compute_as::<T>(Stat::Min)
285 .vortex_expect("min should be computed");
286
287 let max = array
288 .statistics()
289 .compute_as::<T>(Stat::Max)
290 .vortex_expect("max should be computed");
291
292 let typed = TypedStats {
293 min,
294 max,
295 distinct_values: loop_state.distinct_values,
296 top_value,
297 top_count,
298 };
299
300 let null_count = null_count
301 .try_into()
302 .vortex_expect("null_count must fit in u32");
303 let value_count = value_count
304 .try_into()
305 .vortex_expect("value_count must fit in u32");
306
307 IntegerStats {
308 src: array.clone(),
309 null_count,
310 value_count,
311 average_run_length: value_count / runs,
312 distinct_values_count,
313 typed: typed.into(),
314 }
315}
316
317struct LoopState<T> {
318 prev: T,
319 runs: u32,
320 distinct_values: HashMap<NativeValue<T>, u32, FxBuildHasher>,
321}
322
323#[inline(always)]
324fn inner_loop_nonnull<T: NativePType>(
325 values: &[T; 64],
326 count_distinct_values: bool,
327 state: &mut LoopState<T>,
328) where
329 NativeValue<T>: Eq + Hash,
330{
331 for &value in values {
332 if count_distinct_values {
333 *state.distinct_values.entry(NativeValue(value)).or_insert(0) += 1;
334 }
335
336 if value != state.prev {
337 state.prev = value;
338 state.runs += 1;
339 }
340 }
341}
342
343#[inline(always)]
344fn inner_loop_nullable<T: NativePType>(
345 values: &[T; 64],
346 count_distinct_values: bool,
347 is_valid: &BooleanBuffer,
348 state: &mut LoopState<T>,
349) where
350 NativeValue<T>: Eq + Hash,
351{
352 for (idx, &value) in values.iter().enumerate() {
353 if is_valid.value(idx) {
354 if count_distinct_values {
355 *state.distinct_values.entry(NativeValue(value)).or_insert(0) += 1;
356 }
357
358 if value != state.prev {
359 state.prev = value;
360 state.runs += 1;
361 }
362 }
363 }
364}
365
366#[inline(always)]
367fn inner_loop_naive<T: NativePType>(
368 values: &[T],
369 count_distinct_values: bool,
370 is_valid: &BooleanBuffer,
371 state: &mut LoopState<T>,
372) where
373 NativeValue<T>: Eq + Hash,
374{
375 for (idx, &value) in values.iter().enumerate() {
376 if is_valid.value(idx) {
377 if count_distinct_values {
378 *state.distinct_values.entry(NativeValue(value)).or_insert(0) += 1;
379 }
380
381 if value != state.prev {
382 state.prev = value;
383 state.runs += 1;
384 }
385 }
386 }
387}
388
389#[cfg(test)]
390mod tests {
391 use std::iter;
392
393 use arrow_buffer::BooleanBuffer;
394 use vortex_array::arrays::PrimitiveArray;
395 use vortex_array::validity::Validity;
396 use vortex_buffer::{Buffer, buffer};
397
398 use crate::CompressorStats;
399 use crate::integer::IntegerStats;
400 use crate::integer::stats::typed_int_stats;
401
402 #[test]
403 fn test_naive_count_distinct_values() {
404 let array = PrimitiveArray::new(buffer![217u8, 0], Validity::NonNullable);
405 let stats = typed_int_stats::<u8>(&array, true);
406 assert_eq!(stats.distinct_values_count, 2);
407 }
408
409 #[test]
410 fn test_naive_count_distinct_values_nullable() {
411 let array = PrimitiveArray::new(
412 buffer![217u8, 0],
413 Validity::from(BooleanBuffer::from(vec![true, false])),
414 );
415 let stats = typed_int_stats::<u8>(&array, true);
416 assert_eq!(stats.distinct_values_count, 1);
417 }
418
419 #[test]
420 fn test_count_distinct_values() {
421 let array = PrimitiveArray::new((0..128u8).collect::<Buffer<u8>>(), Validity::NonNullable);
422 let stats = typed_int_stats::<u8>(&array, true);
423 assert_eq!(stats.distinct_values_count, 128);
424 }
425
426 #[test]
427 fn test_count_distinct_values_nullable() {
428 let array = PrimitiveArray::new(
429 (0..128u8).collect::<Buffer<u8>>(),
430 Validity::from(BooleanBuffer::from_iter(
431 iter::repeat_n(vec![true, false], 64).flatten(),
432 )),
433 );
434 let stats = typed_int_stats::<u8>(&array, true);
435 assert_eq!(stats.distinct_values_count, 64);
436 }
437
438 #[test]
439 fn test_integer_stats_leading_nulls() {
440 let ints = PrimitiveArray::new(buffer![0, 1, 2], Validity::from_iter([false, true, true]));
441
442 let stats = IntegerStats::generate(&ints);
443
444 assert_eq!(stats.value_count, 2);
445 assert_eq!(stats.null_count, 1);
446 assert_eq!(stats.average_run_length, 1);
447 assert_eq!(stats.distinct_values_count, 2);
448 }
449}