1use std::hash::Hash;
5
6use arrow_buffer::BooleanBuffer;
7use num_traits::PrimInt;
8use rustc_hash::FxBuildHasher;
9use vortex_array::ToCanonical;
10use vortex_array::arrays::{NativeValue, PrimitiveArray, PrimitiveVTable};
11use vortex_array::stats::Stat;
12use vortex_dtype::{NativePType, match_each_integer_ptype};
13use vortex_error::{VortexError, VortexExpect, VortexUnwrap};
14use vortex_mask::AllOr;
15use vortex_scalar::{PValue, ScalarValue};
16use vortex_utils::aliases::hash_map::HashMap;
17
18use crate::sample::sample;
19use crate::{CompressorStats, GenerateStatsOptions};
20
21#[derive(Clone, Debug)]
22pub struct TypedStats<T> {
23 pub min: T,
24 pub max: T,
25 pub top_value: T,
26 pub top_count: u32,
27 pub distinct_values: HashMap<NativeValue<T>, u32, FxBuildHasher>,
28}
29
30#[derive(Clone, Debug)]
35pub enum ErasedStats {
36 U8(TypedStats<u8>),
37 U16(TypedStats<u16>),
38 U32(TypedStats<u32>),
39 U64(TypedStats<u64>),
40 I8(TypedStats<i8>),
41 I16(TypedStats<i16>),
42 I32(TypedStats<i32>),
43 I64(TypedStats<i64>),
44}
45
46impl ErasedStats {
47 pub fn min_is_zero(&self) -> bool {
48 match &self {
49 ErasedStats::U8(x) => x.min == 0,
50 ErasedStats::U16(x) => x.min == 0,
51 ErasedStats::U32(x) => x.min == 0,
52 ErasedStats::U64(x) => x.min == 0,
53 ErasedStats::I8(x) => x.min == 0,
54 ErasedStats::I16(x) => x.min == 0,
55 ErasedStats::I32(x) => x.min == 0,
56 ErasedStats::I64(x) => x.min == 0,
57 }
58 }
59
60 pub fn min_is_negative(&self) -> bool {
61 match &self {
62 ErasedStats::U8(_)
63 | ErasedStats::U16(_)
64 | ErasedStats::U32(_)
65 | ErasedStats::U64(_) => false,
66 ErasedStats::I8(x) => x.min < 0,
67 ErasedStats::I16(x) => x.min < 0,
68 ErasedStats::I32(x) => x.min < 0,
69 ErasedStats::I64(x) => x.min < 0,
70 }
71 }
72
73 pub fn max_minus_min(&self) -> u64 {
75 match &self {
76 ErasedStats::U8(x) => (x.max - x.min) as u64,
77 ErasedStats::U16(x) => (x.max - x.min) as u64,
78 ErasedStats::U32(x) => (x.max - x.min) as u64,
79 ErasedStats::U64(x) => x.max - x.min,
80 ErasedStats::I8(x) => (x.max as i16 - x.min as i16) as u64,
81 ErasedStats::I16(x) => (x.max as i32 - x.min as i32) as u64,
82 ErasedStats::I32(x) => (x.max as i64 - x.min as i64) as u64,
83 ErasedStats::I64(x) => u64::try_from(x.max as i128 - x.min as i128)
84 .vortex_expect("max minus min result bigger than u64"),
85 }
86 }
87
88 pub fn top_value_and_count(&self) -> (PValue, u32) {
90 match &self {
91 ErasedStats::U8(x) => (x.top_value.into(), x.top_count),
92 ErasedStats::U16(x) => (x.top_value.into(), x.top_count),
93 ErasedStats::U32(x) => (x.top_value.into(), x.top_count),
94 ErasedStats::U64(x) => (x.top_value.into(), x.top_count),
95 ErasedStats::I8(x) => (x.top_value.into(), x.top_count),
96 ErasedStats::I16(x) => (x.top_value.into(), x.top_count),
97 ErasedStats::I32(x) => (x.top_value.into(), x.top_count),
98 ErasedStats::I64(x) => (x.top_value.into(), x.top_count),
99 }
100 }
101}
102
103macro_rules! impl_from_typed {
104 ($T:ty, $variant:path) => {
105 impl From<TypedStats<$T>> for ErasedStats {
106 fn from(typed: TypedStats<$T>) -> Self {
107 $variant(typed)
108 }
109 }
110 };
111}
112
113impl_from_typed!(u8, ErasedStats::U8);
114impl_from_typed!(u16, ErasedStats::U16);
115impl_from_typed!(u32, ErasedStats::U32);
116impl_from_typed!(u64, ErasedStats::U64);
117impl_from_typed!(i8, ErasedStats::I8);
118impl_from_typed!(i16, ErasedStats::I16);
119impl_from_typed!(i32, ErasedStats::I32);
120impl_from_typed!(i64, ErasedStats::I64);
121
122#[derive(Clone, Debug)]
123pub struct IntegerStats {
124 pub(super) src: PrimitiveArray,
125 pub(super) null_count: u32,
127 pub(super) value_count: u32,
129 pub(super) average_run_length: u32,
130 pub(super) distinct_values_count: u32,
131 pub(crate) typed: ErasedStats,
132}
133
134impl CompressorStats for IntegerStats {
135 type ArrayVTable = PrimitiveVTable;
136
137 fn generate_opts(input: &PrimitiveArray, opts: GenerateStatsOptions) -> Self {
138 match_each_integer_ptype!(input.ptype(), |T| {
139 typed_int_stats::<T>(input, opts.count_distinct_values)
140 })
141 }
142
143 fn source(&self) -> &PrimitiveArray {
144 &self.src
145 }
146
147 fn sample_opts(&self, sample_size: u32, sample_count: u32, opts: GenerateStatsOptions) -> Self {
148 let sampled = sample(self.src.as_ref(), sample_size, sample_count)
149 .to_primitive()
150 .vortex_expect("primitive");
151
152 Self::generate_opts(&sampled, opts)
153 }
154}
155
156fn typed_int_stats<T>(array: &PrimitiveArray, count_distinct_values: bool) -> IntegerStats
157where
158 T: NativePType + PrimInt + for<'a> TryFrom<&'a ScalarValue, Error = VortexError>,
159 TypedStats<T>: Into<ErasedStats>,
160 NativeValue<T>: Eq + Hash,
161{
162 if array.is_empty() {
164 return IntegerStats {
165 src: array.clone(),
166 null_count: 0,
167 value_count: 0,
168 average_run_length: 0,
169 distinct_values_count: 0,
170 typed: TypedStats {
171 min: T::max_value(),
172 max: T::min_value(),
173 top_value: T::default(),
174 top_count: 0,
175 distinct_values: HashMap::with_hasher(FxBuildHasher),
176 }
177 .into(),
178 };
179 } else if array.all_invalid().vortex_expect("all_invalid") {
180 return IntegerStats {
181 src: array.clone(),
182 null_count: array.len().try_into().vortex_expect("null_count"),
183 value_count: 0,
184 average_run_length: 0,
185 distinct_values_count: 0,
186 typed: TypedStats {
187 min: T::max_value(),
188 max: T::min_value(),
189 top_value: T::default(),
190 top_count: 0,
191 distinct_values: HashMap::with_hasher(FxBuildHasher),
192 }
193 .into(),
194 };
195 }
196
197 let validity = array.validity_mask().vortex_expect("logical_validity");
198 let null_count = validity.false_count();
199 let value_count = validity.true_count();
200
201 let head_idx = validity
203 .first()
204 .vortex_expect("All null masks have been handled before");
205 let buffer = array.buffer::<T>();
206 let head = buffer[head_idx];
207
208 let mut loop_state = LoopState {
209 distinct_values: if count_distinct_values {
210 HashMap::with_capacity_and_hasher(array.len() / 2, FxBuildHasher)
211 } else {
212 HashMap::with_hasher(FxBuildHasher)
213 },
214 prev: head,
215 runs: 1,
216 };
217
218 let sliced = buffer.slice(head_idx..array.len());
219 let mut chunks = sliced.as_slice().chunks_exact(64);
220 match validity.boolean_buffer() {
221 AllOr::All => {
222 for chunk in &mut chunks {
223 inner_loop_nonnull(
224 chunk.try_into().vortex_unwrap(),
225 count_distinct_values,
226 &mut loop_state,
227 )
228 }
229 let remainder = chunks.remainder();
230 inner_loop_naive(
231 remainder,
232 count_distinct_values,
233 &BooleanBuffer::new_set(remainder.len()),
234 &mut loop_state,
235 );
236 }
237 AllOr::None => unreachable!("All invalid arrays have been handled before"),
238 AllOr::Some(v) => {
239 let mask = v.slice(head_idx, array.len() - head_idx);
240 let mut offset = 0;
241 for chunk in &mut chunks {
242 let validity = mask.slice(offset, 64);
243 offset += 64;
244
245 match validity.count_set_bits() {
246 0 => continue,
248 64 => inner_loop_nonnull(
250 chunk.try_into().vortex_unwrap(),
251 count_distinct_values,
252 &mut loop_state,
253 ),
254 _ => inner_loop_nullable(
256 chunk.try_into().vortex_unwrap(),
257 count_distinct_values,
258 &validity,
259 &mut loop_state,
260 ),
261 }
262 }
263 let remainder = chunks.remainder();
265 inner_loop_naive(
266 remainder,
267 count_distinct_values,
268 &mask.slice(offset, remainder.len()),
269 &mut loop_state,
270 );
271 }
272 }
273
274 let (top_value, top_count) = if count_distinct_values {
275 let (&top_value, &top_count) = loop_state
276 .distinct_values
277 .iter()
278 .max_by_key(|&(_, &count)| count)
279 .vortex_expect("non-empty");
280 (top_value.0, top_count)
281 } else {
282 (T::default(), 0)
283 };
284
285 let runs = loop_state.runs;
286 let distinct_values_count = if count_distinct_values {
287 loop_state.distinct_values.len().try_into().vortex_unwrap()
288 } else {
289 u32::MAX
290 };
291
292 let min = array
293 .statistics()
294 .compute_as::<T>(Stat::Min)
295 .vortex_expect("min should be computed");
296
297 let max = array
298 .statistics()
299 .compute_as::<T>(Stat::Max)
300 .vortex_expect("max should be computed");
301
302 let typed = TypedStats {
303 min,
304 max,
305 distinct_values: loop_state.distinct_values,
306 top_value,
307 top_count,
308 };
309
310 let null_count = null_count
311 .try_into()
312 .vortex_expect("null_count must fit in u32");
313 let value_count = value_count
314 .try_into()
315 .vortex_expect("value_count must fit in u32");
316
317 IntegerStats {
318 src: array.clone(),
319 null_count,
320 value_count,
321 average_run_length: value_count / runs,
322 distinct_values_count,
323 typed: typed.into(),
324 }
325}
326
327struct LoopState<T> {
328 prev: T,
329 runs: u32,
330 distinct_values: HashMap<NativeValue<T>, u32, FxBuildHasher>,
331}
332
333#[inline(always)]
334fn inner_loop_nonnull<T: NativePType>(
335 values: &[T; 64],
336 count_distinct_values: bool,
337 state: &mut LoopState<T>,
338) where
339 NativeValue<T>: Eq + Hash,
340{
341 for &value in values {
342 if count_distinct_values {
343 *state.distinct_values.entry(NativeValue(value)).or_insert(0) += 1;
344 }
345
346 if value != state.prev {
347 state.prev = value;
348 state.runs += 1;
349 }
350 }
351}
352
353#[inline(always)]
354fn inner_loop_nullable<T: NativePType>(
355 values: &[T; 64],
356 count_distinct_values: bool,
357 is_valid: &BooleanBuffer,
358 state: &mut LoopState<T>,
359) where
360 NativeValue<T>: Eq + Hash,
361{
362 for (idx, &value) in values.iter().enumerate() {
363 if is_valid.value(idx) {
364 if count_distinct_values {
365 *state.distinct_values.entry(NativeValue(value)).or_insert(0) += 1;
366 }
367
368 if value != state.prev {
369 state.prev = value;
370 state.runs += 1;
371 }
372 }
373 }
374}
375
376#[inline(always)]
377fn inner_loop_naive<T: NativePType>(
378 values: &[T],
379 count_distinct_values: bool,
380 is_valid: &BooleanBuffer,
381 state: &mut LoopState<T>,
382) where
383 NativeValue<T>: Eq + Hash,
384{
385 for (idx, &value) in values.iter().enumerate() {
386 if is_valid.value(idx) {
387 if count_distinct_values {
388 *state.distinct_values.entry(NativeValue(value)).or_insert(0) += 1;
389 }
390
391 if value != state.prev {
392 state.prev = value;
393 state.runs += 1;
394 }
395 }
396 }
397}
398
399#[cfg(test)]
400mod tests {
401 use std::iter;
402
403 use arrow_buffer::BooleanBuffer;
404 use vortex_array::arrays::PrimitiveArray;
405 use vortex_array::validity::Validity;
406 use vortex_buffer::{Buffer, buffer};
407
408 use crate::CompressorStats;
409 use crate::integer::IntegerStats;
410 use crate::integer::stats::typed_int_stats;
411
412 #[test]
413 fn test_naive_count_distinct_values() {
414 let array = PrimitiveArray::new(buffer![217u8, 0], Validity::NonNullable);
415 let stats = typed_int_stats::<u8>(&array, true);
416 assert_eq!(stats.distinct_values_count, 2);
417 }
418
419 #[test]
420 fn test_naive_count_distinct_values_nullable() {
421 let array = PrimitiveArray::new(
422 buffer![217u8, 0],
423 Validity::from(BooleanBuffer::from(vec![true, false])),
424 );
425 let stats = typed_int_stats::<u8>(&array, true);
426 assert_eq!(stats.distinct_values_count, 1);
427 }
428
429 #[test]
430 fn test_count_distinct_values() {
431 let array = PrimitiveArray::new((0..128u8).collect::<Buffer<u8>>(), Validity::NonNullable);
432 let stats = typed_int_stats::<u8>(&array, true);
433 assert_eq!(stats.distinct_values_count, 128);
434 }
435
436 #[test]
437 fn test_count_distinct_values_nullable() {
438 let array = PrimitiveArray::new(
439 (0..128u8).collect::<Buffer<u8>>(),
440 Validity::from(BooleanBuffer::from_iter(
441 iter::repeat_n(vec![true, false], 64).flatten(),
442 )),
443 );
444 let stats = typed_int_stats::<u8>(&array, true);
445 assert_eq!(stats.distinct_values_count, 64);
446 }
447
448 #[test]
449 fn test_integer_stats_leading_nulls() {
450 let ints = PrimitiveArray::new(buffer![0, 1, 2], Validity::from_iter([false, true, true]));
451
452 let stats = IntegerStats::generate(&ints);
453
454 assert_eq!(stats.value_count, 2);
455 assert_eq!(stats.null_count, 1);
456 assert_eq!(stats.average_run_length, 1);
457 assert_eq!(stats.distinct_values_count, 2);
458 }
459}