1use std::hash::Hash;
2
3use arrow_buffer::BooleanBuffer;
4use num_traits::PrimInt;
5use rustc_hash::FxBuildHasher;
6use vortex_array::ToCanonical;
7use vortex_array::aliases::hash_map::HashMap;
8use vortex_array::arrays::{NativeValue, PrimitiveArray, PrimitiveVTable};
9use vortex_array::stats::Stat;
10use vortex_dtype::{NativePType, match_each_integer_ptype};
11use vortex_error::{VortexError, VortexExpect, VortexUnwrap};
12use vortex_mask::AllOr;
13use vortex_scalar::{PValue, ScalarValue};
14
15use crate::sample::sample;
16use crate::{CompressorStats, GenerateStatsOptions};
17
18#[derive(Clone, Debug)]
19pub struct TypedStats<T> {
20 pub min: T,
21 pub max: T,
22 pub top_value: T,
23 pub top_count: u32,
24 pub distinct_values: HashMap<NativeValue<T>, u32, FxBuildHasher>,
25}
26
27#[derive(Clone, Debug)]
32pub enum ErasedStats {
33 U8(TypedStats<u8>),
34 U16(TypedStats<u16>),
35 U32(TypedStats<u32>),
36 U64(TypedStats<u64>),
37 I8(TypedStats<i8>),
38 I16(TypedStats<i16>),
39 I32(TypedStats<i32>),
40 I64(TypedStats<i64>),
41}
42
43impl ErasedStats {
44 pub fn min_is_zero(&self) -> bool {
45 match &self {
46 ErasedStats::U8(x) => x.min == 0,
47 ErasedStats::U16(x) => x.min == 0,
48 ErasedStats::U32(x) => x.min == 0,
49 ErasedStats::U64(x) => x.min == 0,
50 ErasedStats::I8(x) => x.min == 0,
51 ErasedStats::I16(x) => x.min == 0,
52 ErasedStats::I32(x) => x.min == 0,
53 ErasedStats::I64(x) => x.min == 0,
54 }
55 }
56
57 pub fn min_is_negative(&self) -> bool {
58 match &self {
59 ErasedStats::U8(_)
60 | ErasedStats::U16(_)
61 | ErasedStats::U32(_)
62 | ErasedStats::U64(_) => false,
63 ErasedStats::I8(x) => x.min < 0,
64 ErasedStats::I16(x) => x.min < 0,
65 ErasedStats::I32(x) => x.min < 0,
66 ErasedStats::I64(x) => x.min < 0,
67 }
68 }
69
70 pub fn max_minus_min(&self) -> u64 {
72 match &self {
73 ErasedStats::U8(x) => (x.max - x.min) as u64,
74 ErasedStats::U16(x) => (x.max - x.min) as u64,
75 ErasedStats::U32(x) => (x.max - x.min) as u64,
76 ErasedStats::U64(x) => x.max - x.min,
77 ErasedStats::I8(x) => (x.max as i16 - x.min as i16) as u64,
78 ErasedStats::I16(x) => (x.max as i32 - x.min as i32) as u64,
79 ErasedStats::I32(x) => (x.max as i64 - x.min as i64) as u64,
80 ErasedStats::I64(x) => u64::try_from(x.max as i128 - x.min as i128)
81 .vortex_expect("max minus min result bigger than u64"),
82 }
83 }
84
85 pub fn top_value_and_count(&self) -> (PValue, u32) {
87 match &self {
88 ErasedStats::U8(x) => (x.top_value.into(), x.top_count),
89 ErasedStats::U16(x) => (x.top_value.into(), x.top_count),
90 ErasedStats::U32(x) => (x.top_value.into(), x.top_count),
91 ErasedStats::U64(x) => (x.top_value.into(), x.top_count),
92 ErasedStats::I8(x) => (x.top_value.into(), x.top_count),
93 ErasedStats::I16(x) => (x.top_value.into(), x.top_count),
94 ErasedStats::I32(x) => (x.top_value.into(), x.top_count),
95 ErasedStats::I64(x) => (x.top_value.into(), x.top_count),
96 }
97 }
98}
99
100macro_rules! impl_from_typed {
101 ($T:ty, $variant:path) => {
102 impl From<TypedStats<$T>> for ErasedStats {
103 fn from(typed: TypedStats<$T>) -> Self {
104 $variant(typed)
105 }
106 }
107 };
108}
109
110impl_from_typed!(u8, ErasedStats::U8);
111impl_from_typed!(u16, ErasedStats::U16);
112impl_from_typed!(u32, ErasedStats::U32);
113impl_from_typed!(u64, ErasedStats::U64);
114impl_from_typed!(i8, ErasedStats::I8);
115impl_from_typed!(i16, ErasedStats::I16);
116impl_from_typed!(i32, ErasedStats::I32);
117impl_from_typed!(i64, ErasedStats::I64);
118
119#[derive(Clone, Debug)]
120pub struct IntegerStats {
121 pub(super) src: PrimitiveArray,
122 pub(super) null_count: u32,
124 pub(super) value_count: u32,
126 pub(super) average_run_length: u32,
127 pub(super) distinct_values_count: u32,
128 pub(crate) typed: ErasedStats,
129}
130
131impl CompressorStats for IntegerStats {
132 type ArrayVTable = PrimitiveVTable;
133
134 fn generate_opts(input: &PrimitiveArray, opts: GenerateStatsOptions) -> Self {
135 match_each_integer_ptype!(input.ptype(), |$T| {
136 typed_int_stats::<$T>(input, opts.count_distinct_values)
137 })
138 }
139
140 fn source(&self) -> &PrimitiveArray {
141 &self.src
142 }
143
144 fn sample_opts(&self, sample_size: u32, sample_count: u32, opts: GenerateStatsOptions) -> Self {
145 let sampled = sample(self.src.as_ref(), sample_size, sample_count)
146 .to_primitive()
147 .vortex_expect("primitive");
148
149 Self::generate_opts(&sampled, opts)
150 }
151}
152
153fn typed_int_stats<T>(array: &PrimitiveArray, count_distinct_values: bool) -> IntegerStats
154where
155 T: NativePType + PrimInt + for<'a> TryFrom<&'a ScalarValue, Error = VortexError>,
156 TypedStats<T>: Into<ErasedStats>,
157 NativeValue<T>: Eq + Hash,
158{
159 if array.is_empty() {
161 return IntegerStats {
162 src: array.clone(),
163 null_count: 0,
164 value_count: 0,
165 average_run_length: 0,
166 distinct_values_count: 0,
167 typed: TypedStats {
168 min: T::max_value(),
169 max: T::min_value(),
170 top_value: T::default(),
171 top_count: 0,
172 distinct_values: HashMap::with_hasher(FxBuildHasher),
173 }
174 .into(),
175 };
176 } else if array.all_invalid().vortex_expect("all_invalid") {
177 return IntegerStats {
178 src: array.clone(),
179 null_count: array.len().try_into().vortex_expect("null_count"),
180 value_count: 0,
181 average_run_length: 0,
182 distinct_values_count: 0,
183 typed: TypedStats {
184 min: T::max_value(),
185 max: T::min_value(),
186 top_value: T::default(),
187 top_count: 0,
188 distinct_values: HashMap::with_hasher(FxBuildHasher),
189 }
190 .into(),
191 };
192 }
193
194 let validity = array.validity_mask().vortex_expect("logical_validity");
195 let null_count = validity.false_count();
196 let value_count = validity.true_count();
197
198 let head_idx = validity
200 .first()
201 .vortex_expect("All null masks have been handled before");
202 let buffer = array.buffer::<T>();
203 let head = buffer[head_idx];
204
205 let mut loop_state = LoopState {
206 distinct_values: if count_distinct_values {
207 HashMap::with_capacity_and_hasher(array.len() / 2, FxBuildHasher)
208 } else {
209 HashMap::with_hasher(FxBuildHasher)
210 },
211 prev: head,
212 runs: 1,
213 };
214
215 let sliced = buffer.slice(head_idx..array.len());
216 let mut chunks = sliced.as_slice().array_chunks::<64>();
217 match validity.boolean_buffer() {
218 AllOr::All => {
219 for chunk in &mut chunks {
220 inner_loop_nonnull(chunk, count_distinct_values, &mut loop_state)
221 }
222 let remainder = chunks.remainder();
223 inner_loop_naive(
224 remainder,
225 count_distinct_values,
226 &BooleanBuffer::new_set(remainder.len()),
227 &mut loop_state,
228 );
229 }
230 AllOr::None => unreachable!("All invalid arrays have been handled before"),
231 AllOr::Some(v) => {
232 let mask = v.slice(head_idx, array.len() - head_idx);
233 let mut offset = 0;
234 for chunk in &mut chunks {
235 let validity = mask.slice(offset, 64);
236 offset += 64;
237
238 match validity.count_set_bits() {
239 0 => continue,
241 64 => inner_loop_nonnull(chunk, count_distinct_values, &mut loop_state),
243 _ => inner_loop_nullable(
245 chunk,
246 count_distinct_values,
247 &validity,
248 &mut loop_state,
249 ),
250 }
251 }
252 let remainder = chunks.remainder();
254 inner_loop_naive(
255 remainder,
256 count_distinct_values,
257 &mask.slice(offset, remainder.len()),
258 &mut loop_state,
259 );
260 }
261 }
262
263 let (top_value, top_count) = if count_distinct_values {
264 let (&top_value, &top_count) = loop_state
265 .distinct_values
266 .iter()
267 .max_by_key(|&(_, &count)| count)
268 .vortex_expect("non-empty");
269 (top_value.0, top_count)
270 } else {
271 (T::default(), 0)
272 };
273
274 let runs = loop_state.runs;
275 let distinct_values_count = if count_distinct_values {
276 loop_state.distinct_values.len().try_into().vortex_unwrap()
277 } else {
278 u32::MAX
279 };
280
281 let min = array
282 .statistics()
283 .compute_as::<T>(Stat::Min)
284 .vortex_expect("min should be computed");
285
286 let max = array
287 .statistics()
288 .compute_as::<T>(Stat::Max)
289 .vortex_expect("max should be computed");
290
291 let typed = TypedStats {
292 min,
293 max,
294 distinct_values: loop_state.distinct_values,
295 top_value,
296 top_count,
297 };
298
299 let null_count = null_count
300 .try_into()
301 .vortex_expect("null_count must fit in u32");
302 let value_count = value_count
303 .try_into()
304 .vortex_expect("value_count must fit in u32");
305
306 IntegerStats {
307 src: array.clone(),
308 null_count,
309 value_count,
310 average_run_length: value_count / runs,
311 distinct_values_count,
312 typed: typed.into(),
313 }
314}
315
316struct LoopState<T> {
317 prev: T,
318 runs: u32,
319 distinct_values: HashMap<NativeValue<T>, u32, FxBuildHasher>,
320}
321
322#[inline(always)]
323fn inner_loop_nonnull<T: NativePType>(
324 values: &[T; 64],
325 count_distinct_values: bool,
326 state: &mut LoopState<T>,
327) where
328 NativeValue<T>: Eq + Hash,
329{
330 for &value in values {
331 if count_distinct_values {
332 *state.distinct_values.entry(NativeValue(value)).or_insert(0) += 1;
333 }
334
335 if value != state.prev {
336 state.prev = value;
337 state.runs += 1;
338 }
339 }
340}
341
342#[inline(always)]
343fn inner_loop_nullable<T: NativePType>(
344 values: &[T; 64],
345 count_distinct_values: bool,
346 is_valid: &BooleanBuffer,
347 state: &mut LoopState<T>,
348) where
349 NativeValue<T>: Eq + Hash,
350{
351 for (idx, &value) in values.iter().enumerate() {
352 if is_valid.value(idx) {
353 if count_distinct_values {
354 *state.distinct_values.entry(NativeValue(value)).or_insert(0) += 1;
355 }
356
357 if value != state.prev {
358 state.prev = value;
359 state.runs += 1;
360 }
361 }
362 }
363}
364
365#[inline(always)]
366fn inner_loop_naive<T: NativePType>(
367 values: &[T],
368 count_distinct_values: bool,
369 is_valid: &BooleanBuffer,
370 state: &mut LoopState<T>,
371) where
372 NativeValue<T>: Eq + Hash,
373{
374 for (idx, &value) in values.iter().enumerate() {
375 if is_valid.value(idx) {
376 if count_distinct_values {
377 *state.distinct_values.entry(NativeValue(value)).or_insert(0) += 1;
378 }
379
380 if value != state.prev {
381 state.prev = value;
382 state.runs += 1;
383 }
384 }
385 }
386}
387
388#[cfg(test)]
389mod tests {
390 use std::iter;
391
392 use arrow_buffer::BooleanBuffer;
393 use vortex_array::arrays::PrimitiveArray;
394 use vortex_array::validity::Validity;
395 use vortex_buffer::{Buffer, buffer};
396
397 use crate::CompressorStats;
398 use crate::integer::IntegerStats;
399 use crate::integer::stats::typed_int_stats;
400
401 #[test]
402 fn test_naive_count_distinct_values() {
403 let array = PrimitiveArray::new(buffer![217u8, 0], Validity::NonNullable);
404 let stats = typed_int_stats::<u8>(&array, true);
405 assert_eq!(stats.distinct_values_count, 2);
406 }
407
408 #[test]
409 fn test_naive_count_distinct_values_nullable() {
410 let array = PrimitiveArray::new(
411 buffer![217u8, 0],
412 Validity::from(BooleanBuffer::from(vec![true, false])),
413 );
414 let stats = typed_int_stats::<u8>(&array, true);
415 assert_eq!(stats.distinct_values_count, 1);
416 }
417
418 #[test]
419 fn test_count_distinct_values() {
420 let array = PrimitiveArray::new((0..128u8).collect::<Buffer<u8>>(), Validity::NonNullable);
421 let stats = typed_int_stats::<u8>(&array, true);
422 assert_eq!(stats.distinct_values_count, 128);
423 }
424
425 #[test]
426 fn test_count_distinct_values_nullable() {
427 let array = PrimitiveArray::new(
428 (0..128u8).collect::<Buffer<u8>>(),
429 Validity::from(BooleanBuffer::from_iter(
430 iter::repeat_n(vec![true, false], 64).flatten(),
431 )),
432 );
433 let stats = typed_int_stats::<u8>(&array, true);
434 assert_eq!(stats.distinct_values_count, 64);
435 }
436
437 #[test]
438 fn test_integer_stats_leading_nulls() {
439 let ints = PrimitiveArray::new(buffer![0, 1, 2], Validity::from_iter([false, true, true]));
440
441 let stats = IntegerStats::generate(&ints);
442
443 assert_eq!(stats.value_count, 2);
444 assert_eq!(stats.null_count, 1);
445 assert_eq!(stats.average_run_length, 1);
446 assert_eq!(stats.distinct_values_count, 2);
447 }
448}