vortex_btrblocks/integer/
stats.rs

1use std::hash::Hash;
2
3use arrow_buffer::BooleanBuffer;
4use num_traits::PrimInt;
5use rustc_hash::FxBuildHasher;
6use vortex_array::ToCanonical;
7use vortex_array::aliases::hash_map::HashMap;
8use vortex_array::arrays::{NativeValue, PrimitiveArray, PrimitiveVTable};
9use vortex_array::stats::Stat;
10use vortex_dtype::{NativePType, match_each_integer_ptype};
11use vortex_error::{VortexError, VortexExpect, VortexUnwrap};
12use vortex_mask::AllOr;
13use vortex_scalar::{PValue, ScalarValue};
14
15use crate::sample::sample;
16use crate::{CompressorStats, GenerateStatsOptions};
17
18#[derive(Clone, Debug)]
19pub struct TypedStats<T> {
20    pub min: T,
21    pub max: T,
22    pub top_value: T,
23    pub top_count: u32,
24    pub distinct_values: HashMap<NativeValue<T>, u32, FxBuildHasher>,
25}
26
27/// Type-erased container for one of the [TypedStats] variants.
28///
29/// Building the `TypedStats` is considerably faster and cheaper than building a type-erased
30/// set of stats. We then perform a variety of access methods on them.
31#[derive(Clone, Debug)]
32pub enum ErasedStats {
33    U8(TypedStats<u8>),
34    U16(TypedStats<u16>),
35    U32(TypedStats<u32>),
36    U64(TypedStats<u64>),
37    I8(TypedStats<i8>),
38    I16(TypedStats<i16>),
39    I32(TypedStats<i32>),
40    I64(TypedStats<i64>),
41}
42
43impl ErasedStats {
44    pub fn min_is_zero(&self) -> bool {
45        match &self {
46            ErasedStats::U8(x) => x.min == 0,
47            ErasedStats::U16(x) => x.min == 0,
48            ErasedStats::U32(x) => x.min == 0,
49            ErasedStats::U64(x) => x.min == 0,
50            ErasedStats::I8(x) => x.min == 0,
51            ErasedStats::I16(x) => x.min == 0,
52            ErasedStats::I32(x) => x.min == 0,
53            ErasedStats::I64(x) => x.min == 0,
54        }
55    }
56
57    pub fn min_is_negative(&self) -> bool {
58        match &self {
59            ErasedStats::U8(_)
60            | ErasedStats::U16(_)
61            | ErasedStats::U32(_)
62            | ErasedStats::U64(_) => false,
63            ErasedStats::I8(x) => x.min < 0,
64            ErasedStats::I16(x) => x.min < 0,
65            ErasedStats::I32(x) => x.min < 0,
66            ErasedStats::I64(x) => x.min < 0,
67        }
68    }
69
70    // Difference between max and min.
71    pub fn max_minus_min(&self) -> u64 {
72        match &self {
73            ErasedStats::U8(x) => (x.max - x.min) as u64,
74            ErasedStats::U16(x) => (x.max - x.min) as u64,
75            ErasedStats::U32(x) => (x.max - x.min) as u64,
76            ErasedStats::U64(x) => x.max - x.min,
77            ErasedStats::I8(x) => (x.max as i16 - x.min as i16) as u64,
78            ErasedStats::I16(x) => (x.max as i32 - x.min as i32) as u64,
79            ErasedStats::I32(x) => (x.max as i64 - x.min as i64) as u64,
80            ErasedStats::I64(x) => u64::try_from(x.max as i128 - x.min as i128)
81                .vortex_expect("max minus min result bigger than u64"),
82        }
83    }
84
85    /// Get the most commonly occurring value and its count
86    pub fn top_value_and_count(&self) -> (PValue, u32) {
87        match &self {
88            ErasedStats::U8(x) => (x.top_value.into(), x.top_count),
89            ErasedStats::U16(x) => (x.top_value.into(), x.top_count),
90            ErasedStats::U32(x) => (x.top_value.into(), x.top_count),
91            ErasedStats::U64(x) => (x.top_value.into(), x.top_count),
92            ErasedStats::I8(x) => (x.top_value.into(), x.top_count),
93            ErasedStats::I16(x) => (x.top_value.into(), x.top_count),
94            ErasedStats::I32(x) => (x.top_value.into(), x.top_count),
95            ErasedStats::I64(x) => (x.top_value.into(), x.top_count),
96        }
97    }
98}
99
100macro_rules! impl_from_typed {
101    ($T:ty, $variant:path) => {
102        impl From<TypedStats<$T>> for ErasedStats {
103            fn from(typed: TypedStats<$T>) -> Self {
104                $variant(typed)
105            }
106        }
107    };
108}
109
110impl_from_typed!(u8, ErasedStats::U8);
111impl_from_typed!(u16, ErasedStats::U16);
112impl_from_typed!(u32, ErasedStats::U32);
113impl_from_typed!(u64, ErasedStats::U64);
114impl_from_typed!(i8, ErasedStats::I8);
115impl_from_typed!(i16, ErasedStats::I16);
116impl_from_typed!(i32, ErasedStats::I32);
117impl_from_typed!(i64, ErasedStats::I64);
118
119#[derive(Clone, Debug)]
120pub struct IntegerStats {
121    pub(super) src: PrimitiveArray,
122    // cache for validity.false_count()
123    pub(super) null_count: u32,
124    // cache for validity.true_count()
125    pub(super) value_count: u32,
126    pub(super) average_run_length: u32,
127    pub(super) distinct_values_count: u32,
128    pub(crate) typed: ErasedStats,
129}
130
131impl CompressorStats for IntegerStats {
132    type ArrayVTable = PrimitiveVTable;
133
134    fn generate_opts(input: &PrimitiveArray, opts: GenerateStatsOptions) -> Self {
135        match_each_integer_ptype!(input.ptype(), |$T| {
136            typed_int_stats::<$T>(input, opts.count_distinct_values)
137        })
138    }
139
140    fn source(&self) -> &PrimitiveArray {
141        &self.src
142    }
143
144    fn sample_opts(&self, sample_size: u32, sample_count: u32, opts: GenerateStatsOptions) -> Self {
145        let sampled = sample(self.src.as_ref(), sample_size, sample_count)
146            .to_primitive()
147            .vortex_expect("primitive");
148
149        Self::generate_opts(&sampled, opts)
150    }
151}
152
153fn typed_int_stats<T>(array: &PrimitiveArray, count_distinct_values: bool) -> IntegerStats
154where
155    T: NativePType + PrimInt + for<'a> TryFrom<&'a ScalarValue, Error = VortexError>,
156    TypedStats<T>: Into<ErasedStats>,
157    NativeValue<T>: Eq + Hash,
158{
159    // Special case: empty array
160    if array.is_empty() {
161        return IntegerStats {
162            src: array.clone(),
163            null_count: 0,
164            value_count: 0,
165            average_run_length: 0,
166            distinct_values_count: 0,
167            typed: TypedStats {
168                min: T::max_value(),
169                max: T::min_value(),
170                top_value: T::default(),
171                top_count: 0,
172                distinct_values: HashMap::with_hasher(FxBuildHasher),
173            }
174            .into(),
175        };
176    } else if array.all_invalid().vortex_expect("all_invalid") {
177        return IntegerStats {
178            src: array.clone(),
179            null_count: array.len().try_into().vortex_expect("null_count"),
180            value_count: 0,
181            average_run_length: 0,
182            distinct_values_count: 0,
183            typed: TypedStats {
184                min: T::max_value(),
185                max: T::min_value(),
186                top_value: T::default(),
187                top_count: 0,
188                distinct_values: HashMap::with_hasher(FxBuildHasher),
189            }
190            .into(),
191        };
192    }
193
194    let validity = array.validity_mask().vortex_expect("logical_validity");
195    let null_count = validity.false_count();
196    let value_count = validity.true_count();
197
198    // Initialize loop state
199    let head_idx = validity
200        .first()
201        .vortex_expect("All null masks have been handled before");
202    let buffer = array.buffer::<T>();
203    let head = buffer[head_idx];
204
205    let mut loop_state = LoopState {
206        distinct_values: if count_distinct_values {
207            HashMap::with_capacity_and_hasher(array.len() / 2, FxBuildHasher)
208        } else {
209            HashMap::with_hasher(FxBuildHasher)
210        },
211        prev: head,
212        runs: 1,
213    };
214
215    let sliced = buffer.slice(head_idx..array.len());
216    let mut chunks = sliced.as_slice().array_chunks::<64>();
217    match validity.boolean_buffer() {
218        AllOr::All => {
219            for chunk in &mut chunks {
220                inner_loop_nonnull(chunk, count_distinct_values, &mut loop_state)
221            }
222            let remainder = chunks.remainder();
223            inner_loop_naive(
224                remainder,
225                count_distinct_values,
226                &BooleanBuffer::new_set(remainder.len()),
227                &mut loop_state,
228            );
229        }
230        AllOr::None => unreachable!("All invalid arrays have been handled before"),
231        AllOr::Some(v) => {
232            let mask = v.slice(head_idx, array.len() - head_idx);
233            let mut offset = 0;
234            for chunk in &mut chunks {
235                let validity = mask.slice(offset, 64);
236                offset += 64;
237
238                match validity.count_set_bits() {
239                    // All nulls -> no stats to update
240                    0 => continue,
241                    // Inner loop for when validity check can be elided
242                    64 => inner_loop_nonnull(chunk, count_distinct_values, &mut loop_state),
243                    // Inner loop for when we need to check validity
244                    _ => inner_loop_nullable(
245                        chunk,
246                        count_distinct_values,
247                        &validity,
248                        &mut loop_state,
249                    ),
250                }
251            }
252            // Final iteration, run naive loop
253            let remainder = chunks.remainder();
254            inner_loop_naive(
255                remainder,
256                count_distinct_values,
257                &mask.slice(offset, remainder.len()),
258                &mut loop_state,
259            );
260        }
261    }
262
263    let (top_value, top_count) = if count_distinct_values {
264        let (&top_value, &top_count) = loop_state
265            .distinct_values
266            .iter()
267            .max_by_key(|&(_, &count)| count)
268            .vortex_expect("non-empty");
269        (top_value.0, top_count)
270    } else {
271        (T::default(), 0)
272    };
273
274    let runs = loop_state.runs;
275    let distinct_values_count = if count_distinct_values {
276        loop_state.distinct_values.len().try_into().vortex_unwrap()
277    } else {
278        u32::MAX
279    };
280
281    let min = array
282        .statistics()
283        .compute_as::<T>(Stat::Min)
284        .vortex_expect("min should be computed");
285
286    let max = array
287        .statistics()
288        .compute_as::<T>(Stat::Max)
289        .vortex_expect("max should be computed");
290
291    let typed = TypedStats {
292        min,
293        max,
294        distinct_values: loop_state.distinct_values,
295        top_value,
296        top_count,
297    };
298
299    let null_count = null_count
300        .try_into()
301        .vortex_expect("null_count must fit in u32");
302    let value_count = value_count
303        .try_into()
304        .vortex_expect("value_count must fit in u32");
305
306    IntegerStats {
307        src: array.clone(),
308        null_count,
309        value_count,
310        average_run_length: value_count / runs,
311        distinct_values_count,
312        typed: typed.into(),
313    }
314}
315
316struct LoopState<T> {
317    prev: T,
318    runs: u32,
319    distinct_values: HashMap<NativeValue<T>, u32, FxBuildHasher>,
320}
321
322#[inline(always)]
323fn inner_loop_nonnull<T: NativePType>(
324    values: &[T; 64],
325    count_distinct_values: bool,
326    state: &mut LoopState<T>,
327) where
328    NativeValue<T>: Eq + Hash,
329{
330    for &value in values {
331        if count_distinct_values {
332            *state.distinct_values.entry(NativeValue(value)).or_insert(0) += 1;
333        }
334
335        if value != state.prev {
336            state.prev = value;
337            state.runs += 1;
338        }
339    }
340}
341
342#[inline(always)]
343fn inner_loop_nullable<T: NativePType>(
344    values: &[T; 64],
345    count_distinct_values: bool,
346    is_valid: &BooleanBuffer,
347    state: &mut LoopState<T>,
348) where
349    NativeValue<T>: Eq + Hash,
350{
351    for (idx, &value) in values.iter().enumerate() {
352        if is_valid.value(idx) {
353            if count_distinct_values {
354                *state.distinct_values.entry(NativeValue(value)).or_insert(0) += 1;
355            }
356
357            if value != state.prev {
358                state.prev = value;
359                state.runs += 1;
360            }
361        }
362    }
363}
364
365#[inline(always)]
366fn inner_loop_naive<T: NativePType>(
367    values: &[T],
368    count_distinct_values: bool,
369    is_valid: &BooleanBuffer,
370    state: &mut LoopState<T>,
371) where
372    NativeValue<T>: Eq + Hash,
373{
374    for (idx, &value) in values.iter().enumerate() {
375        if is_valid.value(idx) {
376            if count_distinct_values {
377                *state.distinct_values.entry(NativeValue(value)).or_insert(0) += 1;
378            }
379
380            if value != state.prev {
381                state.prev = value;
382                state.runs += 1;
383            }
384        }
385    }
386}
387
388#[cfg(test)]
389mod tests {
390    use std::iter;
391
392    use arrow_buffer::BooleanBuffer;
393    use vortex_array::arrays::PrimitiveArray;
394    use vortex_array::validity::Validity;
395    use vortex_buffer::{Buffer, buffer};
396
397    use crate::CompressorStats;
398    use crate::integer::IntegerStats;
399    use crate::integer::stats::typed_int_stats;
400
401    #[test]
402    fn test_naive_count_distinct_values() {
403        let array = PrimitiveArray::new(buffer![217u8, 0], Validity::NonNullable);
404        let stats = typed_int_stats::<u8>(&array, true);
405        assert_eq!(stats.distinct_values_count, 2);
406    }
407
408    #[test]
409    fn test_naive_count_distinct_values_nullable() {
410        let array = PrimitiveArray::new(
411            buffer![217u8, 0],
412            Validity::from(BooleanBuffer::from(vec![true, false])),
413        );
414        let stats = typed_int_stats::<u8>(&array, true);
415        assert_eq!(stats.distinct_values_count, 1);
416    }
417
418    #[test]
419    fn test_count_distinct_values() {
420        let array = PrimitiveArray::new((0..128u8).collect::<Buffer<u8>>(), Validity::NonNullable);
421        let stats = typed_int_stats::<u8>(&array, true);
422        assert_eq!(stats.distinct_values_count, 128);
423    }
424
425    #[test]
426    fn test_count_distinct_values_nullable() {
427        let array = PrimitiveArray::new(
428            (0..128u8).collect::<Buffer<u8>>(),
429            Validity::from(BooleanBuffer::from_iter(
430                iter::repeat_n(vec![true, false], 64).flatten(),
431            )),
432        );
433        let stats = typed_int_stats::<u8>(&array, true);
434        assert_eq!(stats.distinct_values_count, 64);
435    }
436
437    #[test]
438    fn test_integer_stats_leading_nulls() {
439        let ints = PrimitiveArray::new(buffer![0, 1, 2], Validity::from_iter([false, true, true]));
440
441        let stats = IntegerStats::generate(&ints);
442
443        assert_eq!(stats.value_count, 2);
444        assert_eq!(stats.null_count, 1);
445        assert_eq!(stats.average_run_length, 1);
446        assert_eq!(stats.distinct_values_count, 2);
447    }
448}