vortex_btrblocks/integer/
stats.rs

1use std::hash::Hash;
2
3use arrow_buffer::BooleanBuffer;
4use num_traits::PrimInt;
5use rustc_hash::FxBuildHasher;
6use vortex_array::aliases::hash_map::HashMap;
7use vortex_array::arrays::{NativeValue, PrimitiveArray};
8use vortex_array::stats::Stat;
9use vortex_array::variants::PrimitiveArrayTrait;
10use vortex_array::{Array, ToCanonical};
11use vortex_dtype::{NativePType, match_each_integer_ptype};
12use vortex_error::{VortexError, VortexExpect, VortexUnwrap};
13use vortex_mask::AllOr;
14use vortex_scalar::{PValue, ScalarValue};
15
16use crate::sample::sample;
17use crate::{CompressorStats, GenerateStatsOptions};
18
19#[derive(Clone, Debug)]
20pub struct TypedStats<T> {
21    pub min: T,
22    pub max: T,
23    pub top_value: T,
24    pub top_count: u32,
25    pub distinct_values: HashMap<NativeValue<T>, u32, FxBuildHasher>,
26}
27
28/// Type-erased container for one of the [TypedStats] variants.
29///
30/// Building the `TypedStats` is considerably faster and cheaper than building a type-erased
31/// set of stats. We then perform a variety of access methods on them.
32#[derive(Clone, Debug)]
33pub enum ErasedStats {
34    U8(TypedStats<u8>),
35    U16(TypedStats<u16>),
36    U32(TypedStats<u32>),
37    U64(TypedStats<u64>),
38    I8(TypedStats<i8>),
39    I16(TypedStats<i16>),
40    I32(TypedStats<i32>),
41    I64(TypedStats<i64>),
42}
43
44impl ErasedStats {
45    pub fn min_is_zero(&self) -> bool {
46        match &self {
47            ErasedStats::U8(x) => x.min == 0,
48            ErasedStats::U16(x) => x.min == 0,
49            ErasedStats::U32(x) => x.min == 0,
50            ErasedStats::U64(x) => x.min == 0,
51            ErasedStats::I8(x) => x.min == 0,
52            ErasedStats::I16(x) => x.min == 0,
53            ErasedStats::I32(x) => x.min == 0,
54            ErasedStats::I64(x) => x.min == 0,
55        }
56    }
57
58    pub fn min_is_negative(&self) -> bool {
59        match &self {
60            ErasedStats::U8(_)
61            | ErasedStats::U16(_)
62            | ErasedStats::U32(_)
63            | ErasedStats::U64(_) => false,
64            ErasedStats::I8(x) => x.min < 0,
65            ErasedStats::I16(x) => x.min < 0,
66            ErasedStats::I32(x) => x.min < 0,
67            ErasedStats::I64(x) => x.min < 0,
68        }
69    }
70
71    // Difference between max and min.
72    pub fn max_minus_min(&self) -> u64 {
73        match &self {
74            ErasedStats::U8(x) => (x.max - x.min) as u64,
75            ErasedStats::U16(x) => (x.max - x.min) as u64,
76            ErasedStats::U32(x) => (x.max - x.min) as u64,
77            ErasedStats::U64(x) => x.max - x.min,
78            ErasedStats::I8(x) => (x.max as i16 - x.min as i16) as u64,
79            ErasedStats::I16(x) => (x.max as i32 - x.min as i32) as u64,
80            ErasedStats::I32(x) => (x.max as i64 - x.min as i64) as u64,
81            ErasedStats::I64(x) => u64::try_from(x.max as i128 - x.min as i128)
82                .vortex_expect("max minus min result bigger than u64"),
83        }
84    }
85
86    /// Get the most commonly occurring value and its count
87    pub fn top_value_and_count(&self) -> (PValue, u32) {
88        match &self {
89            ErasedStats::U8(x) => (x.top_value.into(), x.top_count),
90            ErasedStats::U16(x) => (x.top_value.into(), x.top_count),
91            ErasedStats::U32(x) => (x.top_value.into(), x.top_count),
92            ErasedStats::U64(x) => (x.top_value.into(), x.top_count),
93            ErasedStats::I8(x) => (x.top_value.into(), x.top_count),
94            ErasedStats::I16(x) => (x.top_value.into(), x.top_count),
95            ErasedStats::I32(x) => (x.top_value.into(), x.top_count),
96            ErasedStats::I64(x) => (x.top_value.into(), x.top_count),
97        }
98    }
99}
100
101macro_rules! impl_from_typed {
102    ($T:ty, $variant:path) => {
103        impl From<TypedStats<$T>> for ErasedStats {
104            fn from(typed: TypedStats<$T>) -> Self {
105                $variant(typed)
106            }
107        }
108    };
109}
110
111impl_from_typed!(u8, ErasedStats::U8);
112impl_from_typed!(u16, ErasedStats::U16);
113impl_from_typed!(u32, ErasedStats::U32);
114impl_from_typed!(u64, ErasedStats::U64);
115impl_from_typed!(i8, ErasedStats::I8);
116impl_from_typed!(i16, ErasedStats::I16);
117impl_from_typed!(i32, ErasedStats::I32);
118impl_from_typed!(i64, ErasedStats::I64);
119
120#[derive(Clone, Debug)]
121pub struct IntegerStats {
122    pub(super) src: PrimitiveArray,
123    // cache for validity.false_count()
124    pub(super) null_count: u32,
125    // cache for validity.true_count()
126    pub(super) value_count: u32,
127    pub(super) average_run_length: u32,
128    pub(super) distinct_values_count: u32,
129    pub(crate) typed: ErasedStats,
130}
131
132impl CompressorStats for IntegerStats {
133    type ArrayType = PrimitiveArray;
134
135    fn generate_opts(input: &PrimitiveArray, opts: GenerateStatsOptions) -> Self {
136        match_each_integer_ptype!(input.ptype(), |$T| {
137            typed_int_stats::<$T>(input, opts.count_distinct_values)
138        })
139    }
140
141    fn source(&self) -> &PrimitiveArray {
142        &self.src
143    }
144
145    fn sample_opts(&self, sample_size: u32, sample_count: u32, opts: GenerateStatsOptions) -> Self {
146        let sampled = sample(self.src.clone(), sample_size, sample_count)
147            .to_primitive()
148            .vortex_expect("primitive");
149
150        Self::generate_opts(&sampled, opts)
151    }
152}
153
154fn typed_int_stats<T>(array: &PrimitiveArray, count_distinct_values: bool) -> IntegerStats
155where
156    T: NativePType + PrimInt + for<'a> TryFrom<&'a ScalarValue, Error = VortexError>,
157    TypedStats<T>: Into<ErasedStats>,
158    NativeValue<T>: Eq + Hash,
159{
160    // Special case: empty array
161    if array.is_empty() {
162        return IntegerStats {
163            src: array.clone(),
164            null_count: 0,
165            value_count: 0,
166            average_run_length: 0,
167            distinct_values_count: 0,
168            typed: TypedStats {
169                min: T::max_value(),
170                max: T::min_value(),
171                top_value: T::default(),
172                top_count: 0,
173                distinct_values: HashMap::with_hasher(FxBuildHasher),
174            }
175            .into(),
176        };
177    } else if array.all_invalid().vortex_expect("all_invalid") {
178        return IntegerStats {
179            src: array.clone(),
180            null_count: array.len().try_into().vortex_expect("null_count"),
181            value_count: 0,
182            average_run_length: 0,
183            distinct_values_count: 0,
184            typed: TypedStats {
185                min: T::max_value(),
186                max: T::min_value(),
187                top_value: T::default(),
188                top_count: 0,
189                distinct_values: HashMap::with_hasher(FxBuildHasher),
190            }
191            .into(),
192        };
193    }
194
195    let validity = array.validity_mask().vortex_expect("logical_validity");
196    let null_count = validity.false_count();
197    let value_count = validity.true_count();
198
199    // Initialize loop state
200    let head_idx = validity
201        .first()
202        .vortex_expect("All null masks have been handled before");
203    let buffer = array.buffer::<T>();
204    let head = buffer[head_idx];
205
206    let mut loop_state = LoopState {
207        distinct_values: if count_distinct_values {
208            HashMap::with_capacity_and_hasher(array.len() / 2, FxBuildHasher)
209        } else {
210            HashMap::with_hasher(FxBuildHasher)
211        },
212        prev: head,
213        runs: 1,
214    };
215
216    let sliced = buffer.slice(head_idx..array.len());
217    let mut chunks = sliced.as_slice().array_chunks::<64>();
218    match validity.boolean_buffer() {
219        AllOr::All => {
220            for chunk in &mut chunks {
221                inner_loop_nonnull(chunk, count_distinct_values, &mut loop_state)
222            }
223            let remainder = chunks.remainder();
224            inner_loop_naive(
225                remainder,
226                count_distinct_values,
227                &BooleanBuffer::new_set(remainder.len()),
228                &mut loop_state,
229            );
230        }
231        AllOr::None => unreachable!("All invalid arrays have been handled before"),
232        AllOr::Some(v) => {
233            let mask = v.slice(head_idx, array.len() - head_idx);
234            let mut offset = 0;
235            for chunk in &mut chunks {
236                let validity = mask.slice(offset, 64);
237                offset += 64;
238
239                match validity.count_set_bits() {
240                    // All nulls -> no stats to update
241                    0 => continue,
242                    // Inner loop for when validity check can be elided
243                    64 => inner_loop_nonnull(chunk, count_distinct_values, &mut loop_state),
244                    // Inner loop for when we need to check validity
245                    _ => inner_loop_nullable(
246                        chunk,
247                        count_distinct_values,
248                        &validity,
249                        &mut loop_state,
250                    ),
251                }
252            }
253            // Final iteration, run naive loop
254            let remainder = chunks.remainder();
255            inner_loop_naive(
256                remainder,
257                count_distinct_values,
258                &mask.slice(offset, remainder.len()),
259                &mut loop_state,
260            );
261        }
262    }
263
264    let (top_value, top_count) = if count_distinct_values {
265        let (&top_value, &top_count) = loop_state
266            .distinct_values
267            .iter()
268            .max_by_key(|&(_, &count)| count)
269            .vortex_expect("non-empty");
270        (top_value.0, top_count)
271    } else {
272        (T::default(), 0)
273    };
274
275    let runs = loop_state.runs;
276    let distinct_values_count = if count_distinct_values {
277        loop_state.distinct_values.len().try_into().vortex_unwrap()
278    } else {
279        u32::MAX
280    };
281
282    let min = array
283        .statistics()
284        .compute_as::<T>(Stat::Min)
285        .vortex_expect("min should be computed");
286
287    let max = array
288        .statistics()
289        .compute_as::<T>(Stat::Max)
290        .vortex_expect("max should be computed");
291
292    let typed = TypedStats {
293        min,
294        max,
295        distinct_values: loop_state.distinct_values,
296        top_value,
297        top_count,
298    };
299
300    let null_count = null_count
301        .try_into()
302        .vortex_expect("null_count must fit in u32");
303    let value_count = value_count
304        .try_into()
305        .vortex_expect("value_count must fit in u32");
306
307    IntegerStats {
308        src: array.clone(),
309        null_count,
310        value_count,
311        average_run_length: value_count / runs,
312        distinct_values_count,
313        typed: typed.into(),
314    }
315}
316
317struct LoopState<T> {
318    prev: T,
319    runs: u32,
320    distinct_values: HashMap<NativeValue<T>, u32, FxBuildHasher>,
321}
322
323#[inline(always)]
324fn inner_loop_nonnull<T: NativePType>(
325    values: &[T; 64],
326    count_distinct_values: bool,
327    state: &mut LoopState<T>,
328) where
329    NativeValue<T>: Eq + Hash,
330{
331    for &value in values {
332        if count_distinct_values {
333            *state.distinct_values.entry(NativeValue(value)).or_insert(0) += 1;
334        }
335
336        if value != state.prev {
337            state.prev = value;
338            state.runs += 1;
339        }
340    }
341}
342
343#[inline(always)]
344fn inner_loop_nullable<T: NativePType>(
345    values: &[T; 64],
346    count_distinct_values: bool,
347    is_valid: &BooleanBuffer,
348    state: &mut LoopState<T>,
349) where
350    NativeValue<T>: Eq + Hash,
351{
352    for (idx, &value) in values.iter().enumerate() {
353        if is_valid.value(idx) {
354            if count_distinct_values {
355                *state.distinct_values.entry(NativeValue(value)).or_insert(0) += 1;
356            }
357
358            if value != state.prev {
359                state.prev = value;
360                state.runs += 1;
361            }
362        }
363    }
364}
365
366#[inline(always)]
367fn inner_loop_naive<T: NativePType>(
368    values: &[T],
369    count_distinct_values: bool,
370    is_valid: &BooleanBuffer,
371    state: &mut LoopState<T>,
372) where
373    NativeValue<T>: Eq + Hash,
374{
375    for (idx, &value) in values.iter().enumerate() {
376        if is_valid.value(idx) {
377            if count_distinct_values {
378                *state.distinct_values.entry(NativeValue(value)).or_insert(0) += 1;
379            }
380
381            if value != state.prev {
382                state.prev = value;
383                state.runs += 1;
384            }
385        }
386    }
387}
388
389#[cfg(test)]
390mod tests {
391    use std::iter;
392
393    use arrow_buffer::BooleanBuffer;
394    use vortex_array::arrays::PrimitiveArray;
395    use vortex_array::validity::Validity;
396    use vortex_buffer::{Buffer, buffer};
397
398    use crate::CompressorStats;
399    use crate::integer::IntegerStats;
400    use crate::integer::stats::typed_int_stats;
401
402    #[test]
403    fn test_naive_count_distinct_values() {
404        let array = PrimitiveArray::new(buffer![217u8, 0], Validity::NonNullable);
405        let stats = typed_int_stats::<u8>(&array, true);
406        assert_eq!(stats.distinct_values_count, 2);
407    }
408
409    #[test]
410    fn test_naive_count_distinct_values_nullable() {
411        let array = PrimitiveArray::new(
412            buffer![217u8, 0],
413            Validity::from(BooleanBuffer::from(vec![true, false])),
414        );
415        let stats = typed_int_stats::<u8>(&array, true);
416        assert_eq!(stats.distinct_values_count, 1);
417    }
418
419    #[test]
420    fn test_count_distinct_values() {
421        let array = PrimitiveArray::new((0..128u8).collect::<Buffer<u8>>(), Validity::NonNullable);
422        let stats = typed_int_stats::<u8>(&array, true);
423        assert_eq!(stats.distinct_values_count, 128);
424    }
425
426    #[test]
427    fn test_count_distinct_values_nullable() {
428        let array = PrimitiveArray::new(
429            (0..128u8).collect::<Buffer<u8>>(),
430            Validity::from(BooleanBuffer::from_iter(
431                iter::repeat_n(vec![true, false], 64).flatten(),
432            )),
433        );
434        let stats = typed_int_stats::<u8>(&array, true);
435        assert_eq!(stats.distinct_values_count, 64);
436    }
437
438    #[test]
439    fn test_integer_stats_leading_nulls() {
440        let ints = PrimitiveArray::new(buffer![0, 1, 2], Validity::from_iter([false, true, true]));
441
442        let stats = IntegerStats::generate(&ints);
443
444        assert_eq!(stats.value_count, 2);
445        assert_eq!(stats.null_count, 1);
446        assert_eq!(stats.average_run_length, 1);
447        assert_eq!(stats.distinct_values_count, 2);
448    }
449}