vortex_btrblocks/integer/
stats.rs

1use std::hash::Hash;
2
3use arrow_buffer::BooleanBuffer;
4use num_traits::PrimInt;
5use rustc_hash::FxBuildHasher;
6use vortex_array::aliases::hash_map::HashMap;
7use vortex_array::arrays::PrimitiveArray;
8use vortex_array::variants::PrimitiveArrayTrait;
9use vortex_array::{Array, ToCanonical};
10use vortex_dtype::{NativePType, match_each_integer_ptype};
11use vortex_error::{VortexExpect, VortexUnwrap};
12use vortex_scalar::PValue;
13
14use crate::sample::sample;
15use crate::{CompressorStats, GenerateStatsOptions};
16
17#[derive(Clone, Debug)]
18pub struct TypedStats<T> {
19    pub min: T,
20    pub max: T,
21    pub top_value: T,
22    pub top_count: u32,
23    pub distinct_values: HashMap<T, u32, FxBuildHasher>,
24}
25
26/// Type-erased container for one of the [TypedStats] variants.
27///
28/// Building the `TypedStats` is considerably faster and cheaper than building a type-erased
29/// set of stats. We then perform a variety of access methods on them.
30#[derive(Clone, Debug)]
31pub enum ErasedStats {
32    U8(TypedStats<u8>),
33    U16(TypedStats<u16>),
34    U32(TypedStats<u32>),
35    U64(TypedStats<u64>),
36    I8(TypedStats<i8>),
37    I16(TypedStats<i16>),
38    I32(TypedStats<i32>),
39    I64(TypedStats<i64>),
40}
41
42impl ErasedStats {
43    pub fn min_is_zero(&self) -> bool {
44        match &self {
45            ErasedStats::U8(x) => x.min == 0,
46            ErasedStats::U16(x) => x.min == 0,
47            ErasedStats::U32(x) => x.min == 0,
48            ErasedStats::U64(x) => x.min == 0,
49            ErasedStats::I8(x) => x.min == 0,
50            ErasedStats::I16(x) => x.min == 0,
51            ErasedStats::I32(x) => x.min == 0,
52            ErasedStats::I64(x) => x.min == 0,
53        }
54    }
55
56    pub fn min_is_negative(&self) -> bool {
57        match &self {
58            ErasedStats::U8(_)
59            | ErasedStats::U16(_)
60            | ErasedStats::U32(_)
61            | ErasedStats::U64(_) => false,
62            ErasedStats::I8(x) => x.min < 0,
63            ErasedStats::I16(x) => x.min < 0,
64            ErasedStats::I32(x) => x.min < 0,
65            ErasedStats::I64(x) => x.min < 0,
66        }
67    }
68
69    // Difference between max and min.
70    pub fn max_minus_min(&self) -> u64 {
71        match &self {
72            ErasedStats::U8(x) => (x.max - x.min) as u64,
73            ErasedStats::U16(x) => (x.max - x.min) as u64,
74            ErasedStats::U32(x) => (x.max - x.min) as u64,
75            ErasedStats::U64(x) => x.max - x.min,
76            ErasedStats::I8(x) => (x.max as i16 - x.min as i16) as u64,
77            ErasedStats::I16(x) => (x.max as i32 - x.min as i32) as u64,
78            ErasedStats::I32(x) => (x.max as i64 - x.min as i64) as u64,
79            ErasedStats::I64(x) => u64::try_from(x.max as i128 - x.min as i128)
80                .vortex_expect("max minus min result bigger than u64"),
81        }
82    }
83
84    /// Get the most commonly occurring value and its count
85    pub fn top_value_and_count(&self) -> (PValue, u32) {
86        match &self {
87            ErasedStats::U8(x) => (x.top_value.into(), x.top_count),
88            ErasedStats::U16(x) => (x.top_value.into(), x.top_count),
89            ErasedStats::U32(x) => (x.top_value.into(), x.top_count),
90            ErasedStats::U64(x) => (x.top_value.into(), x.top_count),
91            ErasedStats::I8(x) => (x.top_value.into(), x.top_count),
92            ErasedStats::I16(x) => (x.top_value.into(), x.top_count),
93            ErasedStats::I32(x) => (x.top_value.into(), x.top_count),
94            ErasedStats::I64(x) => (x.top_value.into(), x.top_count),
95        }
96    }
97}
98
99macro_rules! impl_from_typed {
100    ($T:ty, $variant:path) => {
101        impl From<TypedStats<$T>> for ErasedStats {
102            fn from(typed: TypedStats<$T>) -> Self {
103                $variant(typed)
104            }
105        }
106    };
107}
108
109impl_from_typed!(u8, ErasedStats::U8);
110impl_from_typed!(u16, ErasedStats::U16);
111impl_from_typed!(u32, ErasedStats::U32);
112impl_from_typed!(u64, ErasedStats::U64);
113impl_from_typed!(i8, ErasedStats::I8);
114impl_from_typed!(i16, ErasedStats::I16);
115impl_from_typed!(i32, ErasedStats::I32);
116impl_from_typed!(i64, ErasedStats::I64);
117
118#[derive(Clone, Debug)]
119pub struct IntegerStats {
120    pub(super) src: PrimitiveArray,
121    // cache for validity.false_count()
122    pub(super) null_count: u32,
123    // cache for validity.true_count()
124    pub(super) value_count: u32,
125    pub(super) average_run_length: u32,
126    pub(super) distinct_values_count: u32,
127    pub(crate) typed: ErasedStats,
128}
129
130impl CompressorStats for IntegerStats {
131    type ArrayType = PrimitiveArray;
132
133    fn generate_opts(input: &PrimitiveArray, opts: GenerateStatsOptions) -> Self {
134        match_each_integer_ptype!(input.ptype(), |$T| {
135            typed_int_stats::<$T>(input, opts.count_distinct_values)
136        })
137    }
138
139    fn source(&self) -> &PrimitiveArray {
140        &self.src
141    }
142
143    fn sample_opts(&self, sample_size: u16, sample_count: u16, opts: GenerateStatsOptions) -> Self {
144        let sampled = sample(self.src.clone(), sample_size, sample_count)
145            .to_primitive()
146            .vortex_expect("primitive");
147
148        Self::generate_opts(&sampled, opts)
149    }
150}
151
152fn typed_int_stats<T: NativePType + Hash + PrimInt>(
153    array: &PrimitiveArray,
154    count_distinct_values: bool,
155) -> IntegerStats
156where
157    TypedStats<T>: Into<ErasedStats>,
158{
159    // Special case: empty array
160    if array.is_empty() {
161        return IntegerStats {
162            src: array.clone(),
163            null_count: 0,
164            value_count: 0,
165            average_run_length: 0,
166            distinct_values_count: 0,
167            typed: TypedStats {
168                min: T::max_value(),
169                max: T::min_value(),
170                top_value: T::default(),
171                top_count: 0,
172                distinct_values: HashMap::with_hasher(FxBuildHasher),
173            }
174            .into(),
175        };
176    } else if array.all_invalid().vortex_expect("all_invalid") {
177        return IntegerStats {
178            src: array.clone(),
179            null_count: array.len().try_into().vortex_expect("null_count"),
180            value_count: 0,
181            average_run_length: 0,
182            distinct_values_count: 0,
183            typed: TypedStats {
184                min: T::max_value(),
185                max: T::min_value(),
186                top_value: T::default(),
187                top_count: 0,
188                distinct_values: HashMap::with_hasher(FxBuildHasher),
189            }
190            .into(),
191        };
192    }
193
194    let validity = array.validity_mask().vortex_expect("logical_validity");
195    let null_count = validity.false_count();
196    let value_count = validity.true_count();
197
198    // Initialize loop state
199    let head = array.as_slice::<T>()[0];
200    let mut loop_state = LoopState {
201        min: head,
202        max: head,
203        distinct_values: if count_distinct_values {
204            HashMap::with_capacity_and_hasher(array.len() / 2, FxBuildHasher)
205        } else {
206            HashMap::with_hasher(FxBuildHasher)
207        },
208        distinct_values_count: if count_distinct_values { 0 } else { u32::MAX },
209        prev: head,
210        runs: 1,
211    };
212
213    let values = array.buffer::<T>();
214    let mask = validity.to_boolean_buffer();
215
216    let mut offset = 0;
217    for chunk in values.as_slice().chunks(64) {
218        let validity = mask.slice(offset, chunk.len());
219        offset += chunk.len();
220
221        if chunk.len() < 64 {
222            // Final iteration, run naive loop
223            inner_loop_naive(chunk, count_distinct_values, &validity, &mut loop_state);
224            break;
225        }
226
227        let set_bits = validity.count_set_bits();
228
229        match set_bits {
230            // All nulls -> no stats to update
231            0 => continue,
232            // Inner loop for when validity check can be elided
233            64 => inner_loop_nonnull(
234                chunk.try_into().vortex_unwrap(),
235                count_distinct_values,
236                &mut loop_state,
237            ),
238            // Inner loop for when we need to check validity
239            _ => inner_loop_nullable(
240                chunk.try_into().vortex_unwrap(),
241                count_distinct_values,
242                &validity,
243                &mut loop_state,
244            ),
245        }
246    }
247
248    let (top_value, top_count) = if count_distinct_values {
249        let (&top_value, &top_count) = loop_state
250            .distinct_values
251            .iter()
252            .max_by_key(|&(_, &count)| count)
253            .vortex_expect("non-empty");
254        (top_value, top_count)
255    } else {
256        (T::default(), 0)
257    };
258
259    let runs = loop_state.runs;
260    let distinct_values_count = loop_state.distinct_values_count;
261
262    let typed = TypedStats {
263        min: loop_state.min,
264        max: loop_state.max,
265        distinct_values: loop_state.distinct_values,
266        top_value,
267        top_count,
268    };
269
270    let null_count = null_count
271        .try_into()
272        .vortex_expect("null_count must fit in u32");
273    let value_count = value_count
274        .try_into()
275        .vortex_expect("value_count must fit in u32");
276
277    IntegerStats {
278        src: array.clone(),
279        null_count,
280        value_count,
281        average_run_length: value_count / runs,
282        distinct_values_count,
283        typed: typed.into(),
284    }
285}
286
287struct LoopState<T> {
288    min: T,
289    max: T,
290    prev: T,
291    runs: u32,
292    distinct_values_count: u32,
293    distinct_values: HashMap<T, u32, FxBuildHasher>,
294}
295
296#[inline(always)]
297fn inner_loop_nonnull<T: PrimInt + Hash>(
298    values: &[T; 64],
299    count_distinct_values: bool,
300    state: &mut LoopState<T>,
301) {
302    for &value in values {
303        state.min = state.min.min(value);
304        state.max = state.max.max(value);
305
306        if count_distinct_values {
307            *state.distinct_values.entry(value).or_insert(0) += 1;
308            state.distinct_values_count = state.distinct_values.len().try_into().vortex_unwrap();
309        }
310
311        if value != state.prev {
312            state.prev = value;
313            state.runs += 1;
314        }
315    }
316}
317
318#[inline(always)]
319fn inner_loop_nullable<T: PrimInt + Hash>(
320    values: &[T; 64],
321    count_distinct_values: bool,
322    is_valid: &BooleanBuffer,
323    state: &mut LoopState<T>,
324) {
325    for (idx, &value) in values.iter().enumerate() {
326        if is_valid.value(idx) {
327            state.min = state.min.min(value);
328            state.max = state.max.max(value);
329
330            if count_distinct_values {
331                *state.distinct_values.entry(value).or_insert(0) += 1;
332                state.distinct_values_count =
333                    state.distinct_values.len().try_into().vortex_unwrap();
334            }
335
336            if value != state.prev {
337                state.prev = value;
338                state.runs += 1;
339            }
340        }
341    }
342}
343
344#[inline(always)]
345fn inner_loop_naive<T: PrimInt + Hash>(
346    values: &[T],
347    count_distinct_values: bool,
348    is_valid: &BooleanBuffer,
349    state: &mut LoopState<T>,
350) {
351    for (idx, &value) in values.iter().enumerate() {
352        if is_valid.value(idx) {
353            state.min = state.min.min(value);
354            state.max = state.max.max(value);
355
356            if count_distinct_values {
357                *state.distinct_values.entry(value).or_insert(0) += 1;
358                state.distinct_values_count =
359                    state.distinct_values.len().try_into().vortex_unwrap();
360            }
361
362            if value != state.prev {
363                state.prev = value;
364                state.runs += 1;
365            }
366        }
367    }
368}
369
370#[cfg(test)]
371mod tests {
372    use std::iter;
373
374    use arrow_buffer::BooleanBuffer;
375    use vortex_array::arrays::PrimitiveArray;
376    use vortex_array::validity::Validity;
377    use vortex_buffer::{Buffer, buffer};
378
379    use crate::integer::stats::typed_int_stats;
380
381    #[test]
382    fn test_naive_count_distinct_values() {
383        let array = PrimitiveArray::new(buffer![217u8, 0], Validity::NonNullable);
384        let stats = typed_int_stats::<u8>(&array, true);
385        assert_eq!(stats.distinct_values_count, 2);
386    }
387
388    #[test]
389    fn test_naive_count_distinct_values_nullable() {
390        let array = PrimitiveArray::new(
391            buffer![217u8, 0],
392            Validity::from(BooleanBuffer::from(vec![true, false])),
393        );
394        let stats = typed_int_stats::<u8>(&array, true);
395        assert_eq!(stats.distinct_values_count, 1);
396    }
397
398    #[test]
399    fn test_count_distinct_values() {
400        let array = PrimitiveArray::new((0..128u8).collect::<Buffer<u8>>(), Validity::NonNullable);
401        let stats = typed_int_stats::<u8>(&array, true);
402        assert_eq!(stats.distinct_values_count, 128);
403    }
404
405    #[test]
406    fn test_count_distinct_values_nullable() {
407        let array = PrimitiveArray::new(
408            (0..128u8).collect::<Buffer<u8>>(),
409            Validity::from(BooleanBuffer::from_iter(
410                iter::repeat_n(vec![true, false], 64).flatten(),
411            )),
412        );
413        let stats = typed_int_stats::<u8>(&array, true);
414        assert_eq!(stats.distinct_values_count, 64);
415    }
416}