Skip to main content

vortex_compressor/stats/
integer.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4//! Integer compression statistics.
5
6use std::hash::Hash;
7
8use num_traits::PrimInt;
9use rustc_hash::FxBuildHasher;
10use vortex_array::arrays::PrimitiveArray;
11use vortex_array::arrays::primitive::NativeValue;
12use vortex_array::dtype::IntegerPType;
13use vortex_array::expr::stats::Stat;
14use vortex_array::match_each_integer_ptype;
15use vortex_array::scalar::PValue;
16use vortex_array::scalar::Scalar;
17use vortex_buffer::BitBuffer;
18use vortex_error::VortexError;
19use vortex_error::VortexExpect;
20use vortex_error::VortexResult;
21use vortex_mask::AllOr;
22use vortex_utils::aliases::hash_map::HashMap;
23
24use super::GenerateStatsOptions;
25
26/// Information about the distinct values in an integer array.
27#[derive(Debug, Clone)]
28pub struct DistinctInfo<T> {
29    /// The unique values and their occurrences.
30    distinct_values: HashMap<NativeValue<T>, u32, FxBuildHasher>,
31    /// The count of unique values. This _must_ be non-zero.
32    distinct_count: u32,
33    /// The most frequent value.
34    most_frequent_value: T,
35    /// The number of times the most frequent value occurs.
36    top_frequency: u32,
37}
38
39impl<T> DistinctInfo<T> {
40    /// Returns a reference to the distinct values map.
41    pub fn distinct_values(&self) -> &HashMap<NativeValue<T>, u32, FxBuildHasher> {
42        &self.distinct_values
43    }
44}
45
46/// Typed statistics for a specific integer type.
47#[derive(Debug, Clone)]
48pub struct TypedStats<T> {
49    /// The minimum value.
50    min: T,
51    /// The maximum value.
52    max: T,
53    /// Distinct value information, or `None` if not computed.
54    distinct: Option<DistinctInfo<T>>,
55}
56
57impl<T> TypedStats<T> {
58    /// Returns the distinct value information, if computed.
59    pub fn distinct(&self) -> Option<&DistinctInfo<T>> {
60        self.distinct.as_ref()
61    }
62}
63
64impl<T> TypedStats<T> {
65    /// Get the count of distinct values, if we have computed it already.
66    fn distinct_count(&self) -> Option<u32> {
67        Some(self.distinct.as_ref()?.distinct_count)
68    }
69
70    /// Get the most commonly occurring value and its count, if we have computed it already.
71    fn most_frequent_value_and_count(&self) -> Option<(&T, u32)> {
72        let distinct = self.distinct.as_ref()?;
73        Some((&distinct.most_frequent_value, distinct.top_frequency))
74    }
75}
76
77/// Type-erased container for one of the [`TypedStats`] variants.
78///
79/// Building the `TypedStats` is considerably faster and cheaper than building a type-erased
80/// set of stats. We then perform a variety of access methods on them.
81#[derive(Clone, Debug)]
82pub enum ErasedStats {
83    /// Stats for `u8` arrays.
84    U8(TypedStats<u8>),
85    /// Stats for `u16` arrays.
86    U16(TypedStats<u16>),
87    /// Stats for `u32` arrays.
88    U32(TypedStats<u32>),
89    /// Stats for `u64` arrays.
90    U64(TypedStats<u64>),
91    /// Stats for `i8` arrays.
92    I8(TypedStats<i8>),
93    /// Stats for `i16` arrays.
94    I16(TypedStats<i16>),
95    /// Stats for `i32` arrays.
96    I32(TypedStats<i32>),
97    /// Stats for `i64` arrays.
98    I64(TypedStats<i64>),
99}
100
101impl ErasedStats {
102    /// Returns `true` if the minimum value is zero.
103    pub fn min_is_zero(&self) -> bool {
104        match &self {
105            ErasedStats::U8(x) => x.min == 0,
106            ErasedStats::U16(x) => x.min == 0,
107            ErasedStats::U32(x) => x.min == 0,
108            ErasedStats::U64(x) => x.min == 0,
109            ErasedStats::I8(x) => x.min == 0,
110            ErasedStats::I16(x) => x.min == 0,
111            ErasedStats::I32(x) => x.min == 0,
112            ErasedStats::I64(x) => x.min == 0,
113        }
114    }
115
116    /// Returns `true` if the minimum value is negative.
117    pub fn min_is_negative(&self) -> bool {
118        match &self {
119            ErasedStats::U8(_)
120            | ErasedStats::U16(_)
121            | ErasedStats::U32(_)
122            | ErasedStats::U64(_) => false,
123            ErasedStats::I8(x) => x.min < 0,
124            ErasedStats::I16(x) => x.min < 0,
125            ErasedStats::I32(x) => x.min < 0,
126            ErasedStats::I64(x) => x.min < 0,
127        }
128    }
129
130    /// Difference between max and min.
131    pub fn max_minus_min(&self) -> u64 {
132        match &self {
133            ErasedStats::U8(x) => (x.max - x.min) as u64,
134            ErasedStats::U16(x) => (x.max - x.min) as u64,
135            ErasedStats::U32(x) => (x.max - x.min) as u64,
136            ErasedStats::U64(x) => x.max - x.min,
137            ErasedStats::I8(x) => (x.max as i16 - x.min as i16) as u64,
138            ErasedStats::I16(x) => (x.max as i32 - x.min as i32) as u64,
139            ErasedStats::I32(x) => (x.max as i64 - x.min as i64) as u64,
140            ErasedStats::I64(x) => u64::try_from(x.max as i128 - x.min as i128)
141                .vortex_expect("max minus min result bigger than u64"),
142        }
143    }
144
145    /// Returns the ilog2 of the max value when transmuted to unsigned, or `None` if zero.
146    ///
147    /// This matches how BitPacking computes bit width: it reinterprets signed values as
148    /// unsigned (preserving bit pattern) and uses `leading_zeros`. For non-negative signed
149    /// values, the transmuted value equals the original value.
150    ///
151    /// This is used to determine if FOR encoding would reduce bit width compared to
152    /// direct BitPacking. If `max_ilog2() == max_minus_min_ilog2()`, FOR doesn't help.
153    pub fn max_ilog2(&self) -> Option<u32> {
154        match &self {
155            ErasedStats::U8(x) => x.max.checked_ilog2(),
156            ErasedStats::U16(x) => x.max.checked_ilog2(),
157            ErasedStats::U32(x) => x.max.checked_ilog2(),
158            ErasedStats::U64(x) => x.max.checked_ilog2(),
159            // Transmute signed to unsigned (bit pattern preserved) to match BitPacking behavior.
160            ErasedStats::I8(x) => (x.max as u8).checked_ilog2(),
161            ErasedStats::I16(x) => (x.max as u16).checked_ilog2(),
162            ErasedStats::I32(x) => (x.max as u32).checked_ilog2(),
163            ErasedStats::I64(x) => (x.max as u64).checked_ilog2(),
164        }
165    }
166
167    /// Get the count of distinct values, if we have computed it already.
168    pub fn distinct_count(&self) -> Option<u32> {
169        match &self {
170            ErasedStats::U8(x) => x.distinct_count(),
171            ErasedStats::U16(x) => x.distinct_count(),
172            ErasedStats::U32(x) => x.distinct_count(),
173            ErasedStats::U64(x) => x.distinct_count(),
174            ErasedStats::I8(x) => x.distinct_count(),
175            ErasedStats::I16(x) => x.distinct_count(),
176            ErasedStats::I32(x) => x.distinct_count(),
177            ErasedStats::I64(x) => x.distinct_count(),
178        }
179    }
180
181    /// Get the most commonly occurring value and its count.
182    pub fn most_frequent_value_and_count(&self) -> Option<(PValue, u32)> {
183        match &self {
184            ErasedStats::U8(x) => {
185                let (top_value, count) = x.most_frequent_value_and_count()?;
186                Some(((*top_value).into(), count))
187            }
188            ErasedStats::U16(x) => {
189                let (top_value, count) = x.most_frequent_value_and_count()?;
190                Some(((*top_value).into(), count))
191            }
192            ErasedStats::U32(x) => {
193                let (top_value, count) = x.most_frequent_value_and_count()?;
194                Some(((*top_value).into(), count))
195            }
196            ErasedStats::U64(x) => {
197                let (top_value, count) = x.most_frequent_value_and_count()?;
198                Some(((*top_value).into(), count))
199            }
200            ErasedStats::I8(x) => {
201                let (top_value, count) = x.most_frequent_value_and_count()?;
202                Some(((*top_value).into(), count))
203            }
204            ErasedStats::I16(x) => {
205                let (top_value, count) = x.most_frequent_value_and_count()?;
206                Some(((*top_value).into(), count))
207            }
208            ErasedStats::I32(x) => {
209                let (top_value, count) = x.most_frequent_value_and_count()?;
210                Some(((*top_value).into(), count))
211            }
212            ErasedStats::I64(x) => {
213                let (top_value, count) = x.most_frequent_value_and_count()?;
214                Some(((*top_value).into(), count))
215            }
216        }
217    }
218}
219
220/// Implements `From<TypedStats<$T>>` for [`ErasedStats`].
221macro_rules! impl_from_typed {
222    ($T:ty, $variant:path) => {
223        impl From<TypedStats<$T>> for ErasedStats {
224            fn from(typed: TypedStats<$T>) -> Self {
225                $variant(typed)
226            }
227        }
228    };
229}
230
231impl_from_typed!(u8, ErasedStats::U8);
232impl_from_typed!(u16, ErasedStats::U16);
233impl_from_typed!(u32, ErasedStats::U32);
234impl_from_typed!(u64, ErasedStats::U64);
235impl_from_typed!(i8, ErasedStats::I8);
236impl_from_typed!(i16, ErasedStats::I16);
237impl_from_typed!(i32, ErasedStats::I32);
238impl_from_typed!(i64, ErasedStats::I64);
239
240/// Array of integers and relevant stats for compression.
241#[derive(Clone, Debug)]
242pub struct IntegerStats {
243    /// Cache for `validity.false_count()`.
244    null_count: u32,
245    /// Cache for `validity.true_count()`.
246    value_count: u32,
247    /// The average run length.
248    average_run_length: u32,
249    /// Type-erased typed statistics.
250    erased: ErasedStats,
251}
252
253impl IntegerStats {
254    /// Generates stats, returning an error on failure.
255    fn generate_opts_fallible(
256        input: &PrimitiveArray,
257        opts: GenerateStatsOptions,
258    ) -> VortexResult<Self> {
259        match_each_integer_ptype!(input.ptype(), |T| {
260            typed_int_stats::<T>(input, opts.count_distinct_values)
261        })
262    }
263
264    /// Get the count of distinct values, if we have computed it already.
265    pub fn distinct_count(&self) -> Option<u32> {
266        self.erased.distinct_count()
267    }
268
269    /// Get the most commonly occurring value and its count, if we have computed it already.
270    pub fn most_frequent_value_and_count(&self) -> Option<(PValue, u32)> {
271        self.erased.most_frequent_value_and_count()
272    }
273}
274
275impl IntegerStats {
276    /// Generates stats with default options.
277    pub fn generate(input: &PrimitiveArray) -> Self {
278        Self::generate_opts(input, GenerateStatsOptions::default())
279    }
280
281    /// Generates stats with provided options.
282    pub fn generate_opts(input: &PrimitiveArray, opts: GenerateStatsOptions) -> Self {
283        Self::generate_opts_fallible(input, opts)
284            .vortex_expect("IntegerStats::generate_opts should not fail")
285    }
286
287    /// Returns the number of null values.
288    pub fn null_count(&self) -> u32 {
289        self.null_count
290    }
291
292    /// Returns the number of non-null values.
293    pub fn value_count(&self) -> u32 {
294        self.value_count
295    }
296
297    /// Returns the average run length.
298    pub fn average_run_length(&self) -> u32 {
299        self.average_run_length
300    }
301
302    /// Returns the type-erased typed statistics.
303    pub fn erased(&self) -> &ErasedStats {
304        &self.erased
305    }
306}
307
308/// Computes typed integer statistics for a specific integer type.
309fn typed_int_stats<T>(
310    array: &PrimitiveArray,
311    count_distinct_values: bool,
312) -> VortexResult<IntegerStats>
313where
314    T: IntegerPType + PrimInt + for<'a> TryFrom<&'a Scalar, Error = VortexError>,
315    TypedStats<T>: Into<ErasedStats>,
316    NativeValue<T>: Eq + Hash,
317{
318    // Special case: empty array.
319    if array.is_empty() {
320        return Ok(IntegerStats {
321            null_count: 0,
322            value_count: 0,
323            average_run_length: 0,
324            erased: TypedStats {
325                min: T::max_value(),
326                max: T::min_value(),
327                distinct: None,
328            }
329            .into(),
330        });
331    }
332
333    if array.all_invalid()? {
334        return Ok(IntegerStats {
335            null_count: u32::try_from(array.len())?,
336            value_count: 0,
337            average_run_length: 0,
338            erased: TypedStats {
339                min: T::max_value(),
340                max: T::min_value(),
341                distinct: None,
342            }
343            .into(),
344        });
345    }
346
347    let validity = array.validity_mask()?;
348    let null_count = validity.false_count();
349    let value_count = validity.true_count();
350
351    // Initialize loop state.
352    let head_idx = validity
353        .first()
354        .vortex_expect("All null masks have been handled before");
355    let buffer = array.to_buffer::<T>();
356    let head = buffer[head_idx];
357
358    let mut loop_state = LoopState {
359        distinct_values: if count_distinct_values {
360            HashMap::with_capacity_and_hasher(array.len() / 2, FxBuildHasher)
361        } else {
362            HashMap::with_hasher(FxBuildHasher)
363        },
364        prev: head,
365        runs: 1,
366    };
367
368    let sliced = buffer.slice(head_idx..array.len());
369    let mut chunks = sliced.as_slice().chunks_exact(64);
370    match validity.bit_buffer() {
371        AllOr::All => {
372            for chunk in &mut chunks {
373                inner_loop_nonnull(
374                    chunk.try_into().ok().vortex_expect("chunk size must be 64"),
375                    count_distinct_values,
376                    &mut loop_state,
377                )
378            }
379            let remainder = chunks.remainder();
380            inner_loop_naive(
381                remainder,
382                count_distinct_values,
383                &BitBuffer::new_set(remainder.len()),
384                &mut loop_state,
385            );
386        }
387        AllOr::None => unreachable!("All invalid arrays have been handled before"),
388        AllOr::Some(v) => {
389            let mask = v.slice(head_idx..array.len());
390            let mut offset = 0;
391            for chunk in &mut chunks {
392                let validity = mask.slice(offset..(offset + 64));
393                offset += 64;
394
395                match validity.true_count() {
396                    // All nulls -> no stats to update.
397                    0 => continue,
398                    // Inner loop for when validity check can be elided.
399                    64 => inner_loop_nonnull(
400                        chunk.try_into().ok().vortex_expect("chunk size must be 64"),
401                        count_distinct_values,
402                        &mut loop_state,
403                    ),
404                    // Inner loop for when we need to check validity.
405                    _ => inner_loop_nullable(
406                        chunk.try_into().ok().vortex_expect("chunk size must be 64"),
407                        count_distinct_values,
408                        &validity,
409                        &mut loop_state,
410                    ),
411                }
412            }
413            // Final iteration, run naive loop.
414            let remainder = chunks.remainder();
415            inner_loop_naive(
416                remainder,
417                count_distinct_values,
418                &mask.slice(offset..(offset + remainder.len())),
419                &mut loop_state,
420            );
421        }
422    }
423
424    let runs = loop_state.runs;
425
426    let array_ref = array.as_ref();
427    let min = array_ref
428        .statistics()
429        .compute_as::<T>(Stat::Min)
430        .vortex_expect("min should be computed");
431
432    let max = array_ref
433        .statistics()
434        .compute_as::<T>(Stat::Max)
435        .vortex_expect("max should be computed");
436
437    let distinct = count_distinct_values.then(|| {
438        let (&top_value, &top_count) = loop_state
439            .distinct_values
440            .iter()
441            .max_by_key(|&(_, &count)| count)
442            .vortex_expect("we know this is non-empty");
443
444        DistinctInfo {
445            distinct_count: u32::try_from(loop_state.distinct_values.len())
446                .vortex_expect("there are more than `u32::MAX` distinct values"),
447            most_frequent_value: top_value.0,
448            top_frequency: top_count,
449            distinct_values: loop_state.distinct_values,
450        }
451    });
452
453    let typed = TypedStats { min, max, distinct };
454
455    let null_count = u32::try_from(null_count)?;
456    let value_count = u32::try_from(value_count)?;
457
458    Ok(IntegerStats {
459        null_count,
460        value_count,
461        average_run_length: value_count / runs,
462        erased: typed.into(),
463    })
464}
465
466/// Internal loop state for integer stats computation.
467struct LoopState<T> {
468    /// The previous value seen.
469    prev: T,
470    /// The run count.
471    runs: u32,
472    /// The distinct values map.
473    distinct_values: HashMap<NativeValue<T>, u32, FxBuildHasher>,
474}
475
476/// Inner loop for non-null chunks of 64 values.
477#[inline(always)]
478fn inner_loop_nonnull<T: IntegerPType>(
479    values: &[T; 64],
480    count_distinct_values: bool,
481    state: &mut LoopState<T>,
482) where
483    NativeValue<T>: Eq + Hash,
484{
485    for &value in values {
486        if count_distinct_values {
487            *state.distinct_values.entry(NativeValue(value)).or_insert(0) += 1;
488        }
489
490        if value != state.prev {
491            state.prev = value;
492            state.runs += 1;
493        }
494    }
495}
496
497/// Inner loop for nullable chunks of 64 values.
498#[inline(always)]
499fn inner_loop_nullable<T: IntegerPType>(
500    values: &[T; 64],
501    count_distinct_values: bool,
502    is_valid: &BitBuffer,
503    state: &mut LoopState<T>,
504) where
505    NativeValue<T>: Eq + Hash,
506{
507    for (idx, &value) in values.iter().enumerate() {
508        if is_valid.value(idx) {
509            if count_distinct_values {
510                *state.distinct_values.entry(NativeValue(value)).or_insert(0) += 1;
511            }
512
513            if value != state.prev {
514                state.prev = value;
515                state.runs += 1;
516            }
517        }
518    }
519}
520
521/// Fallback inner loop for remainder values.
522#[inline(always)]
523fn inner_loop_naive<T: IntegerPType>(
524    values: &[T],
525    count_distinct_values: bool,
526    is_valid: &BitBuffer,
527    state: &mut LoopState<T>,
528) where
529    NativeValue<T>: Eq + Hash,
530{
531    for (idx, &value) in values.iter().enumerate() {
532        if is_valid.value(idx) {
533            if count_distinct_values {
534                *state.distinct_values.entry(NativeValue(value)).or_insert(0) += 1;
535            }
536
537            if value != state.prev {
538                state.prev = value;
539                state.runs += 1;
540            }
541        }
542    }
543}
544
545#[cfg(test)]
546mod tests {
547    use std::iter;
548
549    use vortex_array::arrays::PrimitiveArray;
550    use vortex_array::validity::Validity;
551    use vortex_buffer::BitBuffer;
552    use vortex_buffer::Buffer;
553    use vortex_buffer::buffer;
554    use vortex_error::VortexResult;
555
556    use super::IntegerStats;
557    use super::typed_int_stats;
558
559    #[test]
560    fn test_naive_count_distinct_values() -> VortexResult<()> {
561        let array = PrimitiveArray::new(buffer![217u8, 0], Validity::NonNullable);
562        let stats = typed_int_stats::<u8>(&array, true)?;
563        assert_eq!(stats.distinct_count().unwrap(), 2);
564        Ok(())
565    }
566
567    #[test]
568    fn test_naive_count_distinct_values_nullable() -> VortexResult<()> {
569        let array = PrimitiveArray::new(
570            buffer![217u8, 0],
571            Validity::from(BitBuffer::from(vec![true, false])),
572        );
573        let stats = typed_int_stats::<u8>(&array, true)?;
574        assert_eq!(stats.distinct_count().unwrap(), 1);
575        Ok(())
576    }
577
578    #[test]
579    fn test_count_distinct_values() -> VortexResult<()> {
580        let array = PrimitiveArray::new((0..128u8).collect::<Buffer<u8>>(), Validity::NonNullable);
581        let stats = typed_int_stats::<u8>(&array, true)?;
582        assert_eq!(stats.distinct_count().unwrap(), 128);
583        Ok(())
584    }
585
586    #[test]
587    fn test_count_distinct_values_nullable() -> VortexResult<()> {
588        let array = PrimitiveArray::new(
589            (0..128u8).collect::<Buffer<u8>>(),
590            Validity::from(BitBuffer::from_iter(
591                iter::repeat_n(vec![true, false], 64).flatten(),
592            )),
593        );
594        let stats = typed_int_stats::<u8>(&array, true)?;
595        assert_eq!(stats.distinct_count().unwrap(), 64);
596        Ok(())
597    }
598
599    #[test]
600    fn test_integer_stats_leading_nulls() {
601        let ints = PrimitiveArray::new(buffer![0, 1, 2], Validity::from_iter([false, true, true]));
602
603        let stats = IntegerStats::generate_opts(
604            &ints,
605            crate::stats::GenerateStatsOptions {
606                count_distinct_values: true,
607            },
608        );
609
610        assert_eq!(stats.value_count, 2);
611        assert_eq!(stats.null_count, 1);
612        assert_eq!(stats.average_run_length, 1);
613        assert_eq!(stats.distinct_count().unwrap(), 2);
614    }
615}