Skip to main content

vortex_compressor/stats/
integer.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4//! Integer compression statistics.
5
6use std::hash::Hash;
7
8use num_traits::PrimInt;
9use rustc_hash::FxBuildHasher;
10use vortex_array::ExecutionCtx;
11use vortex_array::arrays::PrimitiveArray;
12use vortex_array::arrays::primitive::NativeValue;
13use vortex_array::dtype::IntegerPType;
14use vortex_array::expr::stats::Stat;
15use vortex_array::match_each_integer_ptype;
16use vortex_array::scalar::PValue;
17use vortex_array::scalar::Scalar;
18use vortex_buffer::BitBuffer;
19use vortex_error::VortexError;
20use vortex_error::VortexExpect;
21use vortex_error::VortexResult;
22use vortex_mask::AllOr;
23use vortex_utils::aliases::hash_map::HashMap;
24
25use super::GenerateStatsOptions;
26
27/// Information about the distinct values in an integer array.
28#[derive(Debug, Clone)]
29pub struct DistinctInfo<T> {
30    /// The unique values and their occurrences.
31    distinct_values: HashMap<NativeValue<T>, u32, FxBuildHasher>,
32    /// The count of unique values. This _must_ be non-zero.
33    distinct_count: u32,
34    /// The most frequent value.
35    most_frequent_value: T,
36    /// The number of times the most frequent value occurs.
37    top_frequency: u32,
38}
39
40impl<T> DistinctInfo<T> {
41    /// Returns a reference to the distinct values map.
42    pub fn distinct_values(&self) -> &HashMap<NativeValue<T>, u32, FxBuildHasher> {
43        &self.distinct_values
44    }
45}
46
47/// Typed statistics for a specific integer type.
48#[derive(Debug, Clone)]
49pub struct TypedStats<T> {
50    /// The minimum value.
51    min: T,
52    /// The maximum value.
53    max: T,
54    /// Distinct value information, or `None` if not computed.
55    distinct: Option<DistinctInfo<T>>,
56}
57
58impl<T> TypedStats<T> {
59    /// Returns the distinct value information, if computed.
60    pub fn distinct(&self) -> Option<&DistinctInfo<T>> {
61        self.distinct.as_ref()
62    }
63}
64
65impl<T> TypedStats<T> {
66    /// Get the count of distinct values, if we have computed it already.
67    fn distinct_count(&self) -> Option<u32> {
68        Some(self.distinct.as_ref()?.distinct_count)
69    }
70
71    /// Get the most commonly occurring value and its count, if we have computed it already.
72    fn most_frequent_value_and_count(&self) -> Option<(&T, u32)> {
73        let distinct = self.distinct.as_ref()?;
74        Some((&distinct.most_frequent_value, distinct.top_frequency))
75    }
76}
77
78/// Type-erased container for one of the [`TypedStats`] variants.
79///
80/// Building the `TypedStats` is considerably faster and cheaper than building a type-erased
81/// set of stats. We then perform a variety of access methods on them.
82#[derive(Clone, Debug)]
83pub enum ErasedStats {
84    /// Stats for `u8` arrays.
85    U8(TypedStats<u8>),
86    /// Stats for `u16` arrays.
87    U16(TypedStats<u16>),
88    /// Stats for `u32` arrays.
89    U32(TypedStats<u32>),
90    /// Stats for `u64` arrays.
91    U64(TypedStats<u64>),
92    /// Stats for `i8` arrays.
93    I8(TypedStats<i8>),
94    /// Stats for `i16` arrays.
95    I16(TypedStats<i16>),
96    /// Stats for `i32` arrays.
97    I32(TypedStats<i32>),
98    /// Stats for `i64` arrays.
99    I64(TypedStats<i64>),
100}
101
102impl ErasedStats {
103    /// Returns `true` if the minimum value is zero.
104    pub fn min_is_zero(&self) -> bool {
105        match &self {
106            ErasedStats::U8(x) => x.min == 0,
107            ErasedStats::U16(x) => x.min == 0,
108            ErasedStats::U32(x) => x.min == 0,
109            ErasedStats::U64(x) => x.min == 0,
110            ErasedStats::I8(x) => x.min == 0,
111            ErasedStats::I16(x) => x.min == 0,
112            ErasedStats::I32(x) => x.min == 0,
113            ErasedStats::I64(x) => x.min == 0,
114        }
115    }
116
117    /// Returns `true` if the minimum value is negative.
118    pub fn min_is_negative(&self) -> bool {
119        match &self {
120            ErasedStats::U8(_)
121            | ErasedStats::U16(_)
122            | ErasedStats::U32(_)
123            | ErasedStats::U64(_) => false,
124            ErasedStats::I8(x) => x.min < 0,
125            ErasedStats::I16(x) => x.min < 0,
126            ErasedStats::I32(x) => x.min < 0,
127            ErasedStats::I64(x) => x.min < 0,
128        }
129    }
130
131    /// Difference between max and min.
132    pub fn max_minus_min(&self) -> u64 {
133        match &self {
134            ErasedStats::U8(x) => (x.max - x.min) as u64,
135            ErasedStats::U16(x) => (x.max - x.min) as u64,
136            ErasedStats::U32(x) => (x.max - x.min) as u64,
137            ErasedStats::U64(x) => x.max - x.min,
138            ErasedStats::I8(x) => (x.max as i16 - x.min as i16) as u64,
139            ErasedStats::I16(x) => (x.max as i32 - x.min as i32) as u64,
140            ErasedStats::I32(x) => (x.max as i64 - x.min as i64) as u64,
141            ErasedStats::I64(x) => u64::try_from(x.max as i128 - x.min as i128)
142                .vortex_expect("max minus min result bigger than u64"),
143        }
144    }
145
146    /// Returns the ilog2 of the max value when transmuted to unsigned, or `None` if zero.
147    ///
148    /// This matches how BitPacking computes bit width: it reinterprets signed values as
149    /// unsigned (preserving bit pattern) and uses `leading_zeros`. For non-negative signed
150    /// values, the transmuted value equals the original value.
151    ///
152    /// This is used to determine if FOR encoding would reduce bit width compared to
153    /// direct BitPacking. If `max_ilog2() == max_minus_min_ilog2()`, FOR doesn't help.
154    pub fn max_ilog2(&self) -> Option<u32> {
155        match &self {
156            ErasedStats::U8(x) => x.max.checked_ilog2(),
157            ErasedStats::U16(x) => x.max.checked_ilog2(),
158            ErasedStats::U32(x) => x.max.checked_ilog2(),
159            ErasedStats::U64(x) => x.max.checked_ilog2(),
160            // Transmute signed to unsigned (bit pattern preserved) to match BitPacking behavior.
161            ErasedStats::I8(x) => (x.max as u8).checked_ilog2(),
162            ErasedStats::I16(x) => (x.max as u16).checked_ilog2(),
163            ErasedStats::I32(x) => (x.max as u32).checked_ilog2(),
164            ErasedStats::I64(x) => (x.max as u64).checked_ilog2(),
165        }
166    }
167
168    /// Get the count of distinct values, if we have computed it already.
169    pub fn distinct_count(&self) -> Option<u32> {
170        match &self {
171            ErasedStats::U8(x) => x.distinct_count(),
172            ErasedStats::U16(x) => x.distinct_count(),
173            ErasedStats::U32(x) => x.distinct_count(),
174            ErasedStats::U64(x) => x.distinct_count(),
175            ErasedStats::I8(x) => x.distinct_count(),
176            ErasedStats::I16(x) => x.distinct_count(),
177            ErasedStats::I32(x) => x.distinct_count(),
178            ErasedStats::I64(x) => x.distinct_count(),
179        }
180    }
181
182    /// Get the most commonly occurring value and its count.
183    pub fn most_frequent_value_and_count(&self) -> Option<(PValue, u32)> {
184        match &self {
185            ErasedStats::U8(x) => {
186                let (top_value, count) = x.most_frequent_value_and_count()?;
187                Some(((*top_value).into(), count))
188            }
189            ErasedStats::U16(x) => {
190                let (top_value, count) = x.most_frequent_value_and_count()?;
191                Some(((*top_value).into(), count))
192            }
193            ErasedStats::U32(x) => {
194                let (top_value, count) = x.most_frequent_value_and_count()?;
195                Some(((*top_value).into(), count))
196            }
197            ErasedStats::U64(x) => {
198                let (top_value, count) = x.most_frequent_value_and_count()?;
199                Some(((*top_value).into(), count))
200            }
201            ErasedStats::I8(x) => {
202                let (top_value, count) = x.most_frequent_value_and_count()?;
203                Some(((*top_value).into(), count))
204            }
205            ErasedStats::I16(x) => {
206                let (top_value, count) = x.most_frequent_value_and_count()?;
207                Some(((*top_value).into(), count))
208            }
209            ErasedStats::I32(x) => {
210                let (top_value, count) = x.most_frequent_value_and_count()?;
211                Some(((*top_value).into(), count))
212            }
213            ErasedStats::I64(x) => {
214                let (top_value, count) = x.most_frequent_value_and_count()?;
215                Some(((*top_value).into(), count))
216            }
217        }
218    }
219}
220
221/// Implements `From<TypedStats<$T>>` for [`ErasedStats`].
222macro_rules! impl_from_typed {
223    ($T:ty, $variant:path) => {
224        impl From<TypedStats<$T>> for ErasedStats {
225            fn from(typed: TypedStats<$T>) -> Self {
226                $variant(typed)
227            }
228        }
229    };
230}
231
232impl_from_typed!(u8, ErasedStats::U8);
233impl_from_typed!(u16, ErasedStats::U16);
234impl_from_typed!(u32, ErasedStats::U32);
235impl_from_typed!(u64, ErasedStats::U64);
236impl_from_typed!(i8, ErasedStats::I8);
237impl_from_typed!(i16, ErasedStats::I16);
238impl_from_typed!(i32, ErasedStats::I32);
239impl_from_typed!(i64, ErasedStats::I64);
240
241/// Array of integers and relevant stats for compression.
242#[derive(Clone, Debug)]
243pub struct IntegerStats {
244    /// Cache for `validity.false_count()`.
245    null_count: u32,
246    /// Cache for `validity.true_count()`.
247    value_count: u32,
248    /// The average run length.
249    average_run_length: u32,
250    /// Type-erased typed statistics.
251    erased: ErasedStats,
252}
253
254impl IntegerStats {
255    /// Generates stats, returning an error on failure.
256    fn generate_opts_fallible(
257        input: &PrimitiveArray,
258        opts: GenerateStatsOptions,
259        ctx: &mut ExecutionCtx,
260    ) -> VortexResult<Self> {
261        match_each_integer_ptype!(input.ptype(), |T| {
262            typed_int_stats::<T>(input, opts.count_distinct_values, ctx)
263        })
264    }
265
266    /// Get the count of distinct values, if we have computed it already.
267    pub fn distinct_count(&self) -> Option<u32> {
268        self.erased.distinct_count()
269    }
270
271    /// Get the most commonly occurring value and its count, if we have computed it already.
272    pub fn most_frequent_value_and_count(&self) -> Option<(PValue, u32)> {
273        self.erased.most_frequent_value_and_count()
274    }
275}
276
277impl IntegerStats {
278    /// Generates stats with default options.
279    pub fn generate(input: &PrimitiveArray, ctx: &mut ExecutionCtx) -> Self {
280        Self::generate_opts(input, GenerateStatsOptions::default(), ctx)
281    }
282
283    /// Generates stats with provided options.
284    pub fn generate_opts(
285        input: &PrimitiveArray,
286        opts: GenerateStatsOptions,
287        ctx: &mut ExecutionCtx,
288    ) -> Self {
289        Self::generate_opts_fallible(input, opts, ctx)
290            .vortex_expect("IntegerStats::generate_opts should not fail")
291    }
292
293    /// Returns the number of null values.
294    pub fn null_count(&self) -> u32 {
295        self.null_count
296    }
297
298    /// Returns the number of non-null values.
299    pub fn value_count(&self) -> u32 {
300        self.value_count
301    }
302
303    /// Returns the average run length.
304    pub fn average_run_length(&self) -> u32 {
305        self.average_run_length
306    }
307
308    /// Returns the type-erased typed statistics.
309    pub fn erased(&self) -> &ErasedStats {
310        &self.erased
311    }
312}
313
314/// Computes typed integer statistics for a specific integer type.
315fn typed_int_stats<T>(
316    array: &PrimitiveArray,
317    count_distinct_values: bool,
318    ctx: &mut ExecutionCtx,
319) -> VortexResult<IntegerStats>
320where
321    T: IntegerPType + PrimInt + for<'a> TryFrom<&'a Scalar, Error = VortexError>,
322    TypedStats<T>: Into<ErasedStats>,
323    NativeValue<T>: Eq + Hash,
324{
325    // Special case: empty array.
326    if array.is_empty() {
327        return Ok(IntegerStats {
328            null_count: 0,
329            value_count: 0,
330            average_run_length: 0,
331            erased: TypedStats {
332                min: T::max_value(),
333                max: T::min_value(),
334                distinct: None,
335            }
336            .into(),
337        });
338    }
339
340    if array.all_invalid(ctx)? {
341        return Ok(IntegerStats {
342            null_count: u32::try_from(array.len())?,
343            value_count: 0,
344            average_run_length: 0,
345            erased: TypedStats {
346                min: T::max_value(),
347                max: T::min_value(),
348                distinct: Some(DistinctInfo {
349                    distinct_values: HashMap::with_capacity_and_hasher(0, FxBuildHasher),
350                    distinct_count: 0,
351                    most_frequent_value: T::zero(),
352                    top_frequency: 0,
353                }),
354            }
355            .into(),
356        });
357    }
358
359    let validity = array
360        .as_ref()
361        .validity()?
362        .execute_mask(array.as_ref().len(), ctx)?;
363    let null_count = validity.false_count();
364    let value_count = validity.true_count();
365
366    // Initialize loop state.
367    let head_idx = validity
368        .first()
369        .vortex_expect("All null masks have been handled before");
370    let buffer = array.to_buffer::<T>();
371    let head = buffer[head_idx];
372
373    let mut loop_state = LoopState {
374        distinct_values: if count_distinct_values {
375            HashMap::with_capacity_and_hasher(array.len() / 2, FxBuildHasher)
376        } else {
377            HashMap::with_hasher(FxBuildHasher)
378        },
379        prev: head,
380        runs: 1,
381    };
382
383    let sliced = buffer.slice(head_idx..array.len());
384    let mut chunks = sliced.as_slice().chunks_exact(64);
385    match validity.bit_buffer() {
386        AllOr::All => {
387            for chunk in &mut chunks {
388                inner_loop_nonnull(
389                    chunk.try_into().ok().vortex_expect("chunk size must be 64"),
390                    count_distinct_values,
391                    &mut loop_state,
392                )
393            }
394            let remainder = chunks.remainder();
395            inner_loop_naive(
396                remainder,
397                count_distinct_values,
398                &BitBuffer::new_set(remainder.len()),
399                &mut loop_state,
400            );
401        }
402        AllOr::None => unreachable!("All invalid arrays have been handled before"),
403        AllOr::Some(v) => {
404            let mask = v.slice(head_idx..array.len());
405            let mut offset = 0;
406            for chunk in &mut chunks {
407                let validity = mask.slice(offset..(offset + 64));
408                offset += 64;
409
410                match validity.true_count() {
411                    // All nulls -> no stats to update.
412                    0 => continue,
413                    // Inner loop for when validity check can be elided.
414                    64 => inner_loop_nonnull(
415                        chunk.try_into().ok().vortex_expect("chunk size must be 64"),
416                        count_distinct_values,
417                        &mut loop_state,
418                    ),
419                    // Inner loop for when we need to check validity.
420                    _ => inner_loop_nullable(
421                        chunk.try_into().ok().vortex_expect("chunk size must be 64"),
422                        count_distinct_values,
423                        &validity,
424                        &mut loop_state,
425                    ),
426                }
427            }
428            // Final iteration, run naive loop.
429            let remainder = chunks.remainder();
430            inner_loop_naive(
431                remainder,
432                count_distinct_values,
433                &mask.slice(offset..(offset + remainder.len())),
434                &mut loop_state,
435            );
436        }
437    }
438
439    let runs = loop_state.runs;
440
441    let array_ref = array.as_ref();
442    let min = array_ref
443        .statistics()
444        .compute_as::<T>(Stat::Min, ctx)
445        .vortex_expect("min should be computed");
446
447    let max = array_ref
448        .statistics()
449        .compute_as::<T>(Stat::Max, ctx)
450        .vortex_expect("max should be computed");
451
452    let distinct = count_distinct_values.then(|| {
453        let (&top_value, &top_count) = loop_state
454            .distinct_values
455            .iter()
456            .max_by_key(|&(_, &count)| count)
457            .vortex_expect("we know this is non-empty");
458
459        DistinctInfo {
460            distinct_count: u32::try_from(loop_state.distinct_values.len())
461                .vortex_expect("there are more than `u32::MAX` distinct values"),
462            most_frequent_value: top_value.0,
463            top_frequency: top_count,
464            distinct_values: loop_state.distinct_values,
465        }
466    });
467
468    let typed = TypedStats { min, max, distinct };
469
470    let null_count = u32::try_from(null_count)?;
471    let value_count = u32::try_from(value_count)?;
472
473    Ok(IntegerStats {
474        null_count,
475        value_count,
476        average_run_length: value_count / runs,
477        erased: typed.into(),
478    })
479}
480
481/// Internal loop state for integer stats computation.
482struct LoopState<T> {
483    /// The previous value seen.
484    prev: T,
485    /// The run count.
486    runs: u32,
487    /// The distinct values map.
488    distinct_values: HashMap<NativeValue<T>, u32, FxBuildHasher>,
489}
490
491/// Inner loop for non-null chunks of 64 values.
492#[inline(always)]
493fn inner_loop_nonnull<T: IntegerPType>(
494    values: &[T; 64],
495    count_distinct_values: bool,
496    state: &mut LoopState<T>,
497) where
498    NativeValue<T>: Eq + Hash,
499{
500    for &value in values {
501        if count_distinct_values {
502            *state.distinct_values.entry(NativeValue(value)).or_insert(0) += 1;
503        }
504
505        if value != state.prev {
506            state.prev = value;
507            state.runs += 1;
508        }
509    }
510}
511
512/// Inner loop for nullable chunks of 64 values.
513#[inline(always)]
514fn inner_loop_nullable<T: IntegerPType>(
515    values: &[T; 64],
516    count_distinct_values: bool,
517    is_valid: &BitBuffer,
518    state: &mut LoopState<T>,
519) where
520    NativeValue<T>: Eq + Hash,
521{
522    for (idx, &value) in values.iter().enumerate() {
523        if is_valid.value(idx) {
524            if count_distinct_values {
525                *state.distinct_values.entry(NativeValue(value)).or_insert(0) += 1;
526            }
527
528            if value != state.prev {
529                state.prev = value;
530                state.runs += 1;
531            }
532        }
533    }
534}
535
536/// Fallback inner loop for remainder values.
537#[inline(always)]
538fn inner_loop_naive<T: IntegerPType>(
539    values: &[T],
540    count_distinct_values: bool,
541    is_valid: &BitBuffer,
542    state: &mut LoopState<T>,
543) where
544    NativeValue<T>: Eq + Hash,
545{
546    for (idx, &value) in values.iter().enumerate() {
547        if is_valid.value(idx) {
548            if count_distinct_values {
549                *state.distinct_values.entry(NativeValue(value)).or_insert(0) += 1;
550            }
551
552            if value != state.prev {
553                state.prev = value;
554                state.runs += 1;
555            }
556        }
557    }
558}
559
560#[cfg(test)]
561mod tests {
562    use std::iter;
563
564    use vortex_array::LEGACY_SESSION;
565    use vortex_array::VortexSessionExecute;
566    use vortex_array::arrays::PrimitiveArray;
567    use vortex_array::validity::Validity;
568    use vortex_buffer::BitBuffer;
569    use vortex_buffer::Buffer;
570    use vortex_buffer::buffer;
571    use vortex_error::VortexResult;
572
573    use super::IntegerStats;
574    use super::typed_int_stats;
575
576    #[test]
577    fn test_naive_count_distinct_values() -> VortexResult<()> {
578        let mut ctx = LEGACY_SESSION.create_execution_ctx();
579        let array = PrimitiveArray::new(buffer![217u8, 0], Validity::NonNullable);
580        let stats = typed_int_stats::<u8>(&array, true, &mut ctx)?;
581        assert_eq!(stats.distinct_count().unwrap(), 2);
582        Ok(())
583    }
584
585    #[test]
586    fn test_naive_count_distinct_values_nullable() -> VortexResult<()> {
587        let mut ctx = LEGACY_SESSION.create_execution_ctx();
588        let array = PrimitiveArray::new(
589            buffer![217u8, 0],
590            Validity::from(BitBuffer::from(vec![true, false])),
591        );
592        let stats = typed_int_stats::<u8>(&array, true, &mut ctx)?;
593        assert_eq!(stats.distinct_count().unwrap(), 1);
594        Ok(())
595    }
596
597    #[test]
598    fn test_count_distinct_values() -> VortexResult<()> {
599        let mut ctx = LEGACY_SESSION.create_execution_ctx();
600        let array = PrimitiveArray::new((0..128u8).collect::<Buffer<u8>>(), Validity::NonNullable);
601        let stats = typed_int_stats::<u8>(&array, true, &mut ctx)?;
602        assert_eq!(stats.distinct_count().unwrap(), 128);
603        Ok(())
604    }
605
606    #[test]
607    fn test_count_distinct_values_nullable() -> VortexResult<()> {
608        let mut ctx = LEGACY_SESSION.create_execution_ctx();
609        let array = PrimitiveArray::new(
610            (0..128u8).collect::<Buffer<u8>>(),
611            Validity::from(BitBuffer::from_iter(
612                iter::repeat_n(vec![true, false], 64).flatten(),
613            )),
614        );
615        let stats = typed_int_stats::<u8>(&array, true, &mut ctx)?;
616        assert_eq!(stats.distinct_count().unwrap(), 64);
617        Ok(())
618    }
619
620    #[test]
621    fn test_integer_stats_leading_nulls() {
622        let mut ctx = LEGACY_SESSION.create_execution_ctx();
623        let ints = PrimitiveArray::new(buffer![0, 1, 2], Validity::from_iter([false, true, true]));
624
625        let stats = IntegerStats::generate_opts(
626            &ints,
627            crate::stats::GenerateStatsOptions {
628                count_distinct_values: true,
629            },
630            &mut ctx,
631        );
632
633        assert_eq!(stats.value_count, 2);
634        assert_eq!(stats.null_count, 1);
635        assert_eq!(stats.average_run_length, 1);
636        assert_eq!(stats.distinct_count().unwrap(), 2);
637    }
638}