vortex_array/stats/
mod.rs

1//! Traits and utilities to compute and access array statistics.
2
3use std::fmt::{Debug, Display, Formatter};
4use std::hash::Hash;
5
6use arrow_buffer::bit_iterator::BitIterator;
7use arrow_buffer::{BooleanBufferBuilder, MutableBuffer};
8use enum_iterator::{Sequence, last};
9use log::debug;
10use num_enum::{IntoPrimitive, TryFromPrimitive};
11pub use stats_set::*;
12use vortex_dtype::Nullability::{NonNullable, Nullable};
13use vortex_dtype::{DType, PType};
14
15mod array;
16mod bound;
17pub mod flatbuffers;
18mod precision;
19mod stat_bound;
20mod stats_set;
21mod traits;
22
23pub use array::*;
24pub use bound::{LowerBound, UpperBound};
25pub use precision::Precision;
26pub use stat_bound::*;
27pub use traits::*;
28use vortex_error::VortexExpect;
29
30/// Statistics that are used for pruning files (i.e., we want to ensure they are computed when compressing/writing).
31/// Sum is included for boolean arrays.
32pub const PRUNING_STATS: &[Stat] = &[Stat::Min, Stat::Max, Stat::Sum, Stat::NullCount];
33
34/// Stats to keep when serializing arrays to layouts
35pub const STATS_TO_WRITE: &[Stat] = &[
36    Stat::Min,
37    Stat::Max,
38    Stat::NullCount,
39    Stat::Sum,
40    Stat::IsConstant,
41    Stat::IsSorted,
42    Stat::IsStrictSorted,
43    Stat::UncompressedSizeInBytes,
44];
45
46#[derive(
47    Debug,
48    Clone,
49    Copy,
50    PartialEq,
51    Eq,
52    PartialOrd,
53    Ord,
54    Hash,
55    Sequence,
56    IntoPrimitive,
57    TryFromPrimitive,
58)]
59#[repr(u8)]
60pub enum Stat {
61    /// Whether all values are the same (nulls are not equal to other non-null values,
62    /// so this is true iff all values are null or all values are the same non-null value)
63    IsConstant = 0,
64    /// Whether the non-null values in the array are sorted (i.e., we skip nulls)
65    IsSorted = 1,
66    /// Whether the non-null values in the array are strictly sorted (i.e., sorted with no duplicates)
67    IsStrictSorted = 2,
68    /// The maximum value in the array (ignoring nulls, unless all values are null)
69    Max = 3,
70    /// The minimum value in the array (ignoring nulls, unless all values are null)
71    Min = 4,
72    /// The sum of the non-null values of the array.
73    Sum = 5,
74    /// The number of null values in the array
75    NullCount = 6,
76    /// The uncompressed size of the array in bytes
77    UncompressedSizeInBytes = 7,
78}
79
80/// These structs allow the extraction of the bound from the `Precision` value.
81/// They tie together the Stat and the StatBound, which allows the bound to be extracted.
82pub struct Max;
83pub struct Min;
84pub struct Sum;
85pub struct IsConstant;
86pub struct IsSorted;
87pub struct IsStrictSorted;
88pub struct NullCount;
89pub struct UncompressedSizeInBytes;
90
91impl StatType<bool> for IsConstant {
92    type Bound = Precision<bool>;
93
94    const STAT: Stat = Stat::IsConstant;
95}
96
97impl StatType<bool> for IsSorted {
98    type Bound = Precision<bool>;
99
100    const STAT: Stat = Stat::IsSorted;
101}
102
103impl StatType<bool> for IsStrictSorted {
104    type Bound = Precision<bool>;
105
106    const STAT: Stat = Stat::IsStrictSorted;
107}
108
109impl<T: PartialOrd + Clone> StatType<T> for NullCount {
110    type Bound = UpperBound<T>;
111
112    const STAT: Stat = Stat::NullCount;
113}
114
115impl<T: PartialOrd + Clone> StatType<T> for UncompressedSizeInBytes {
116    type Bound = UpperBound<T>;
117
118    const STAT: Stat = Stat::UncompressedSizeInBytes;
119}
120
121impl<T: PartialOrd + Clone + Debug> StatType<T> for Max {
122    type Bound = UpperBound<T>;
123
124    const STAT: Stat = Stat::Max;
125}
126
127impl<T: PartialOrd + Clone + Debug> StatType<T> for Min {
128    type Bound = LowerBound<T>;
129
130    const STAT: Stat = Stat::Min;
131}
132
133impl<T: PartialOrd + Clone + Debug> StatType<T> for Sum {
134    type Bound = Precision<T>;
135
136    const STAT: Stat = Stat::Sum;
137}
138
139impl Stat {
140    /// Whether the statistic is commutative (i.e., whether merging can be done independently of ordering)
141    /// e.g., min/max are commutative, but is_sorted is not
142    pub fn is_commutative(&self) -> bool {
143        // NOTE: we prefer this syntax to force a compile error if we add a new stat
144        match self {
145            Stat::IsConstant
146            | Stat::Max
147            | Stat::Min
148            | Stat::NullCount
149            | Stat::Sum
150            | Stat::UncompressedSizeInBytes => true,
151            Stat::IsSorted | Stat::IsStrictSorted => false,
152        }
153    }
154
155    /// Whether the statistic has the same dtype as the array it's computed on
156    pub fn has_same_dtype_as_array(&self) -> bool {
157        matches!(self, Stat::Min | Stat::Max)
158    }
159
160    pub fn dtype(&self, data_type: &DType) -> Option<DType> {
161        Some(match self {
162            Stat::IsConstant => DType::Bool(NonNullable),
163            Stat::IsSorted => DType::Bool(NonNullable),
164            Stat::IsStrictSorted => DType::Bool(NonNullable),
165            Stat::Max => data_type.clone(),
166            Stat::Min => data_type.clone(),
167            Stat::NullCount => DType::Primitive(PType::U64, NonNullable),
168            Stat::UncompressedSizeInBytes => DType::Primitive(PType::U64, NonNullable),
169            Stat::Sum => {
170                // Any array that cannot be summed has a sum DType of null.
171                // Any array that can be summed, but overflows, has a sum _value_ of null.
172                // Therefore, we make integer sum stats nullable.
173                match data_type {
174                    DType::Bool(_) => DType::Primitive(PType::U64, Nullable),
175                    DType::Primitive(ptype, _) => match ptype {
176                        PType::U8 | PType::U16 | PType::U32 | PType::U64 => {
177                            DType::Primitive(PType::U64, Nullable)
178                        }
179                        PType::I8 | PType::I16 | PType::I32 | PType::I64 => {
180                            DType::Primitive(PType::I64, Nullable)
181                        }
182                        PType::F16 | PType::F32 | PType::F64 => {
183                            // Float sums cannot overflow, so it's non-nullable
184                            DType::Primitive(PType::F64, NonNullable)
185                        }
186                    },
187                    DType::Extension(ext_dtype) => self.dtype(ext_dtype.storage_dtype())?,
188                    // Unsupported types
189                    DType::Null
190                    // TODO(aduffy): implement more stats for Decimal
191                    | DType::Decimal(..)
192                    | DType::Utf8(_)
193                    | DType::Binary(_)
194                    | DType::Struct(..)
195                    | DType::List(..) => return None,
196                }
197            }
198        })
199    }
200
201    pub fn name(&self) -> &str {
202        match self {
203            Self::IsConstant => "is_constant",
204            Self::IsSorted => "is_sorted",
205            Self::IsStrictSorted => "is_strict_sorted",
206            Self::Max => "max",
207            Self::Min => "min",
208            Self::NullCount => "null_count",
209            Self::UncompressedSizeInBytes => "uncompressed_size_in_bytes",
210            Stat::Sum => "sum",
211        }
212    }
213}
214
215pub fn as_stat_bitset_bytes(stats: &[Stat]) -> Vec<u8> {
216    let max_stat = u8::from(last::<Stat>().vortex_expect("last stat")) as usize;
217    // TODO(ngates): use vortex-buffer::BitBuffer
218    let mut stat_bitset = BooleanBufferBuilder::new_from_buffer(
219        MutableBuffer::from_len_zeroed(max_stat.div_ceil(8)),
220        max_stat,
221    );
222    for stat in stats {
223        stat_bitset.set_bit(u8::from(*stat) as usize, true);
224    }
225
226    stat_bitset
227        .finish()
228        .into_inner()
229        .into_vec()
230        .unwrap_or_else(|b| b.to_vec())
231}
232
233pub fn stats_from_bitset_bytes(bytes: &[u8]) -> Vec<Stat> {
234    BitIterator::new(bytes, 0, bytes.len() * 8)
235        .enumerate()
236        .filter_map(|(i, b)| b.then_some(i))
237        // Filter out indices failing conversion, these are stats written by newer version of library
238        .filter_map(|i| {
239            let Ok(stat) = u8::try_from(i) else {
240                debug!("invalid stat encountered: {i}");
241                return None;
242            };
243            Stat::try_from(stat).ok()
244        })
245        .collect::<Vec<_>>()
246}
247
248impl Display for Stat {
249    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
250        write!(f, "{}", self.name())
251    }
252}
253
254#[cfg(test)]
255mod test {
256    use enum_iterator::all;
257
258    use crate::array::Array;
259    use crate::arrays::PrimitiveArray;
260    use crate::stats::Stat;
261
262    #[test]
263    fn min_of_nulls_is_not_panic() {
264        let min = PrimitiveArray::from_option_iter::<i32, _>([None, None, None, None])
265            .statistics()
266            .compute_as::<i64>(Stat::Min);
267
268        assert_eq!(min, None);
269    }
270
271    #[test]
272    fn has_same_dtype_as_array() {
273        assert!(Stat::Min.has_same_dtype_as_array());
274        assert!(Stat::Max.has_same_dtype_as_array());
275        for stat in all::<Stat>().filter(|s| !matches!(s, Stat::Min | Stat::Max)) {
276            assert!(!stat.has_same_dtype_as_array());
277        }
278    }
279}