vortex_array/stats/
mod.rs

1//! Traits and utilities to compute and access array statistics.
2
3use std::fmt::{Debug, Display, Formatter};
4use std::hash::Hash;
5
6use arrow_buffer::bit_iterator::BitIterator;
7use arrow_buffer::{BooleanBufferBuilder, MutableBuffer};
8use enum_iterator::{Sequence, all, last};
9use log::debug;
10use num_enum::{IntoPrimitive, TryFromPrimitive};
11pub use stats_set::*;
12use vortex_dtype::Nullability::{NonNullable, Nullable};
13use vortex_dtype::{DType, PType};
14
15mod array;
16mod bound;
17pub mod flatbuffers;
18mod precision;
19mod stat_bound;
20mod stats_set;
21mod traits;
22
23pub use array::*;
24pub use bound::{LowerBound, UpperBound};
25pub use precision::Precision;
26pub use stat_bound::*;
27pub use traits::*;
28use vortex_error::VortexExpect;
29
30/// Statistics that are used for pruning files (i.e., we want to ensure they are computed when compressing/writing).
31/// Sum is included for boolean arrays.
32pub const PRUNING_STATS: &[Stat] = &[
33    Stat::Min,
34    Stat::Max,
35    Stat::Sum,
36    Stat::NullCount,
37    Stat::NaNCount,
38];
39
40#[derive(
41    Debug,
42    Clone,
43    Copy,
44    PartialEq,
45    Eq,
46    PartialOrd,
47    Ord,
48    Hash,
49    Sequence,
50    IntoPrimitive,
51    TryFromPrimitive,
52)]
53#[repr(u8)]
54pub enum Stat {
55    /// Whether all values are the same (nulls are not equal to other non-null values,
56    /// so this is true iff all values are null or all values are the same non-null value)
57    IsConstant = 0,
58    /// Whether the non-null values in the array are sorted (i.e., we skip nulls)
59    IsSorted = 1,
60    /// Whether the non-null values in the array are strictly sorted (i.e., sorted with no duplicates)
61    IsStrictSorted = 2,
62    /// The maximum value in the array (ignoring nulls, unless all values are null)
63    Max = 3,
64    /// The minimum value in the array (ignoring nulls, unless all values are null)
65    Min = 4,
66    /// The sum of the non-null values of the array.
67    Sum = 5,
68    /// The number of null values in the array
69    NullCount = 6,
70    /// The uncompressed size of the array in bytes
71    UncompressedSizeInBytes = 7,
72    /// The number of NaN values in the array
73    NaNCount = 8,
74}
75
76/// These structs allow the extraction of the bound from the `Precision` value.
77/// They tie together the Stat and the StatBound, which allows the bound to be extracted.
78pub struct Max;
79pub struct Min;
80pub struct Sum;
81pub struct IsConstant;
82pub struct IsSorted;
83pub struct IsStrictSorted;
84pub struct NullCount;
85pub struct UncompressedSizeInBytes;
86pub struct NaNCount;
87
88impl StatType<bool> for IsConstant {
89    type Bound = Precision<bool>;
90
91    const STAT: Stat = Stat::IsConstant;
92}
93
94impl StatType<bool> for IsSorted {
95    type Bound = Precision<bool>;
96
97    const STAT: Stat = Stat::IsSorted;
98}
99
100impl StatType<bool> for IsStrictSorted {
101    type Bound = Precision<bool>;
102
103    const STAT: Stat = Stat::IsStrictSorted;
104}
105
106impl<T: PartialOrd + Clone> StatType<T> for NullCount {
107    type Bound = UpperBound<T>;
108
109    const STAT: Stat = Stat::NullCount;
110}
111
112impl<T: PartialOrd + Clone> StatType<T> for UncompressedSizeInBytes {
113    type Bound = UpperBound<T>;
114
115    const STAT: Stat = Stat::UncompressedSizeInBytes;
116}
117
118impl<T: PartialOrd + Clone + Debug> StatType<T> for Max {
119    type Bound = UpperBound<T>;
120
121    const STAT: Stat = Stat::Max;
122}
123
124impl<T: PartialOrd + Clone + Debug> StatType<T> for Min {
125    type Bound = LowerBound<T>;
126
127    const STAT: Stat = Stat::Min;
128}
129
130impl<T: PartialOrd + Clone + Debug> StatType<T> for Sum {
131    type Bound = Precision<T>;
132
133    const STAT: Stat = Stat::Sum;
134}
135
136impl<T: PartialOrd + Clone> StatType<T> for NaNCount {
137    type Bound = UpperBound<T>;
138
139    const STAT: Stat = Stat::NaNCount;
140}
141
142impl Stat {
143    /// Whether the statistic is commutative (i.e., whether merging can be done independently of ordering)
144    /// e.g., min/max are commutative, but is_sorted is not
145    pub fn is_commutative(&self) -> bool {
146        // NOTE: we prefer this syntax to force a compile error if we add a new stat
147        match self {
148            Self::IsConstant
149            | Self::Max
150            | Self::Min
151            | Self::NullCount
152            | Self::Sum
153            | Self::NaNCount
154            | Self::UncompressedSizeInBytes => true,
155            Self::IsSorted | Self::IsStrictSorted => false,
156        }
157    }
158
159    /// Whether the statistic has the same dtype as the array it's computed on
160    pub fn has_same_dtype_as_array(&self) -> bool {
161        matches!(self, Stat::Min | Stat::Max)
162    }
163
164    /// Return the [`DType`] of the statistic scalar assuming the array is of the given [`DType`].
165    pub fn dtype(&self, data_type: &DType) -> Option<DType> {
166        Some(match self {
167            Self::IsConstant => DType::Bool(NonNullable),
168            Self::IsSorted => DType::Bool(NonNullable),
169            Self::IsStrictSorted => DType::Bool(NonNullable),
170            Self::Max => data_type.clone(),
171            Self::Min => data_type.clone(),
172            Self::NullCount => DType::Primitive(PType::U64, NonNullable),
173            Self::UncompressedSizeInBytes => DType::Primitive(PType::U64, NonNullable),
174            Self::NaNCount => match data_type {
175                DType::Primitive(ptype, ..) if ptype.is_float() => {
176                    DType::Primitive(PType::U64, NonNullable)
177                }
178                // Any other type does not support NaN count
179                _ => return None,
180            },
181            Self::Sum => {
182                // Any array that cannot be summed has a sum DType of null.
183                // Any array that can be summed, but overflows, has a sum _value_ of null.
184                // Therefore, we make integer sum stats nullable.
185                match data_type {
186                    DType::Bool(_) => DType::Primitive(PType::U64, Nullable),
187                    DType::Primitive(ptype, _) => match ptype {
188                        PType::U8 | PType::U16 | PType::U32 | PType::U64 => {
189                            DType::Primitive(PType::U64, Nullable)
190                        }
191                        PType::I8 | PType::I16 | PType::I32 | PType::I64 => {
192                            DType::Primitive(PType::I64, Nullable)
193                        }
194                        PType::F16 | PType::F32 | PType::F64 => {
195                            // Float sums cannot overflow, so it's non-nullable
196                            DType::Primitive(PType::F64, NonNullable)
197                        }
198                    },
199                    DType::Extension(ext_dtype) => self.dtype(ext_dtype.storage_dtype())?,
200                    // Unsupported types
201                    DType::Null
202                    // TODO(aduffy): implement more stats for Decimal
203                    | DType::Decimal(..)
204                    | DType::Utf8(_)
205                    | DType::Binary(_)
206                    | DType::Struct(..)
207                    | DType::List(..) => return None,
208                }
209            }
210        })
211    }
212
213    pub fn name(&self) -> &str {
214        match self {
215            Self::IsConstant => "is_constant",
216            Self::IsSorted => "is_sorted",
217            Self::IsStrictSorted => "is_strict_sorted",
218            Self::Max => "max",
219            Self::Min => "min",
220            Self::NullCount => "null_count",
221            Self::UncompressedSizeInBytes => "uncompressed_size_in_bytes",
222            Self::Sum => "sum",
223            Self::NaNCount => "nan_count",
224        }
225    }
226
227    pub fn all() -> impl Iterator<Item = Stat> {
228        all::<Self>()
229    }
230}
231
232pub fn as_stat_bitset_bytes(stats: &[Stat]) -> Vec<u8> {
233    let max_stat = u8::from(last::<Stat>().vortex_expect("last stat")) as usize + 1;
234    // TODO(ngates): use vortex-buffer::BitBuffer
235    let mut stat_bitset = BooleanBufferBuilder::new_from_buffer(
236        MutableBuffer::from_len_zeroed(max_stat.div_ceil(8)),
237        max_stat,
238    );
239    for stat in stats {
240        stat_bitset.set_bit(u8::from(*stat) as usize, true);
241    }
242
243    stat_bitset
244        .finish()
245        .into_inner()
246        .into_vec()
247        .unwrap_or_else(|b| b.to_vec())
248}
249
250pub fn stats_from_bitset_bytes(bytes: &[u8]) -> Vec<Stat> {
251    BitIterator::new(bytes, 0, bytes.len() * 8)
252        .enumerate()
253        .filter_map(|(i, b)| b.then_some(i))
254        // Filter out indices failing conversion, these are stats written by newer version of library
255        .filter_map(|i| {
256            let Ok(stat) = u8::try_from(i) else {
257                debug!("invalid stat encountered: {i}");
258                return None;
259            };
260            Stat::try_from(stat).ok()
261        })
262        .collect::<Vec<_>>()
263}
264
265impl Display for Stat {
266    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
267        write!(f, "{}", self.name())
268    }
269}
270
271#[cfg(test)]
272mod test {
273    use enum_iterator::all;
274
275    use crate::arrays::PrimitiveArray;
276    use crate::stats::Stat;
277
278    #[test]
279    fn min_of_nulls_is_not_panic() {
280        let min = PrimitiveArray::from_option_iter::<i32, _>([None, None, None, None])
281            .statistics()
282            .compute_as::<i64>(Stat::Min);
283
284        assert_eq!(min, None);
285    }
286
287    #[test]
288    fn has_same_dtype_as_array() {
289        assert!(Stat::Min.has_same_dtype_as_array());
290        assert!(Stat::Max.has_same_dtype_as_array());
291        for stat in all::<Stat>().filter(|s| !matches!(s, Stat::Min | Stat::Max)) {
292            assert!(!stat.has_same_dtype_as_array());
293        }
294    }
295}
vortex_array/stats/mod.rs

vortex_array/stats/
mod.rs