vortex_array/arrays/varbin/
stats.rs

1use std::cmp::Ordering;
2
3use vortex_error::{VortexResult, vortex_panic};
4
5use crate::Array;
6use crate::accessor::ArrayAccessor;
7use crate::arrays::VarBinEncoding;
8use crate::arrays::varbin::VarBinArray;
9use crate::compute::scalar_at;
10use crate::nbytes::NBytes;
11use crate::stats::{Precision, Stat, StatsSet};
12use crate::vtable::StatisticsVTable;
13
14impl StatisticsVTable<&VarBinArray> for VarBinEncoding {
15    fn compute_statistics(&self, array: &VarBinArray, stat: Stat) -> VortexResult<StatsSet> {
16        compute_varbin_statistics(array, stat)
17    }
18}
19
20pub fn compute_varbin_statistics<T: ArrayAccessor<[u8]> + Array>(
21    array: &T,
22    stat: Stat,
23) -> VortexResult<StatsSet> {
24    if array.is_empty() {
25        return Ok(StatsSet::empty_array());
26    }
27
28    Ok(match stat {
29        Stat::NullCount => {
30            let null_count = array.validity_mask()?.false_count();
31            if null_count == array.len() {
32                return Ok(StatsSet::nulls(array.len()));
33            }
34
35            let mut stats = StatsSet::of(Stat::NullCount, Precision::exact(null_count));
36            if null_count > 0 {
37                // we know that there is at least one null, but not all nulls, so it's not constant
38                stats.set(Stat::IsConstant, Precision::exact(false));
39            }
40            stats
41        }
42        Stat::IsConstant => {
43            let is_constant = array.with_iterator(compute_is_constant)?;
44            if is_constant {
45                // we know that the array is not empty
46                StatsSet::constant(scalar_at(array, 0)?, array.len())
47            } else {
48                StatsSet::of(Stat::IsConstant, Precision::exact(is_constant))
49            }
50        }
51        Stat::IsSorted => {
52            let is_sorted = array.with_iterator(|iter| iter.flatten().is_sorted())?;
53            let mut stats = StatsSet::of(Stat::IsSorted, Precision::exact(is_sorted));
54            if !is_sorted {
55                stats.set(Stat::IsStrictSorted, Precision::exact(false));
56            }
57            stats
58        }
59        Stat::IsStrictSorted => {
60            let is_strict_sorted = array.with_iterator(|iter| {
61                iter.flatten()
62                    .is_sorted_by(|a, b| matches!(a.cmp(b), Ordering::Less))
63            })?;
64            let mut stats = StatsSet::of(Stat::IsStrictSorted, Precision::exact(is_strict_sorted));
65            if is_strict_sorted {
66                stats.set(Stat::IsSorted, Precision::exact(true));
67            }
68            stats
69        }
70        Stat::UncompressedSizeInBytes => StatsSet::of(stat, Precision::exact(array.nbytes())),
71        Stat::Min | Stat::Max => {
72            // Min and max are automatically dispatched to min_max compute function.
73            vortex_panic!(
74                "Unreachable, stat {} should have already been handled",
75                stat
76            )
77        }
78        Stat::Sum => unreachable!("Sum is not supported for VarBinArray"),
79    })
80}
81
82pub(super) fn compute_is_constant(iter: &mut dyn Iterator<Item = Option<&[u8]>>) -> bool {
83    let Some(first_value) = iter.next() else {
84        return true; // empty array is constant
85    };
86    for v in iter {
87        if v != first_value {
88            return false;
89        }
90    }
91    true
92}
93
#[cfg(test)]
mod test {
    use std::ops::Deref;

    use vortex_buffer::{BufferString, ByteBuffer};
    use vortex_dtype::{DType, Nullability};

    use crate::array::Array;
    use crate::arrays::varbin::VarBinArray;

    /// First fixture value; it is a prefix of [`LONG`], so it is the expected minimum.
    const SHORT: &str = "hello world";
    /// Second fixture value and the expected maximum.
    const LONG: &str = "hello world this is a long string";

    /// Builds the two-element fixture array with the given logical type.
    fn fixture(dtype: DType) -> VarBinArray {
        VarBinArray::from_vec(vec![SHORT, LONG], dtype)
    }

    #[test]
    fn utf8_stats() {
        let arr = fixture(DType::Utf8(Nullability::NonNullable));

        let min = arr.statistics().compute_min::<BufferString>().unwrap();
        assert_eq!(min, BufferString::from(SHORT.to_string()));

        let max = arr.statistics().compute_max::<BufferString>().unwrap();
        assert_eq!(max, BufferString::from(LONG.to_string()));

        let constant = arr.statistics().compute_is_constant().unwrap();
        assert!(!constant);

        let sorted = arr.statistics().compute_is_sorted().unwrap();
        assert!(sorted);
    }

    #[test]
    fn binary_stats() {
        let arr = fixture(DType::Binary(Nullability::NonNullable));

        let min = arr.statistics().compute_min::<ByteBuffer>().unwrap();
        assert_eq!(min.deref(), SHORT.as_bytes());

        let max = arr.statistics().compute_max::<ByteBuffer>().unwrap();
        assert_eq!(max.deref(), LONG.as_bytes());

        let constant = arr.statistics().compute_is_constant().unwrap();
        assert!(!constant);

        let sorted = arr.statistics().compute_is_sorted().unwrap();
        assert!(sorted);
    }
}