vortex_array/arrays/bool/
stats.rs

1use std::ops::BitAnd;
2
3use arrow_buffer::BooleanBuffer;
4use itertools::Itertools;
5use vortex_error::VortexResult;
6use vortex_mask::Mask;
7
8use crate::Array;
9use crate::arrays::{BoolArray, BoolEncoding};
10use crate::stats::{Precision, Stat, StatsSet};
11use crate::vtable::StatisticsVTable;
12
13impl StatisticsVTable<&BoolArray> for BoolEncoding {
14    fn compute_statistics(&self, array: &BoolArray, stat: Stat) -> VortexResult<StatsSet> {
15        if array.is_empty() {
16            return Ok(StatsSet::new_unchecked(vec![
17                (Stat::Sum, Precision::exact(0)),
18                (Stat::NullCount, Precision::exact(0)),
19            ]));
20        }
21
22        match array.validity_mask()? {
23            Mask::AllTrue(_) => self.compute_statistics(array.boolean_buffer(), stat),
24            Mask::AllFalse(v) => Ok(StatsSet::nulls(v)),
25            Mask::Values(values) => self.compute_statistics(
26                &NullableBools(array.boolean_buffer(), values.boolean_buffer()),
27                stat,
28            ),
29        }
30    }
31}
32
33struct NullableBools<'a>(&'a BooleanBuffer, &'a BooleanBuffer);
34
35impl StatisticsVTable<&NullableBools<'_>> for BoolEncoding {
36    fn compute_statistics(&self, array: &NullableBools<'_>, stat: Stat) -> VortexResult<StatsSet> {
37        // Fast-path if we just want the true-count
38        if matches!(
39            stat,
40            Stat::Sum | Stat::Min | Stat::Max | Stat::IsConstant | Stat::NullCount
41        ) {
42            return Ok(StatsSet::bools_with_sum_and_null_count(
43                array.0.bitand(array.1).count_set_bits(),
44                array.1.len() - array.1.count_set_bits(),
45                array.0.len(),
46            ));
47        }
48
49        let first_non_null_idx = array
50            .1
51            .iter()
52            .enumerate()
53            .skip_while(|(_, valid)| !*valid)
54            .map(|(idx, _)| idx)
55            .next();
56
57        if let Some(first_non_null) = first_non_null_idx {
58            let mut acc = BoolStatsAccumulator::new(array.0.value(first_non_null));
59            acc.n_nulls(first_non_null);
60            array
61                .0
62                .iter()
63                .zip_eq(array.1.iter())
64                .skip(first_non_null + 1)
65                .map(|(next, valid)| valid.then_some(next))
66                .for_each(|next| acc.nullable_next(next));
67            Ok(acc.finish())
68        } else {
69            Ok(StatsSet::nulls(array.0.len()))
70        }
71    }
72}
73
74impl StatisticsVTable<&BooleanBuffer> for BoolEncoding {
75    fn compute_statistics(&self, buffer: &BooleanBuffer, stat: Stat) -> VortexResult<StatsSet> {
76        // Fast-path if we just want the true-count
77        if matches!(
78            stat,
79            Stat::Sum | Stat::Min | Stat::Max | Stat::IsConstant | Stat::NullCount
80        ) {
81            return Ok(StatsSet::bools_with_sum_and_null_count(
82                buffer.count_set_bits(),
83                0,
84                buffer.len(),
85            ));
86        }
87
88        let mut stats = BoolStatsAccumulator::new(buffer.value(0));
89        buffer.iter().skip(1).for_each(|next| stats.next(next));
90        Ok(stats.finish())
91    }
92}
93
94struct BoolStatsAccumulator {
95    prev: bool,
96    is_sorted: bool,
97    run_count: usize,
98    null_count: usize,
99    true_count: usize,
100    len: usize,
101}
102
103impl BoolStatsAccumulator {
104    pub fn new(first_value: bool) -> Self {
105        Self {
106            prev: first_value,
107            is_sorted: true,
108            run_count: 1,
109            null_count: 0,
110            true_count: if first_value { 1 } else { 0 },
111            len: 1,
112        }
113    }
114
115    pub fn n_nulls(&mut self, n_nulls: usize) {
116        self.null_count += n_nulls;
117        self.len += n_nulls;
118    }
119
120    pub fn nullable_next(&mut self, next: Option<bool>) {
121        match next {
122            Some(n) => self.next(n),
123            None => {
124                self.null_count += 1;
125                self.len += 1;
126            }
127        }
128    }
129
130    pub fn next(&mut self, next: bool) {
131        self.len += 1;
132
133        if next {
134            self.true_count += 1
135        }
136        if !next & self.prev {
137            self.is_sorted = false;
138        }
139        if next != self.prev {
140            self.run_count += 1;
141            self.prev = next;
142        }
143    }
144
145    pub fn finish(self) -> StatsSet {
146        StatsSet::new_unchecked(vec![
147            (Stat::NullCount, Precision::exact(self.null_count)),
148            (Stat::IsSorted, Precision::exact(self.is_sorted)),
149            (
150                Stat::IsStrictSorted,
151                Precision::exact(
152                    self.is_sorted && (self.len < 2 || (self.len == 2 && self.true_count == 1)),
153                ),
154            ),
155        ])
156    }
157}
158
159#[cfg(test)]
160mod test {
161    use arrow_buffer::BooleanBuffer;
162
163    use crate::ArrayVariants;
164    use crate::array::Array;
165    use crate::arrays::BoolArray;
166    use crate::stats::Stat;
167    use crate::validity::Validity;
168
169    #[test]
170    fn bool_stats() {
171        let bool_arr = BoolArray::from_iter([false, false, true, true, false, true, true, false]);
172        assert!(!bool_arr.statistics().compute_is_strict_sorted().unwrap());
173        assert!(!bool_arr.statistics().compute_is_sorted().unwrap());
174        assert!(!bool_arr.statistics().compute_is_constant().unwrap());
175        assert!(!bool_arr.statistics().compute_min::<bool>().unwrap());
176        assert!(bool_arr.statistics().compute_max::<bool>().unwrap());
177        assert_eq!(bool_arr.statistics().compute_null_count().unwrap(), 0);
178        assert_eq!(bool_arr.as_bool_typed().unwrap().true_count().unwrap(), 4);
179    }
180
181    #[test]
182    fn strict_sorted() {
183        let bool_arr_1 = BoolArray::from_iter([false, true]);
184        assert!(bool_arr_1.statistics().compute_is_strict_sorted().unwrap());
185        assert!(bool_arr_1.statistics().compute_is_sorted().unwrap());
186
187        let bool_arr_2 = BoolArray::from_iter([true]);
188        assert!(bool_arr_2.statistics().compute_is_strict_sorted().unwrap());
189        assert!(bool_arr_2.statistics().compute_is_sorted().unwrap());
190
191        let bool_arr_3 = BoolArray::from_iter([false]);
192        assert!(bool_arr_3.statistics().compute_is_strict_sorted().unwrap());
193        assert!(bool_arr_3.statistics().compute_is_sorted().unwrap());
194
195        let bool_arr_4 = BoolArray::from_iter([true, false]);
196        assert!(!bool_arr_4.statistics().compute_is_strict_sorted().unwrap());
197        assert!(!bool_arr_4.statistics().compute_is_sorted().unwrap());
198
199        let bool_arr_5 = BoolArray::from_iter([false, true, true]);
200        assert!(!bool_arr_5.statistics().compute_is_strict_sorted().unwrap());
201        assert!(bool_arr_5.statistics().compute_is_sorted().unwrap());
202    }
203
204    #[test]
205    fn nullable_stats() {
206        let bool_arr = BoolArray::from_iter(vec![
207            Some(false),
208            Some(true),
209            None,
210            Some(true),
211            Some(false),
212            None,
213            None,
214        ]);
215        assert!(!bool_arr.statistics().compute_is_strict_sorted().unwrap());
216        assert!(!bool_arr.statistics().compute_is_sorted().unwrap());
217        assert!(!bool_arr.statistics().compute_is_constant().unwrap());
218        assert!(!bool_arr.statistics().compute_min::<bool>().unwrap());
219        assert!(bool_arr.statistics().compute_max::<bool>().unwrap());
220        assert_eq!(bool_arr.as_bool_typed().unwrap().true_count().unwrap(), 2);
221        assert_eq!(bool_arr.statistics().compute_null_count().unwrap(), 3);
222    }
223
224    #[test]
225    fn one_non_null_value() {
226        let bool_arr = BoolArray::from_iter(vec![Some(false), None]);
227        assert!(!bool_arr.statistics().compute_is_strict_sorted().unwrap());
228        assert!(bool_arr.statistics().compute_is_sorted().unwrap());
229        assert!(!bool_arr.statistics().compute_is_constant().unwrap());
230        assert!(!bool_arr.statistics().compute_min::<bool>().unwrap());
231        assert!(!bool_arr.statistics().compute_max::<bool>().unwrap());
232        assert_eq!(bool_arr.as_bool_typed().unwrap().true_count().unwrap(), 0);
233        assert_eq!(bool_arr.statistics().compute_null_count().unwrap(), 1);
234    }
235
236    #[test]
237    fn empty_array() {
238        let bool_arr = BoolArray::new(BooleanBuffer::new_set(0), Validity::NonNullable);
239        assert!(bool_arr.statistics().compute_is_strict_sorted().is_none());
240        assert!(bool_arr.statistics().compute_is_sorted().is_none());
241        assert!(bool_arr.statistics().compute_is_constant().is_none());
242        assert!(bool_arr.statistics().compute_min::<bool>().is_none());
243        assert!(bool_arr.statistics().compute_max::<bool>().is_none());
244        assert_eq!(bool_arr.as_bool_typed().unwrap().true_count().unwrap(), 0);
245        assert_eq!(bool_arr.statistics().compute_null_count().unwrap(), 0);
246    }
247
248    #[test]
249    fn all_false() {
250        let bool_arr = BoolArray::from_iter(vec![false, false, false]);
251        assert!(!bool_arr.statistics().compute_is_strict_sorted().unwrap());
252        assert!(bool_arr.statistics().compute_is_sorted().unwrap());
253        assert!(bool_arr.statistics().compute_is_constant().unwrap());
254        assert!(!bool_arr.statistics().compute_min::<bool>().unwrap());
255        assert!(!bool_arr.statistics().compute_max::<bool>().unwrap());
256        assert_eq!(bool_arr.as_bool_typed().unwrap().true_count().unwrap(), 0);
257        assert_eq!(bool_arr.statistics().compute_null_count().unwrap(), 0);
258    }
259
260    #[test]
261    fn all_nullable_stats() {
262        let bool_arr = BoolArray::from_iter(vec![None, None, None, None, None]);
263        assert!(!bool_arr.statistics().compute_is_strict_sorted().unwrap());
264        assert!(bool_arr.statistics().compute_is_sorted().unwrap());
265        assert!(bool_arr.statistics().compute_is_constant().unwrap());
266        assert!(
267            bool_arr
268                .statistics()
269                .compute_stat(Stat::Min)
270                .unwrap()
271                .is_none()
272        );
273        assert!(
274            bool_arr
275                .statistics()
276                .compute_stat(Stat::Max)
277                .unwrap()
278                .is_none()
279        );
280        assert_eq!(bool_arr.as_bool_typed().unwrap().true_count().unwrap(), 0);
281        assert_eq!(bool_arr.statistics().compute_null_count().unwrap(), 5);
282    }
283}