vortex_runend/
statistics.rs

1use std::cmp;
2
3use arrow_buffer::BooleanBuffer;
4use itertools::Itertools;
5use vortex_array::stats::{Precision, Stat, StatsSet};
6use vortex_array::variants::PrimitiveArrayTrait;
7use vortex_array::vtable::StatisticsVTable;
8use vortex_array::{Array, ToCanonical as _};
9use vortex_dtype::{NativePType, match_each_unsigned_integer_ptype};
10use vortex_error::VortexResult;
11use vortex_mask::Mask;
12use vortex_scalar::ScalarValue;
13
14use crate::{RunEndArray, RunEndEncoding};
15
16impl StatisticsVTable<&RunEndArray> for RunEndEncoding {
17    fn compute_statistics(&self, array: &RunEndArray, stat: Stat) -> VortexResult<StatsSet> {
18        let maybe_stat = match stat {
19            Stat::Min | Stat::Max => array.values().statistics().compute_stat(stat)?,
20            Stat::IsSorted => Some(ScalarValue::from(
21                array
22                    .values()
23                    .statistics()
24                    .compute_is_sorted()
25                    .unwrap_or(false)
26                    && array.validity_mask()?.all_true(),
27            )),
28            Stat::NullCount => Some(ScalarValue::from(array.null_count()?)),
29            _ => None,
30        };
31
32        let mut stats = StatsSet::default();
33        if let Some(stat_value) = maybe_stat {
34            stats.set(stat, Precision::exact(stat_value));
35        }
36        Ok(stats)
37    }
38}
39
40impl RunEndArray {
41    // TODO(ngates): this should be re-used for impl SumFn.
42    #[allow(dead_code)]
43    fn true_count(&self) -> VortexResult<u64> {
44        let ends = self.ends().to_primitive()?;
45        let values = self.values().to_bool()?.boolean_buffer().clone();
46
47        match_each_unsigned_integer_ptype!(ends.ptype(), |$P| self.typed_true_count(ends.as_slice::<$P>(), values))
48    }
49
50    // TODO(ngates): this should be re-used for impl SumFn.
51    #[allow(dead_code)]
52    fn typed_true_count<P: NativePType + Into<u64>>(
53        &self,
54        decompressed_ends: &[P],
55        decompressed_values: BooleanBuffer,
56    ) -> VortexResult<u64> {
57        Ok(match self.values().validity_mask()? {
58            Mask::AllTrue(_) => {
59                let mut begin = self.offset() as u64;
60                decompressed_ends
61                    .iter()
62                    .copied()
63                    .zip_eq(&decompressed_values)
64                    .map(|(end, bool_value)| {
65                        let end: u64 = end.into();
66                        let len = end - begin;
67                        begin = end;
68                        len * u64::from(bool_value)
69                    })
70                    .sum()
71            }
72            Mask::AllFalse(_) => 0,
73            Mask::Values(values) => {
74                let mut is_valid = values.indices().iter();
75                match is_valid.next() {
76                    None => self.len() as u64,
77                    Some(&valid_index) => {
78                        let mut true_count: u64 = 0;
79                        let offsetted_begin = self.offset() as u64;
80                        let offsetted_len = (self.len() + self.offset()) as u64;
81                        let begin = if valid_index == 0 {
82                            offsetted_begin
83                        } else {
84                            decompressed_ends[valid_index - 1].into()
85                        };
86
87                        let end = cmp::min(decompressed_ends[valid_index].into(), offsetted_len);
88                        true_count += decompressed_values.value(valid_index) as u64 * (end - begin);
89
90                        for &valid_index in is_valid {
91                            let valid_end: u64 = decompressed_ends[valid_index].into();
92                            let end = cmp::min(valid_end, offsetted_len);
93                            true_count +=
94                                decompressed_values.value(valid_index) as u64 * (end - valid_end);
95                        }
96
97                        true_count
98                    }
99                }
100            }
101        })
102    }
103
104    fn null_count(&self) -> VortexResult<u64> {
105        let ends = self.ends().to_primitive()?;
106        let null_count = match self.values().validity_mask()? {
107            Mask::AllTrue(_) => 0u64,
108            Mask::AllFalse(_) => self.len() as u64,
109            Mask::Values(mask) => {
110                match_each_unsigned_integer_ptype!(ends.ptype(), |$P| self.null_count_with_array_validity(ends.as_slice::<$P>(), mask.boolean_buffer()))
111            }
112        };
113        Ok(null_count)
114    }
115
116    fn null_count_with_array_validity<P: NativePType + Into<u64>>(
117        &self,
118        decompressed_ends: &[P],
119        is_valid: &BooleanBuffer,
120    ) -> u64 {
121        let mut is_valid = is_valid.set_indices();
122        match is_valid.next() {
123            None => self.len() as u64,
124            Some(valid_index) => {
125                let offsetted_len = (self.len() + self.offset()) as u64;
126                let mut null_count: u64 = self.len() as u64;
127                let begin = if valid_index == 0 {
128                    0
129                } else {
130                    decompressed_ends[valid_index - 1].into()
131                };
132
133                let end = cmp::min(decompressed_ends[valid_index].into(), offsetted_len);
134                null_count -= end - begin;
135
136                for valid_index in is_valid {
137                    let end = cmp::min(decompressed_ends[valid_index].into(), offsetted_len);
138                    null_count -= end - decompressed_ends[valid_index - 1].into();
139                }
140
141                null_count
142            }
143        }
144    }
145}
146
147#[cfg(test)]
148mod tests {
149    use arrow_buffer::BooleanBuffer;
150    use vortex_array::arrays::BoolArray;
151    use vortex_array::compute::slice;
152    use vortex_array::stats::Stat;
153    use vortex_array::validity::Validity;
154    use vortex_array::{Array, IntoArray};
155    use vortex_buffer::buffer;
156
157    use crate::RunEndArray;
158
159    #[test]
160    fn test_runend_int_stats() {
161        let arr = RunEndArray::try_new(
162            buffer![2u32, 5, 10].into_array(),
163            buffer![1i32, 2, 3].into_array(),
164        )
165        .unwrap();
166
167        assert_eq!(arr.statistics().compute_as::<i32>(Stat::Min).unwrap(), 1);
168        assert_eq!(arr.statistics().compute_as::<i32>(Stat::Max).unwrap(), 3);
169        assert_eq!(
170            arr.statistics().compute_as::<u64>(Stat::NullCount).unwrap(),
171            0
172        );
173        assert!(arr.statistics().compute_as::<bool>(Stat::IsSorted).unwrap());
174    }
175
176    #[test]
177    fn test_runend_bool_stats() {
178        let arr = RunEndArray::try_new(
179            buffer![2u32, 5, 10].into_array(),
180            BoolArray::new(
181                BooleanBuffer::from_iter([true, true, false]),
182                Validity::Array(BoolArray::from_iter([true, false, true]).into_array()),
183            )
184            .into_array(),
185        )
186        .unwrap();
187
188        assert!(!arr.statistics().compute_as::<bool>(Stat::Min).unwrap());
189        assert!(arr.statistics().compute_as::<bool>(Stat::Max).unwrap());
190        assert_eq!(
191            arr.statistics().compute_as::<u64>(Stat::NullCount).unwrap(),
192            3
193        );
194        assert!(!arr.statistics().compute_as::<bool>(Stat::IsSorted).unwrap());
195
196        let sliced = slice(&arr, 4, 7).unwrap();
197
198        assert!(!sliced.statistics().compute_as::<bool>(Stat::Min).unwrap());
199        assert!(!sliced.statistics().compute_as::<bool>(Stat::Max).unwrap());
200        assert_eq!(
201            sliced
202                .statistics()
203                .compute_as::<u64>(Stat::NullCount)
204                .unwrap(),
205            1
206        );
207        // Not sorted because null must come last
208        assert!(
209            !sliced
210                .statistics()
211                .compute_as::<bool>(Stat::IsSorted)
212                .unwrap()
213        );
214    }
215
216    #[test]
217    fn test_all_invalid_true_count() {
218        let arr = RunEndArray::try_new(
219            buffer![2u32, 5, 10].into_array(),
220            BoolArray::from_iter([None, None, None]).into_array(),
221        )
222        .unwrap()
223        .into_array();
224        assert_eq!(
225            arr.statistics().compute_as::<u64>(Stat::NullCount).unwrap(),
226            10
227        );
228    }
229}