vortex_roaring/boolean/
stats.rs

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
use croaring::Bitset;
use vortex_array::aliases::hash_map::HashMap;
use vortex_array::stats::{ArrayStatisticsCompute, Stat, StatsSet};
use vortex_error::{vortex_err, VortexResult};

use crate::RoaringBoolArray;

impl ArrayStatisticsCompute for RoaringBoolArray {
    fn compute_statistics(&self, stat: Stat) -> VortexResult<StatsSet> {
        if self.is_empty() {
            return Ok(StatsSet::new());
        }

        // Only needs to compute IsSorted, IsStrictSorted and RunCount all other stats have been populated on construction
        let bitmap = self.bitmap();
        BitmapStats(
            bitmap
                .to_bitset()
                .ok_or_else(|| vortex_err!("Bitmap to Bitset conversion run out of memory"))?,
            self.len(),
            bitmap.statistics().cardinality,
        )
        .compute_statistics(stat)
    }
}

// Underlying bitset, length in bits, cardinality (true count) of the bitset
struct BitmapStats(Bitset, usize, u64);

impl ArrayStatisticsCompute for BitmapStats {
    fn compute_statistics(&self, _stat: Stat) -> VortexResult<StatsSet> {
        let bitset_slice = self.0.as_slice();
        let whole_chunks = self.1 / 64;
        let last_chunk_len = self.1 % 64;
        let fist_bool = bitset_slice[0] & 1 == 1;
        let mut stats = RoaringBoolStatsAccumulator::new(fist_bool);
        for bits64 in bitset_slice[0..whole_chunks].iter() {
            stats.next(*bits64);
        }
        stats.next_up_to_length(bitset_slice[whole_chunks], last_chunk_len);
        Ok(stats.finish(self.2))
    }
}

struct RoaringBoolStatsAccumulator {
    prev: bool,
    is_sorted: bool,
    run_count: usize,
    len: usize,
}

impl RoaringBoolStatsAccumulator {
    fn new(first_value: bool) -> Self {
        Self {
            prev: first_value,
            is_sorted: true,
            run_count: 1,
            len: 0,
        }
    }

    pub fn next_up_to_length(&mut self, next: u64, len: usize) {
        assert!(len <= 64);
        self.len += len;
        for i in 0..len {
            let current = ((next >> i) & 1) == 1;
            // Booleans are sorted true > false so we aren't sorted if we switched from true to false value
            if !current && self.prev {
                self.is_sorted = false;
            }
            if current != self.prev {
                self.run_count += 1;
                self.prev = current;
            }
        }
    }

    pub fn next(&mut self, next: u64) {
        self.next_up_to_length(next, 64)
    }

    pub fn finish(self, cardinality: u64) -> StatsSet {
        StatsSet::from(HashMap::from([
            (Stat::IsSorted, self.is_sorted.into()),
            (
                Stat::IsStrictSorted,
                (self.is_sorted && (self.len < 2 || (self.len == 2 && cardinality == 1))).into(),
            ),
            (Stat::RunCount, self.run_count.into()),
        ]))
    }
}

#[cfg(test)]
mod test {
    use vortex_array::array::BoolArray;
    use vortex_array::stats::ArrayStatistics;
    use vortex_array::IntoArray;

    use crate::RoaringBoolArray;

    #[test]
    #[cfg_attr(miri, ignore)]
    fn bool_stats() {
        let bool_arr = RoaringBoolArray::encode(
            BoolArray::from(vec![false, false, true, true, false, true, true, false]).into_array(),
        )
        .unwrap();
        assert!(!bool_arr.statistics().compute_is_strict_sorted().unwrap());
        assert!(!bool_arr.statistics().compute_is_sorted().unwrap());
        assert!(!bool_arr.statistics().compute_is_constant().unwrap());
        assert!(!bool_arr.statistics().compute_min::<bool>().unwrap());
        assert!(bool_arr.statistics().compute_max::<bool>().unwrap());
        assert_eq!(bool_arr.statistics().compute_run_count().unwrap(), 5);
        assert_eq!(bool_arr.statistics().compute_true_count().unwrap(), 4);
    }

    #[test]
    #[cfg_attr(miri, ignore)]
    fn strict_sorted() {
        let bool_arr_1 =
            RoaringBoolArray::encode(BoolArray::from(vec![false, true]).into_array()).unwrap();
        assert!(bool_arr_1.statistics().compute_is_strict_sorted().unwrap());
        assert!(bool_arr_1.statistics().compute_is_sorted().unwrap());

        let bool_arr_2 =
            RoaringBoolArray::encode(BoolArray::from(vec![true]).into_array()).unwrap();
        assert!(bool_arr_2.statistics().compute_is_strict_sorted().unwrap());
        assert!(bool_arr_2.statistics().compute_is_sorted().unwrap());

        let bool_arr_3 =
            RoaringBoolArray::encode(BoolArray::from(vec![false]).into_array()).unwrap();
        assert!(bool_arr_3.statistics().compute_is_strict_sorted().unwrap());
        assert!(bool_arr_3.statistics().compute_is_sorted().unwrap());

        let bool_arr_4 =
            RoaringBoolArray::encode(BoolArray::from(vec![true, false]).into_array()).unwrap();
        assert!(!bool_arr_4.statistics().compute_is_strict_sorted().unwrap());
        assert!(!bool_arr_4.statistics().compute_is_sorted().unwrap());

        let bool_arr_5 =
            RoaringBoolArray::encode(BoolArray::from(vec![false, true, true]).into_array())
                .unwrap();
        assert!(!bool_arr_5.statistics().compute_is_strict_sorted().unwrap());
        assert!(bool_arr_5.statistics().compute_is_sorted().unwrap());
    }
}