vortex_roaring/boolean/
mod.rs

1use std::fmt::{Debug, Display};
2use std::sync::Arc;
3
4use arrow_buffer::{BooleanBuffer, MutableBuffer};
5pub use compress::*;
6use croaring::Native;
7pub use croaring::{Bitmap, Portable};
8use serde::{Deserialize, Serialize};
9use vortex_array::array::BoolArray;
10use vortex_array::encoding::ids;
11use vortex_array::stats::StatsSet;
12use vortex_array::validity::{LogicalValidity, ValidityVTable};
13use vortex_array::variants::{BoolArrayTrait, VariantsVTable};
14use vortex_array::visitor::{ArrayVisitor, VisitorVTable};
15use vortex_array::{
16    impl_encoding, ArrayData, ArrayLen, ArrayTrait, Canonical, IntoArrayData, IntoCanonical,
17};
18use vortex_buffer::Buffer;
19use vortex_dtype::{DType, Nullability};
20use vortex_error::{vortex_bail, vortex_err, VortexExpect as _, VortexResult};
21
22mod compress;
23mod compute;
24mod stats;
25
26impl_encoding!("vortex.roaring_bool", ids::ROARING_BOOL, RoaringBool);
27
28#[derive(Debug, Clone, Serialize, Deserialize)]
29pub struct RoaringBoolMetadata;
30
31impl Display for RoaringBoolMetadata {
32    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
33        Debug::fmt(self, f)
34    }
35}
36
37impl RoaringBoolArray {
38    pub fn try_new(bitmap: Bitmap, length: usize) -> VortexResult<Self> {
39        let max_set = bitmap.maximum().unwrap_or(0) as usize;
40        if length < max_set {
41            vortex_bail!(
42                "RoaringBoolArray length is less than bitmap maximum {}",
43                max_set
44            )
45        }
46
47        let stats = StatsSet::bools_with_true_and_null_count(
48            bitmap.statistics().cardinality as usize,
49            0,
50            length,
51        );
52
53        ArrayData::try_new_owned(
54            &RoaringBoolEncoding,
55            DType::Bool(Nullability::NonNullable),
56            length,
57            Arc::new(RoaringBoolMetadata),
58            Some(Buffer::from(bitmap.serialize::<Native>())),
59            vec![].into(),
60            stats,
61        )?
62        .try_into()
63    }
64
65    pub fn bitmap(&self) -> Bitmap {
66        //TODO(@jdcasale): figure out a way to avoid this deserialization per-call
67        Bitmap::deserialize::<Native>(self.buffer().as_ref())
68    }
69
70    pub fn encode(array: ArrayData) -> VortexResult<ArrayData> {
71        if let Ok(bools) = BoolArray::try_from(array) {
72            roaring_bool_encode(bools).map(|a| a.into_array())
73        } else {
74            vortex_bail!("RoaringBool can only encode boolean arrays")
75        }
76    }
77
78    pub fn buffer(&self) -> &Buffer {
79        self.as_ref()
80            .buffer()
81            .vortex_expect("Missing buffer in PrimitiveArray")
82    }
83}
84
85impl ArrayTrait for RoaringBoolArray {}
86
87impl VariantsVTable<RoaringBoolArray> for RoaringBoolEncoding {
88    fn as_bool_array<'a>(&self, array: &'a RoaringBoolArray) -> Option<&'a dyn BoolArrayTrait> {
89        Some(array)
90    }
91}
92
93impl BoolArrayTrait for RoaringBoolArray {}
94
95impl VisitorVTable<RoaringBoolArray> for RoaringBoolEncoding {
96    fn accept(&self, array: &RoaringBoolArray, visitor: &mut dyn ArrayVisitor) -> VortexResult<()> {
97        // TODO(ngates): should we store a buffer in memory? Or delay serialization?
98        //  Or serialize into metadata? The only reason we support buffers is so we can write to
99        //  the wire without copying into FlatBuffers. But if we need to allocate to serialize
100        //  the bitmap anyway, then may as well shove it into metadata.
101        visitor.visit_buffer(array.buffer())
102    }
103}
104
105impl ValidityVTable<RoaringBoolArray> for RoaringBoolEncoding {
106    fn is_valid(&self, _array: &RoaringBoolArray, _index: usize) -> bool {
107        true
108    }
109
110    fn logical_validity(&self, array: &RoaringBoolArray) -> LogicalValidity {
111        LogicalValidity::AllValid(array.len())
112    }
113}
114
115impl IntoCanonical for RoaringBoolArray {
116    fn into_canonical(self) -> VortexResult<Canonical> {
117        // TODO(ngates): benchmark the fastest conversion from BitMap.
118        //  Via bitset requires two copies.
119        let bitset = self
120            .bitmap()
121            .to_bitset()
122            .ok_or_else(|| vortex_err!("Failed to convert RoaringBitmap to Bitset"))?;
123
124        let byte_length = (self.len() + 7) / 8;
125        let mut buffer = MutableBuffer::with_capacity(byte_length);
126        buffer.extend_from_slice(bitset.as_slice());
127        if byte_length > bitset.size_in_bytes() {
128            buffer.extend_zeros(byte_length - bitset.size_in_bytes());
129        }
130        Ok(Canonical::Bool(BoolArray::new(
131            BooleanBuffer::new(buffer.into(), 0, self.len()),
132            Nullability::NonNullable,
133        )))
134    }
135}
136
137#[cfg(test)]
138mod test {
139    use std::iter;
140
141    use vortex_array::array::BoolArray;
142    use vortex_array::{ArrayLen, IntoArrayData, IntoArrayVariant};
143
144    use crate::RoaringBoolArray;
145
146    #[test]
147    #[cfg_attr(miri, ignore)]
148    pub fn iter() {
149        let bool: BoolArray = BoolArray::from_iter([true, false, true, true]);
150        let array = RoaringBoolArray::encode(bool.into_array()).unwrap();
151        let round_trip = RoaringBoolArray::try_from(array).unwrap();
152        let values = round_trip.bitmap().to_vec();
153        assert_eq!(values, vec![0, 2, 3]);
154    }
155
156    #[test]
157    #[cfg_attr(miri, ignore)]
158    pub fn trailing_false() {
159        let bool: BoolArray = BoolArray::from_iter(
160            [true, true]
161                .into_iter()
162                .chain(iter::repeat(false).take(100)),
163        );
164        let array = RoaringBoolArray::encode(bool.into_array()).unwrap();
165        let round_trip = RoaringBoolArray::try_from(array).unwrap();
166        let bool_arr = round_trip.into_bool().unwrap();
167        assert_eq!(bool_arr.len(), 102);
168    }
169}