vortex_array/arrays/bool/
array.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4use std::ops::BitAnd;
5
6use arrow_array::BooleanArray;
7use arrow_buffer::{BooleanBuffer, BooleanBufferBuilder, MutableBuffer};
8use vortex_buffer::ByteBuffer;
9use vortex_dtype::DType;
10use vortex_error::{VortexExpect, VortexResult, vortex_ensure};
11use vortex_mask::Mask;
12
13use crate::Canonical;
14use crate::arrays::{BoolVTable, bool};
15use crate::builders::ArrayBuilder;
16use crate::stats::{ArrayStats, StatsSetRef};
17use crate::validity::Validity;
18use crate::vtable::{ArrayVTable, CanonicalVTable, ValidityHelper};
19
/// A boolean array that stores true/false values in a compact bit-packed format.
///
/// This mirrors the Apache Arrow Boolean array encoding, where each boolean value
/// is stored as a single bit rather than a full byte.
///
/// The data layout uses:
/// - A bit-packed buffer where each bit represents one boolean value (0 = false, 1 = true).
/// - An optional validity child array, which must be of type `Bool(NonNullable)`, where true
///   values indicate valid and false indicates null. If the i-th value is null in the validity
///   child, the i-th packed bit in the buffer may be 0 or 1, i.e. it is undefined.
/// - Bit-level slicing is supported with minimal overhead.
///
/// # Examples
///
/// ```
/// use vortex_array::arrays::BoolArray;
/// use vortex_array::IntoArray;
///
/// // Create from iterator using FromIterator impl
/// let array: BoolArray = [true, false, true, false].into_iter().collect();
///
/// // Slice the array
/// let sliced = array.slice(1..3);
/// assert_eq!(sliced.len(), 2);
///
/// // Access individual values
/// let value = array.scalar_at(0);
/// assert_eq!(value, true.into());
/// ```
#[derive(Clone, Debug)]
pub struct BoolArray {
    /// Logical type of the array; `BoolArray::new` sets it to `DType::Bool` with the
    /// nullability reported by the validity.
    dtype: DType,
    /// Bit-packed values. `BoolArray::new` passes the buffer through `shrink_offset`, so its
    /// bit offset is expected to be less than 8.
    buffer: BooleanBuffer,
    /// Validity (null information) for the values in `buffer`.
    pub(crate) validity: Validity,
    /// Statistics holder for this array; starts out as `ArrayStats::default()`.
    pub(crate) stats_set: ArrayStats,
}
56
57impl BoolArray {
58    fn validate(
59        buffer: &ByteBuffer,
60        offset: usize,
61        len: usize,
62        validity: &Validity,
63    ) -> VortexResult<()> {
64        vortex_ensure!(
65            offset < 8,
66            "offset must be less than whole byte, was {offset} bits"
67        );
68
69        // Validate the buffer is large enough to hold all the bits
70        let required_bytes = offset.saturating_add(len).div_ceil(8);
71        vortex_ensure!(
72            buffer.len() >= required_bytes,
73            "BoolArray with offset={offset} len={len} cannot be built from buffer of size {}",
74            buffer.len()
75        );
76
77        // Validate validity
78        if let Some(validity_len) = validity.maybe_len() {
79            vortex_ensure!(
80                validity_len == len,
81                "BoolArray of size {len} cannot be built with validity of size {validity_len}"
82            );
83        }
84
85        Ok(())
86    }
87}
88
89impl BoolArray {
90    /// Construct a new `BoolArray` from its components:
91    ///
92    /// * `buffer` is a raw ByteBuffer holding the packed bits
93    /// * `offset` is the number of bits in the start of the buffer that should be skipped when
94    ///   looking up the i-th value.
95    /// * `len` is the length of the array, which should correspond to the number of bits
96    /// * `validity` holds the null values.
97    ///
98    /// # Validation
99    ///
100    /// Buffer must be at least large enough to hold `len` bits starting at `offset`.
101    ///
102    /// A provided validity array must be of size `len`.
103    ///
104    /// The offset must be less than a whole byte.
105    pub fn try_new(
106        buffer: ByteBuffer,
107        offset: usize,
108        len: usize,
109        validity: Validity,
110    ) -> VortexResult<Self> {
111        Self::validate(&buffer, offset, len, &validity)?;
112
113        Ok(Self::new(
114            BooleanBuffer::new(buffer.into_arrow_buffer(), offset, len),
115            validity,
116        ))
117    }
118
119    /// Creates a new [`BoolArray`] from a [`BooleanBuffer`] and [`Validity`] directly.
120    ///
121    /// Panics if the validity length differs from the buffer length.
122    pub fn new(buffer: BooleanBuffer, validity: Validity) -> Self {
123        if let Some(validity_len) = validity.maybe_len() {
124            assert_eq!(buffer.len(), validity_len);
125        }
126
127        // Shrink the buffer to remove any whole bytes.
128        let buffer = buffer.shrink_offset();
129        Self {
130            dtype: DType::Bool(validity.nullability()),
131            buffer,
132            validity,
133            stats_set: ArrayStats::default(),
134        }
135    }
136
137    /// Create a new BoolArray from a set of indices and a length.
138    ///
139    /// All indices must be less than the length.
140    pub fn from_indices<I: IntoIterator<Item = usize>>(
141        length: usize,
142        indices: I,
143        validity: Validity,
144    ) -> Self {
145        let mut buffer = MutableBuffer::new_null(length);
146        let buffer_slice = buffer.as_slice_mut();
147        indices
148            .into_iter()
149            .for_each(|idx| arrow_buffer::bit_util::set_bit(buffer_slice, idx));
150        Self::new(
151            BooleanBufferBuilder::new_from_buffer(buffer, length).finish(),
152            validity,
153        )
154    }
155
156    /// Returns the underlying [`BooleanBuffer`] of the array.
157    pub fn boolean_buffer(&self) -> &BooleanBuffer {
158        assert!(
159            self.buffer.offset() < 8,
160            "Offset must be <8, did we forget to call shrink_offset? Found {}",
161            self.buffer.offset()
162        );
163        &self.buffer
164    }
165
166    /// Get a mutable version of this array.
167    ///
168    /// If the caller holds the only reference to the underlying buffer the underlying buffer is returned
169    /// otherwise a copy is created.
170    ///
171    /// The second value of the tuple is a bit_offset of first value in first byte of the returned builder
172    pub fn into_boolean_builder(self) -> (BooleanBufferBuilder, usize) {
173        let offset = self.buffer.offset();
174        let len = self.buffer.len();
175        let arrow_buffer = self.buffer.into_inner();
176        let mutable_buf = if arrow_buffer.ptr_offset() == 0 {
177            arrow_buffer.into_mutable().unwrap_or_else(|b| {
178                let mut buf = MutableBuffer::with_capacity(b.len());
179                buf.extend_from_slice(b.as_slice());
180                buf
181            })
182        } else {
183            let mut buf = MutableBuffer::with_capacity(arrow_buffer.len());
184            buf.extend_from_slice(arrow_buffer.as_slice());
185            buf
186        };
187
188        (
189            BooleanBufferBuilder::new_from_buffer(mutable_buf, offset + len),
190            offset,
191        )
192    }
193
194    pub fn to_mask(&self) -> Mask {
195        self.maybe_to_mask()
196            .vortex_expect("cannot convert nullable boolean array to mask")
197    }
198
199    pub fn maybe_to_mask(&self) -> Option<Mask> {
200        self.all_valid()
201            .then(|| Mask::from_buffer(self.boolean_buffer().clone()))
202    }
203
204    pub fn to_mask_fill_null_false(&self) -> Mask {
205        if let Some(constant) = self.as_constant() {
206            let bool_constant = constant.as_bool();
207            if bool_constant.value().unwrap_or(false) {
208                return Mask::new_true(self.len());
209            } else {
210                return Mask::new_false(self.len());
211            }
212        }
213        // Extract a boolean buffer, treating null values to false
214        let buffer = match self.validity_mask() {
215            Mask::AllTrue(_) => self.boolean_buffer().clone(),
216            Mask::AllFalse(_) => return Mask::new_false(self.len()),
217            Mask::Values(validity) => validity.boolean_buffer().bitand(self.boolean_buffer()),
218        };
219        Mask::from_buffer(buffer)
220    }
221}
222
223impl From<BooleanBuffer> for BoolArray {
224    fn from(value: BooleanBuffer) -> Self {
225        Self::new(value, Validity::NonNullable)
226    }
227}
228
229impl FromIterator<bool> for BoolArray {
230    fn from_iter<T: IntoIterator<Item = bool>>(iter: T) -> Self {
231        Self::new(BooleanBuffer::from_iter(iter), Validity::NonNullable)
232    }
233}
234
235impl FromIterator<Option<bool>> for BoolArray {
236    fn from_iter<I: IntoIterator<Item = Option<bool>>>(iter: I) -> Self {
237        let (buffer, nulls) = BooleanArray::from_iter(iter).into_parts();
238
239        Self::new(
240            buffer,
241            nulls.map(Validity::from).unwrap_or(Validity::AllValid),
242        )
243    }
244}
245
246impl ValidityHelper for BoolArray {
247    fn validity(&self) -> &Validity {
248        &self.validity
249    }
250}
251
252impl ArrayVTable<BoolVTable> for BoolVTable {
253    fn len(array: &BoolArray) -> usize {
254        array.buffer.len()
255    }
256
257    fn dtype(array: &BoolArray) -> &DType {
258        &array.dtype
259    }
260
261    fn stats(array: &BoolArray) -> StatsSetRef<'_> {
262        array.stats_set.to_ref(array.as_ref())
263    }
264}
265
266impl CanonicalVTable<BoolVTable> for BoolVTable {
267    fn canonicalize(array: &BoolArray) -> Canonical {
268        Canonical::Bool(array.clone())
269    }
270
271    fn append_to_builder(array: &BoolArray, builder: &mut dyn ArrayBuilder) {
272        builder.extend_from_array(array.as_ref())
273    }
274}
275
/// Extension methods for bit-packed boolean buffers.
pub trait BooleanBufferExt {
    /// Slice any full bytes from the buffer, leaving the offset < 8.
    ///
    /// Consumes the buffer and returns the trimmed buffer.
    fn shrink_offset(self) -> Self;
}
280
281impl BooleanBufferExt for BooleanBuffer {
282    fn shrink_offset(self) -> Self {
283        let byte_offset = self.offset() / 8;
284        let bit_offset = self.offset() % 8;
285        let len = self.len();
286        let buffer = self
287            .into_inner()
288            .slice_with_length(byte_offset, (len + bit_offset).div_ceil(8));
289        BooleanBuffer::new(buffer, bit_offset, len)
290    }
291}
292
/// Unit tests for `BoolArray` construction, slicing, patching and conversion to a builder.
#[cfg(test)]
mod tests {
    use arrow_buffer::{BooleanBuffer, BooleanBufferBuilder};
    use vortex_buffer::buffer;

    use crate::arrays::{BoolArray, PrimitiveArray};
    use crate::patches::Patches;
    use crate::validity::Validity;
    use crate::vtable::ValidityHelper;
    use crate::{Array, IntoArray, ToCanonical};

    /// A non-nullable array built from plain bools yields its first value as a scalar.
    #[test]
    fn bool_array() {
        let arr = BoolArray::from_iter([true, false, true]);
        let scalar = bool::try_from(&arr.scalar_at(0)).unwrap();
        assert!(scalar);
    }

    /// Collecting only `Some` values produces `Validity::AllValid` and preserves the values.
    #[test]
    fn test_all_some_iter() {
        let arr = BoolArray::from_iter([Some(true), Some(false)]);

        assert!(matches!(arr.validity(), Validity::AllValid));

        let scalar = bool::try_from(&arr.scalar_at(0)).unwrap();
        assert!(scalar);
        let scalar = bool::try_from(&arr.scalar_at(1)).unwrap();
        assert!(!scalar);
    }

    /// Collecting a mix of `Some` and `None` keeps values and nulls at their positions.
    #[test]
    fn test_bool_from_iter() {
        let arr = BoolArray::from_iter([Some(true), Some(true), None, Some(false), None]);

        let scalar = bool::try_from(&arr.scalar_at(0)).unwrap();
        assert!(scalar);

        let scalar = bool::try_from(&arr.scalar_at(1)).unwrap();
        assert!(scalar);

        let scalar = arr.scalar_at(2);
        assert!(scalar.is_null());

        let scalar = bool::try_from(&arr.scalar_at(3)).unwrap();
        assert!(!scalar);

        let scalar = arr.scalar_at(4);
        assert!(scalar.is_null());
    }

    /// Patching the parent array must not alter the bytes seen through an earlier slice.
    #[test]
    fn patch_sliced_bools() {
        // 12 bits: one false followed by eleven trues, i.e. bytes [0b1111_1110, 0b0000_1111].
        let arr = {
            let mut builder = BooleanBufferBuilder::new(12);
            builder.append(false);
            builder.append_n(11, true);
            BoolArray::from(builder.finish())
        };
        let sliced = arr.slice(4..12);
        let sliced_len = sliced.len();
        let (values, offset) = sliced.to_bool().into_boolean_builder();
        assert_eq!(offset, 4);
        assert_eq!(values.as_slice(), &[254, 15]);

        // patch the underlying array
        let patches = Patches::new(
            arr.len(),
            0,
            buffer![4u32].into_array(), // This creates a non-nullable array
            BoolArray::from(BooleanBuffer::new_unset(1)).into_array(),
        );
        let arr = arr.patch(&patches);
        let arr_len = arr.len();
        let (values, offset) = arr.to_bool().into_boolean_builder();
        assert_eq!(offset, 0);
        assert_eq!(values.len(), arr_len + offset);
        // Bit 4 of the first byte is now cleared: 254 - 16 = 238.
        assert_eq!(values.as_slice(), &[238, 15]);

        // the slice should be unchanged
        let (values, offset) = sliced.to_bool().into_boolean_builder();
        assert_eq!(offset, 4);
        assert_eq!(values.len(), sliced_len + offset);
        assert_eq!(values.as_slice(), &[254, 15]); // unchanged
    }

    /// Slicing 4..12 out of 16 set bits reports a bit offset of 4 and a builder whose length
    /// covers both the offset and the sliced values.
    #[test]
    fn slice_array_in_middle() {
        let arr = BoolArray::from(BooleanBuffer::new_set(16));
        let sliced = arr.slice(4..12);
        let sliced_len = sliced.len();
        let (values, offset) = sliced.to_bool().into_boolean_builder();
        assert_eq!(offset, 4);
        assert_eq!(values.len(), sliced_len + offset);
        assert_eq!(values.as_slice(), &[255, 15]);
    }

    /// Patches a 15-bit array at index 0 and compares buffer pointers before and after.
    ///
    /// NOTE(review): this test is marked `#[should_panic]`, but it is not evident from this file
    /// which statement is expected to panic -- confirm the intent.
    #[test]
    #[should_panic]
    fn patch_bools_owned() {
        let buffer = buffer![255u8; 2];
        let buf = BooleanBuffer::new(buffer.into_arrow_buffer(), 0, 15);
        let arr = BoolArray::new(buf, Validity::NonNullable);
        let buf_ptr = arr.boolean_buffer().sliced().as_ptr();

        let patches = Patches::new(
            arr.len(),
            0,
            PrimitiveArray::new(buffer![0u32], Validity::AllValid).into_array(),
            BoolArray::from(BooleanBuffer::new_unset(1)).into_array(),
        );
        let arr = arr.patch(&patches);
        assert_eq!(arr.boolean_buffer().sliced().as_ptr(), buf_ptr);

        let (values, _byte_bit_offset) = arr.to_bool().into_boolean_builder();
        assert_eq!(values.as_slice(), &[254, 127]);
    }
}