vortex_array/arrays/bool/
array.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4use std::ops::BitAnd;
5
6use arrow_array::BooleanArray;
7use arrow_buffer::{BooleanBuffer, BooleanBufferBuilder, MutableBuffer};
8use itertools::Itertools;
9use vortex_buffer::ByteBuffer;
10use vortex_dtype::{DType, match_each_integer_ptype};
11use vortex_error::{VortexExpect, VortexResult, vortex_ensure};
12use vortex_mask::Mask;
13
14use crate::ToCanonical;
15use crate::arrays::bool;
16use crate::patches::Patches;
17use crate::stats::ArrayStats;
18use crate::validity::Validity;
19use crate::vtable::ValidityHelper;
20
21pub trait BooleanBufferExt {
22    /// Slice any full bytes from the buffer, leaving the offset < 8.
23    fn shrink_offset(self) -> Self;
24}
25
26impl BooleanBufferExt for BooleanBuffer {
27    fn shrink_offset(self) -> Self {
28        let byte_offset = self.offset() / 8;
29        let bit_offset = self.offset() % 8;
30        let len = self.len();
31        let buffer = self
32            .into_inner()
33            .slice_with_length(byte_offset, (len + bit_offset).div_ceil(8));
34        BooleanBuffer::new(buffer, bit_offset, len)
35    }
36}
37
38/// A boolean array that stores true/false values in a compact bit-packed format.
39///
40/// This mirrors the Apache Arrow Boolean array encoding, where each boolean value
41/// is stored as a single bit rather than a full byte.
42///
43/// The data layout uses:
44/// - A bit-packed buffer where each bit represents one boolean value (0 = false, 1 = true)
45/// - An optional validity child array, which must be of type `Bool(NonNullable)`, where true values
46///   indicate valid and false indicates null. if the i-th value is null in the validity child,
47///   the i-th packed bit in the buffer may be 0 or 1, i.e. it is undefined.
48/// - Bit-level slicing is supported with minimal overhead
49///
50/// # Examples
51///
52/// ```
53/// use vortex_array::arrays::BoolArray;
54/// use vortex_array::IntoArray;
55///
56/// // Create from iterator using FromIterator impl
57/// let array: BoolArray = [true, false, true, false].into_iter().collect();
58///
59/// // Slice the array
60/// let sliced = array.slice(1..3);
61/// assert_eq!(sliced.len(), 2);
62///
63/// // Access individual values
64/// let value = array.scalar_at(0);
65/// assert_eq!(value, true.into());
66/// ```
67#[derive(Clone, Debug)]
68pub struct BoolArray {
69    pub(super) dtype: DType,
70    pub(super) buffer: BooleanBuffer,
71    pub(super) validity: Validity,
72    pub(super) stats_set: ArrayStats,
73}
74
75impl BoolArray {
76    /// Constructs a new `BoolArray`.
77    ///
78    /// See [`BoolArray::new_unchecked`] for more information.
79    ///
80    /// # Errors
81    ///
82    /// Returns an error if the provided components do not satisfy the invariants documented in
83    /// [`BoolArray::new_unchecked`].
84    pub fn try_new(
85        buffer: ByteBuffer,
86        offset: usize,
87        len: usize,
88        validity: Validity,
89    ) -> VortexResult<Self> {
90        Self::validate(&buffer, offset, len, &validity)?;
91
92        // SAFETY: validate ensures all invariants are met.
93        Ok(unsafe { Self::new_unchecked(buffer, offset, len, validity) })
94    }
95
96    /// Creates a new [`BoolArray`] without validation from these components:
97    ///
98    /// * `buffer` is a raw [`ByteBuffer`] holding the packed bits.
99    /// * `offset` is the number of bits in the start of the buffer that should be skipped when
100    ///   looking up the i-th value.
101    /// * `len` is the length of the array, which should correspond to the number of bits.
102    /// * `validity` holds the null values.
103    ///
104    /// # Safety
105    ///
106    /// The caller must ensure all of the following invariants are satisfied:
107    ///
108    /// - `buffer` must contain at least `(offset + len).div_ceil(8)` bytes.
109    /// - `offset` must be less than 8 (it represents the bit offset within the first byte).
110    /// - If `validity` is `Validity::Array`, its length must exactly equal `len`.
111    pub unsafe fn new_unchecked(
112        buffer: ByteBuffer,
113        offset: usize,
114        len: usize,
115        validity: Validity,
116    ) -> Self {
117        #[cfg(debug_assertions)]
118        Self::validate(&buffer, offset, len, &validity)
119            .vortex_expect("[Debug Assertion]: Invalid `BoolArray` parameters");
120
121        let buffer = BooleanBuffer::new(buffer.into_arrow_buffer(), offset, len);
122        let buffer = buffer.shrink_offset();
123        Self {
124            dtype: DType::Bool(validity.nullability()),
125            buffer,
126            validity,
127            stats_set: ArrayStats::default(),
128        }
129    }
130
131    /// Validates the components that would be used to create a [`BoolArray`].
132    ///
133    /// This function checks all the invariants required by [`BoolArray::new_unchecked`].
134    pub fn validate(
135        buffer: &ByteBuffer,
136        offset: usize,
137        len: usize,
138        validity: &Validity,
139    ) -> VortexResult<()> {
140        vortex_ensure!(
141            offset < 8,
142            "offset must be less than whole byte, was {offset} bits"
143        );
144
145        // Validate the buffer is large enough to hold all the bits
146        let required_bytes = offset.saturating_add(len).div_ceil(8);
147        vortex_ensure!(
148            buffer.len() >= required_bytes,
149            "BoolArray with offset={offset} len={len} cannot be built from buffer of size {}",
150            buffer.len()
151        );
152
153        // Validate validity
154        if let Some(validity_len) = validity.maybe_len() {
155            vortex_ensure!(
156                validity_len == len,
157                "BoolArray of size {len} cannot be built with validity of size {validity_len}"
158            );
159        }
160
161        Ok(())
162    }
163
164    /// Creates a new [`BoolArray`] from a [`BooleanBuffer`] and [`Validity`] directly.
165    ///
166    /// # Panics
167    ///
168    /// Panics if the validity is [`Validity::Array`] and the length is not the same as the buffer.
169    pub fn from_bool_buffer(buffer: BooleanBuffer, validity: Validity) -> Self {
170        if let Some(validity_len) = validity.maybe_len() {
171            assert_eq!(buffer.len(), validity_len);
172        }
173
174        // Shrink the buffer to remove any whole bytes.
175        let buffer = buffer.shrink_offset();
176        Self {
177            dtype: DType::Bool(validity.nullability()),
178            buffer,
179            validity,
180            stats_set: ArrayStats::default(),
181        }
182    }
183
184    /// Create a new BoolArray from a set of indices and a length.
185    ///
186    /// All indices must be less than the length.
187    pub fn from_indices<I: IntoIterator<Item = usize>>(
188        length: usize,
189        indices: I,
190        validity: Validity,
191    ) -> Self {
192        let mut buffer = MutableBuffer::new_null(length);
193        let buffer_slice = buffer.as_slice_mut();
194        indices
195            .into_iter()
196            .for_each(|idx| arrow_buffer::bit_util::set_bit(buffer_slice, idx));
197        Self::from_bool_buffer(
198            BooleanBufferBuilder::new_from_buffer(buffer, length).finish(),
199            validity,
200        )
201    }
202
203    /// Returns the underlying [`BooleanBuffer`] of the array.
204    pub fn boolean_buffer(&self) -> &BooleanBuffer {
205        assert!(
206            self.buffer.offset() < 8,
207            "Offset must be <8, did we forget to call shrink_offset? Found {}",
208            self.buffer.offset()
209        );
210        &self.buffer
211    }
212
213    /// Get a mutable version of this array.
214    ///
215    /// If the caller holds the only reference to the underlying buffer the underlying buffer is returned
216    /// otherwise a copy is created.
217    ///
218    /// The second value of the tuple is a bit_offset of first value in first byte of the returned builder
219    pub fn into_boolean_builder(self) -> (BooleanBufferBuilder, usize) {
220        let offset = self.buffer.offset();
221        let len = self.buffer.len();
222        let arrow_buffer = self.buffer.into_inner();
223        let mutable_buf = if arrow_buffer.ptr_offset() == 0 {
224            arrow_buffer.into_mutable().unwrap_or_else(|b| {
225                let mut buf = MutableBuffer::with_capacity(b.len());
226                buf.extend_from_slice(b.as_slice());
227                buf
228            })
229        } else {
230            let mut buf = MutableBuffer::with_capacity(arrow_buffer.len());
231            buf.extend_from_slice(arrow_buffer.as_slice());
232            buf
233        };
234
235        (
236            BooleanBufferBuilder::new_from_buffer(mutable_buf, offset + len),
237            offset,
238        )
239    }
240
241    pub fn to_mask(&self) -> Mask {
242        self.maybe_to_mask()
243            .vortex_expect("cannot convert nullable boolean array to mask")
244    }
245
246    pub fn maybe_to_mask(&self) -> Option<Mask> {
247        self.all_valid()
248            .then(|| Mask::from_buffer(self.boolean_buffer().clone()))
249    }
250
251    pub fn to_mask_fill_null_false(&self) -> Mask {
252        if let Some(constant) = self.as_constant() {
253            let bool_constant = constant.as_bool();
254            if bool_constant.value().unwrap_or(false) {
255                return Mask::new_true(self.len());
256            } else {
257                return Mask::new_false(self.len());
258            }
259        }
260        // Extract a boolean buffer, treating null values to false
261        let buffer = match self.validity_mask() {
262            Mask::AllTrue(_) => self.boolean_buffer().clone(),
263            Mask::AllFalse(_) => return Mask::new_false(self.len()),
264            Mask::Values(validity) => validity.boolean_buffer().bitand(self.boolean_buffer()),
265        };
266        Mask::from_buffer(buffer)
267    }
268
269    pub fn patch(self, patches: &Patches) -> Self {
270        let len = self.len();
271        let offset = patches.offset();
272        let indices = patches.indices().to_primitive();
273        let values = patches.values().to_bool();
274
275        let patched_validity =
276            self.validity()
277                .clone()
278                .patch(len, offset, indices.as_ref(), values.validity());
279
280        let (mut own_values, bit_offset) = self.into_boolean_builder();
281        match_each_integer_ptype!(indices.ptype(), |I| {
282            for (idx, value) in indices
283                .as_slice::<I>()
284                .iter()
285                .zip_eq(values.boolean_buffer().iter())
286            {
287                #[allow(clippy::cast_possible_truncation)]
288                own_values.set_bit(*idx as usize - offset + bit_offset, value);
289            }
290        });
291
292        Self::from_bool_buffer(own_values.finish().slice(bit_offset, len), patched_validity)
293    }
294}
295
296impl From<BooleanBuffer> for BoolArray {
297    fn from(value: BooleanBuffer) -> Self {
298        Self::from_bool_buffer(value, Validity::NonNullable)
299    }
300}
301
302impl FromIterator<bool> for BoolArray {
303    fn from_iter<T: IntoIterator<Item = bool>>(iter: T) -> Self {
304        Self::from_bool_buffer(BooleanBuffer::from_iter(iter), Validity::NonNullable)
305    }
306}
307
308impl FromIterator<Option<bool>> for BoolArray {
309    fn from_iter<I: IntoIterator<Item = Option<bool>>>(iter: I) -> Self {
310        let (buffer, nulls) = BooleanArray::from_iter(iter).into_parts();
311
312        Self::from_bool_buffer(
313            buffer,
314            nulls.map(Validity::from).unwrap_or(Validity::AllValid),
315        )
316    }
317}
318
319#[cfg(test)]
320mod tests {
321    use arrow_buffer::{BooleanBuffer, BooleanBufferBuilder};
322    use vortex_buffer::buffer;
323
324    use crate::arrays::{BoolArray, PrimitiveArray};
325    use crate::patches::Patches;
326    use crate::validity::Validity;
327    use crate::vtable::ValidityHelper;
328    use crate::{Array, IntoArray, ToCanonical};
329
330    #[test]
331    fn bool_array() {
332        let arr = BoolArray::from_iter([true, false, true]);
333        let scalar = bool::try_from(&arr.scalar_at(0)).unwrap();
334        assert!(scalar);
335    }
336
337    #[test]
338    fn test_all_some_iter() {
339        let arr = BoolArray::from_iter([Some(true), Some(false)]);
340
341        assert!(matches!(arr.validity(), Validity::AllValid));
342
343        let scalar = bool::try_from(&arr.scalar_at(0)).unwrap();
344        assert!(scalar);
345        let scalar = bool::try_from(&arr.scalar_at(1)).unwrap();
346        assert!(!scalar);
347    }
348
349    #[test]
350    fn test_bool_from_iter() {
351        let arr = BoolArray::from_iter([Some(true), Some(true), None, Some(false), None]);
352
353        let scalar = bool::try_from(&arr.scalar_at(0)).unwrap();
354        assert!(scalar);
355
356        let scalar = bool::try_from(&arr.scalar_at(1)).unwrap();
357        assert!(scalar);
358
359        let scalar = arr.scalar_at(2);
360        assert!(scalar.is_null());
361
362        let scalar = bool::try_from(&arr.scalar_at(3)).unwrap();
363        assert!(!scalar);
364
365        let scalar = arr.scalar_at(4);
366        assert!(scalar.is_null());
367    }
368
369    #[test]
370    fn patch_sliced_bools() {
371        let arr = BoolArray::from(BooleanBuffer::new_set(12));
372        let sliced = arr.slice(4..12);
373        let (values, offset) = sliced.to_bool().into_boolean_builder();
374        assert_eq!(offset, 4);
375        assert_eq!(values.len(), 12);
376        assert_eq!(values.as_slice(), &[255, 15]);
377
378        let arr = {
379            let mut builder = BooleanBufferBuilder::new(12);
380            builder.append(false);
381            builder.append_n(11, true);
382            BoolArray::from(builder.finish())
383        };
384        let sliced = arr.slice(4..12);
385        let sliced_len = sliced.len();
386        let (values, offset) = sliced.to_bool().into_boolean_builder();
387        assert_eq!(offset, 4);
388        assert_eq!(values.as_slice(), &[254, 15]);
389
390        // patch the underlying array
391        let patches = Patches::new(
392            arr.len(),
393            0,
394            buffer![4u32].into_array(), // This creates a non-nullable array
395            BoolArray::from(BooleanBuffer::new_unset(1)).into_array(),
396            None,
397        );
398        let arr = arr.patch(&patches);
399        let arr_len = arr.len();
400        let (values, offset) = arr.to_bool().into_boolean_builder();
401        assert_eq!(offset, 0);
402        assert_eq!(values.len(), arr_len + offset);
403        assert_eq!(values.as_slice(), &[238, 15]);
404
405        // the slice should be unchanged
406        let (values, offset) = sliced.to_bool().into_boolean_builder();
407        assert_eq!(offset, 4);
408        assert_eq!(values.len(), sliced_len + offset);
409        assert_eq!(values.as_slice(), &[254, 15]); // unchanged
410    }
411
412    #[test]
413    fn slice_array_in_middle() {
414        let arr = BoolArray::from(BooleanBuffer::new_set(16));
415        let sliced = arr.slice(4..12);
416        let sliced_len = sliced.len();
417        let (values, offset) = sliced.to_bool().into_boolean_builder();
418        assert_eq!(offset, 4);
419        assert_eq!(values.len(), sliced_len + offset);
420        assert_eq!(values.as_slice(), &[255, 15]);
421    }
422
423    #[test]
424    #[should_panic]
425    fn patch_bools_owned() {
426        let buffer = buffer![255u8; 2];
427        let buf = BooleanBuffer::new(buffer.into_arrow_buffer(), 0, 15);
428        let arr = BoolArray::from_bool_buffer(buf, Validity::NonNullable);
429        let buf_ptr = arr.boolean_buffer().sliced().as_ptr();
430
431        let patches = Patches::new(
432            arr.len(),
433            0,
434            PrimitiveArray::new(buffer![0u32], Validity::AllValid).into_array(),
435            BoolArray::from(BooleanBuffer::new_unset(1)).into_array(),
436            None,
437        );
438        let arr = arr.patch(&patches);
439        assert_eq!(arr.boolean_buffer().sliced().as_ptr(), buf_ptr);
440
441        let (values, _byte_bit_offset) = arr.to_bool().into_boolean_builder();
442        assert_eq!(values.as_slice(), &[254, 127]);
443    }
444
445    #[test]
446    fn patch_sliced_bools_offset() {
447        let arr = BoolArray::from(BooleanBuffer::new_set(15));
448        let sliced = arr.slice(4..15);
449        let (values, offset) = sliced.to_bool().into_boolean_builder();
450        assert_eq!(offset, 4);
451        assert_eq!(values.as_slice(), &[255, 127]);
452    }
453}