arrow_buffer/buffer/null.rs
1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements. See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership. The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License. You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied. See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use crate::bit_iterator::{BitIndexIterator, BitIterator, BitSliceIterator};
19use crate::buffer::BooleanBuffer;
20use crate::{Buffer, MutableBuffer};
21
22/// A [`BooleanBuffer`] used to encode validity for arrow arrays
23///
24/// As per the [Arrow specification], array validity is encoded in a packed bitmask with a
25/// `true` value indicating the corresponding slot is not null, and `false` indicating
26/// that it is null.
27///
28/// [Arrow specification]: https://arrow.apache.org/docs/format/Columnar.html#validity-bitmaps
29#[derive(Debug, Clone, Eq, PartialEq)]
30pub struct NullBuffer {
31 buffer: BooleanBuffer,
32 null_count: usize,
33}
34
35impl NullBuffer {
36 /// Create a new [`NullBuffer`] computing the null count
37 pub fn new(buffer: BooleanBuffer) -> Self {
38 let null_count = buffer.len() - buffer.count_set_bits();
39 Self { buffer, null_count }
40 }
41
42 /// Create a new [`NullBuffer`] of length `len` where all values are null
43 pub fn new_null(len: usize) -> Self {
44 Self {
45 buffer: BooleanBuffer::new_unset(len),
46 null_count: len,
47 }
48 }
49
50 /// Create a new [`NullBuffer`] of length `len` where all values are valid
51 ///
52 /// Note: it is more efficient to not set the null buffer if it is known to be all valid
53 pub fn new_valid(len: usize) -> Self {
54 Self {
55 buffer: BooleanBuffer::new_set(len),
56 null_count: 0,
57 }
58 }
59
60 /// Create a new [`NullBuffer`] with the provided `buffer` and `null_count`
61 ///
62 /// # Safety
63 ///
64 /// `buffer` must contain `null_count` `0` bits
65 pub unsafe fn new_unchecked(buffer: BooleanBuffer, null_count: usize) -> Self {
66 Self { buffer, null_count }
67 }
68
69 /// Computes the union of the nulls in two optional [`NullBuffer`]
70 ///
71 /// This is commonly used by binary operations where the result is NULL if either
72 /// of the input values is NULL. Handling the null mask separately in this way
73 /// can yield significant performance improvements over an iterator approach
74 pub fn union(lhs: Option<&NullBuffer>, rhs: Option<&NullBuffer>) -> Option<NullBuffer> {
75 match (lhs, rhs) {
76 (Some(lhs), Some(rhs)) => Some(Self::new(lhs.inner() & rhs.inner())),
77 (Some(n), None) | (None, Some(n)) => Some(n.clone()),
78 (None, None) => None,
79 }
80 }
81
82 /// Returns true if all nulls in `other` also exist in self
83 pub fn contains(&self, other: &NullBuffer) -> bool {
84 if other.null_count == 0 {
85 return true;
86 }
87 let lhs = self.inner().bit_chunks().iter_padded();
88 let rhs = other.inner().bit_chunks().iter_padded();
89 lhs.zip(rhs).all(|(l, r)| (l & !r) == 0)
90 }
91
92 /// Returns a new [`NullBuffer`] where each bit in the current null buffer
93 /// is repeated `count` times. This is useful for masking the nulls of
94 /// the child of a FixedSizeListArray based on its parent
95 pub fn expand(&self, count: usize) -> Self {
96 let capacity = self.buffer.len().checked_mul(count).unwrap();
97 let mut buffer = MutableBuffer::new_null(capacity);
98
99 // Expand each bit within `null_mask` into `element_len`
100 // bits, constructing the implicit mask of the child elements
101 for i in 0..self.buffer.len() {
102 if self.is_null(i) {
103 continue;
104 }
105 for j in 0..count {
106 crate::bit_util::set_bit(buffer.as_mut(), i * count + j)
107 }
108 }
109 Self {
110 buffer: BooleanBuffer::new(buffer.into(), 0, capacity),
111 null_count: self.null_count * count,
112 }
113 }
114
115 /// Returns the length of this [`NullBuffer`]
116 #[inline]
117 pub fn len(&self) -> usize {
118 self.buffer.len()
119 }
120
121 /// Returns the offset of this [`NullBuffer`] in bits
122 #[inline]
123 pub fn offset(&self) -> usize {
124 self.buffer.offset()
125 }
126
127 /// Returns true if this [`NullBuffer`] is empty
128 #[inline]
129 pub fn is_empty(&self) -> bool {
130 self.buffer.is_empty()
131 }
132
133 /// Free up unused memory.
134 pub fn shrink_to_fit(&mut self) {
135 self.buffer.shrink_to_fit();
136 }
137
138 /// Returns the null count for this [`NullBuffer`]
139 #[inline]
140 pub fn null_count(&self) -> usize {
141 self.null_count
142 }
143
144 /// Returns `true` if the value at `idx` is not null
145 #[inline]
146 pub fn is_valid(&self, idx: usize) -> bool {
147 self.buffer.value(idx)
148 }
149
150 /// Returns `true` if the value at `idx` is null
151 #[inline]
152 pub fn is_null(&self, idx: usize) -> bool {
153 !self.is_valid(idx)
154 }
155
156 /// Returns the packed validity of this [`NullBuffer`] not including any offset
157 #[inline]
158 pub fn validity(&self) -> &[u8] {
159 self.buffer.values()
160 }
161
162 /// Slices this [`NullBuffer`] by the provided `offset` and `length`
163 pub fn slice(&self, offset: usize, len: usize) -> Self {
164 Self::new(self.buffer.slice(offset, len))
165 }
166
167 /// Returns an iterator over the bits in this [`NullBuffer`]
168 ///
169 /// * `true` indicates that the corresponding value is not NULL
170 /// * `false` indicates that the corresponding value is NULL
171 ///
172 /// Note: [`Self::valid_indices`] will be significantly faster for most use-cases
173 pub fn iter(&self) -> BitIterator<'_> {
174 self.buffer.iter()
175 }
176
177 /// Returns a [`BitIndexIterator`] over the valid indices in this [`NullBuffer`]
178 ///
179 /// Valid indices indicate the corresponding value is not NULL
180 pub fn valid_indices(&self) -> BitIndexIterator<'_> {
181 self.buffer.set_indices()
182 }
183
184 /// Returns a [`BitSliceIterator`] yielding contiguous ranges of valid indices
185 ///
186 /// Valid indices indicate the corresponding value is not NULL
187 pub fn valid_slices(&self) -> BitSliceIterator<'_> {
188 self.buffer.set_slices()
189 }
190
191 /// Calls the provided closure for each index in this null mask that is set
192 #[inline]
193 pub fn try_for_each_valid_idx<E, F: FnMut(usize) -> Result<(), E>>(
194 &self,
195 f: F,
196 ) -> Result<(), E> {
197 if self.null_count == self.len() {
198 return Ok(());
199 }
200 self.valid_indices().try_for_each(f)
201 }
202
203 /// Returns the inner [`BooleanBuffer`]
204 #[inline]
205 pub fn inner(&self) -> &BooleanBuffer {
206 &self.buffer
207 }
208
209 /// Returns the inner [`BooleanBuffer`]
210 #[inline]
211 pub fn into_inner(self) -> BooleanBuffer {
212 self.buffer
213 }
214
215 /// Returns the underlying [`Buffer`]
216 #[inline]
217 pub fn buffer(&self) -> &Buffer {
218 self.buffer.inner()
219 }
220}
221
222impl<'a> IntoIterator for &'a NullBuffer {
223 type Item = bool;
224 type IntoIter = BitIterator<'a>;
225
226 fn into_iter(self) -> Self::IntoIter {
227 self.buffer.iter()
228 }
229}
230
231impl From<BooleanBuffer> for NullBuffer {
232 fn from(value: BooleanBuffer) -> Self {
233 Self::new(value)
234 }
235}
236
237impl From<&[bool]> for NullBuffer {
238 fn from(value: &[bool]) -> Self {
239 BooleanBuffer::from(value).into()
240 }
241}
242
243impl From<Vec<bool>> for NullBuffer {
244 fn from(value: Vec<bool>) -> Self {
245 BooleanBuffer::from(value).into()
246 }
247}
248
249impl FromIterator<bool> for NullBuffer {
250 fn from_iter<T: IntoIterator<Item = bool>>(iter: T) -> Self {
251 BooleanBuffer::from_iter(iter).into()
252 }
253}
254
255#[cfg(test)]
256mod tests {
257 use super::*;
258 #[test]
259 fn test_size() {
260 // This tests that the niche optimisation eliminates the overhead of an option
261 assert_eq!(
262 std::mem::size_of::<NullBuffer>(),
263 std::mem::size_of::<Option<NullBuffer>>()
264 );
265 }
266}