minarrow/traits/masked_array.rs
1// Copyright 2025 Peter Garfield Bower
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15//! # **MaskedArray Module** - *Standardises all Inner Array types and null handling in Minarrow*
16//!
17//! Defines the `MaskedArray` trait - the common interface for all nullable array types in Minarrow.
18//!
19//! This module standardises how arrays store and manage optional null bitmasks,
20//! ensuring consistent null-handling behaviour across fixed-width and variable-length arrays.
21//! It also provides default implementations for common mask operations to reduce duplication.
22
23use crate::{Bitmask, Length, Offset, enums::error::MinarrowError};
24
25/// # MaskedArray
26///
27/// MaskedArray is implemented by all inner, nullable arrays.
28///
29/// ### Purpose
30/// - MaskedArray ensures interface consistency across `BooleanArray`,
31/// `CategoricalArray`, `DatetimeArray`, `FloatArray`, `IntegerArray`
32/// and `StringArray`.
33/// - It avoids repeition through default boilerplate implementations,
34/// focusing on null value handling.
35/// - This serves to enforce the base pattern contract, and is either overriden
36/// on non-fixed width types (e.g., `BooleanArray`, `StringArray`), or, for fixed
37/// width types (e.g., `FloatArray`, `IntegerArray`), is supported by macros.
38pub trait MaskedArray {
39 /// The element type (e.g. `f32`, `bool`, etc.)
40 /// Or, utility type e.g., `Offsets` for cases
41 /// like `String`
42 type T: Default + PartialEq + Clone + Copy;
43
44 /// The backing store (e.g. `Vec64<Self::Elem>` or `Bitmask`)
45 type Container;
46
47 /// The logical type that the data carries
48 type LogicalType: Default;
49
50 /// The type that implements `Copy` (e.g., &str)
51 type CopyType: Default;
52
53 /// **************************************************
54 /// The below methods differ for the Boolean (bit-packed),
55 /// and String (variable-length) variants and thus are
56 /// implemented via macros for the standard variants,
57 /// and then implemented on those types directly
58 /// *************************************************
59
60 /// Returns the number of elements in the array.
61 fn len(&self) -> usize;
62
63 /// Returns true if the array is empty.
64 #[inline]
65 fn is_empty(&self) -> bool {
66 self.len() == 0
67 }
68
69 /// Returns a reference to the underlying data.
70 fn data(&self) -> &Self::Container;
71
72 /// Returns a mutable reference to the underlying data.
73 fn data_mut(&mut self) -> &mut Self::Container;
74
75 /// Retrieves the value at the given index, or None if null or beyond length.
76 fn get(&self, idx: usize) -> Option<Self::CopyType>;
77
78 /// Sets the value at the given index, updating the null‐mask.
79 fn set(&mut self, idx: usize, value: Self::LogicalType);
80
81 /// Like `get`, but skips the `idx >= len()` check.
82 unsafe fn get_unchecked(&self, idx: usize) -> Option<Self::CopyType>;
83
84 /// Like `set`, but skips bounds checks.
85 unsafe fn set_unchecked(&mut self, idx: usize, value: Self::LogicalType);
86
87 /// Low-level accessor for when working directly with
88 /// mutable array variants.
89 ///
90 /// Borrows with window parameters as a tuple,
91 /// for 'DIY' window access, retaining access to the whole original array.
92 ///
93 /// `Offset` and `Length` are `usize` aliases.
94 ///
95 /// For the standard zero-copy accessors, see the `View` trait.
96 fn tuple_ref(&self, offset: usize, len: usize) -> (&Self, Offset, Length) {
97 (&self, offset, len)
98 }
99
100 /// Returns an iterator over the T values in this array.
101 fn iter(&self) -> impl Iterator<Item = Self::CopyType> + '_;
102
103 /// Returns an iterator over the T values, as `Option<Self::T>`.
104 fn iter_opt(&self) -> impl Iterator<Item = Option<Self::CopyType>> + '_;
105
106 /// Returns an iterator over a range of T values in this array.
107 fn iter_range(&self, offset: usize, len: usize) -> impl Iterator<Item = Self::CopyType> + '_;
108
109 /// Returns an iterator over a range of T values, as `Option<T>`.
110 fn iter_opt_range(
111 &self,
112 offset: usize,
113 len: usize,
114 ) -> impl Iterator<Item = Option<Self::CopyType>> + '_;
115
116 /// Appends a value to the array, updating masks if present.
117 fn push(&mut self, value: Self::LogicalType);
118
119 /// Appends a value to the array, updating masks if present,
120 /// without bounds checks.
121 ///
122 /// # Safety
123 /// The caller must make sure there is enough pre-allocated
124 /// size in the array, and no thread contention.
125 unsafe fn push_unchecked(&mut self, value: Self::LogicalType);
126
127 /// Returns a logical slice of the MaskedArray<Self::T> [offset, offset+len)
128 /// as a new MaskedArray<Self::T> object via clone.
129 ///
130 /// Prefer `View` trait slicers for zero-copy.
131 fn slice_clone(&self, offset: usize, len: usize) -> Self;
132
133 /// Resizes the array to contain `n` elements, via a call into self.data.resize().
134 ///
135 /// If `n` is greater than the current length, new elements are added using `T::default()`.
136 /// If `n` is smaller, the array is truncated. This only affects the data buffer,
137 /// not the null mask.
138 fn resize(&mut self, n: usize, value: Self::LogicalType);
139
140 /// **************************************************
141 /// We handle null masks consistently across all variants
142 /// and thus their implementation sits on the trait, other
143 /// than trait methods that need to access data state.
144 /// **************************************************
145
146 /// Returns a reference to the optional null mask.
147 fn null_mask(&self) -> Option<&Bitmask>;
148
149 /// Returns true if the value at the given index is null.
150 #[inline]
151 fn is_null(&self, idx: usize) -> bool {
152 match &self.null_mask() {
153 Some(mask) => !mask.get(idx),
154 None => false,
155 }
156 }
157
158 /// Checks if the array has a null bitmask.
159 fn is_nullable(&self) -> bool {
160 self.null_mask().is_some()
161 }
162
163 /// Returns the total number of nulls.
164 fn null_count(&self) -> usize {
165 match self.null_mask().as_ref() {
166 Some(mask) => mask.count_zeros(),
167 None => 0,
168 }
169 }
170
171 /// Append a null value to the array, creating mask if needed
172 #[inline]
173 fn push_null(&mut self) {
174 self.push(Self::LogicalType::default());
175 let i = self.len() - 1;
176 match self.null_mask_mut() {
177 Some(m) => m.set(i, false),
178 None => {
179 let mut m = Bitmask::new_set_all(self.len(), true);
180 m.set(i, false);
181 self.set_null_mask(Some(m));
182 }
183 }
184 }
185
186 /// Returns a mutable reference to the optional null mask.
187 fn null_mask_mut(&mut self) -> Option<&mut Bitmask>;
188
189 /// Sets the null mask.
190 fn set_null_mask(&mut self, mask: Option<Bitmask>);
191
192 /// Appends a null value _without_ any bounds‐checks on the mask.
193 ///
194 /// # Safety
195 /// You must ensure that after `push`, the data and mask (if present)
196 /// have capacity for the new index, or you risk OOB on either.
197 #[inline(always)]
198 unsafe fn push_null_unchecked(&mut self) {
199 // first, append a default element
200 let idx = self.len();
201 unsafe { self.set_unchecked(idx, Self::LogicalType::default()) };
202
203 if let Some(mask) = self.null_mask_mut() {
204 // mark null
205 unsafe { mask.set_unchecked(idx, false) };
206 } else {
207 // initialise a new mask and mark this slot null
208 let mut m = Bitmask::new_set_all(idx, true);
209 unsafe { m.set_unchecked(idx, false) };
210 self.set_null_mask(Some(m));
211 }
212 }
213
214 /// Marks the value at the given index as null.
215 #[inline]
216 fn set_null(&mut self, idx: usize) {
217 if let Some(nmask) = &mut self.null_mask_mut() {
218 if nmask.len() <= idx {
219 nmask.resize(idx + 1, true);
220 }
221 nmask.set(idx, false);
222 } else {
223 let mut m = Bitmask::new_set_all(self.len(), true);
224 m.set(idx, false);
225 self.set_null_mask(Some(m));
226 }
227 }
228
229 /// Like `set_null`, but skips bounds checks.
230 #[inline(always)]
231 unsafe fn set_null_unchecked(&mut self, idx: usize) {
232 if let Some(mask) = self.null_mask_mut() {
233 mask.set(idx, false);
234 } else {
235 let mut m = Bitmask::new_set_all(self.len(), true);
236 m.set(idx, false);
237 self.set_null_mask(Some(m));
238 }
239 }
240
241 /// Bulk-extend this array with `n` null entries
242 #[inline]
243 fn push_nulls(&mut self, n: usize) {
244 let start = self.len();
245 let end = start + n;
246
247 self.resize(end, Self::LogicalType::default());
248
249 if let Some(mask) = self.null_mask_mut() {
250 mask.resize(end, false);
251 } else {
252 let mut m = Bitmask::new_set_all(end, true);
253 for i in start..end {
254 m.set(i, false);
255 }
256 self.set_null_mask(Some(m));
257 }
258 }
259
260 /// Bulk-extend this array with `n` null entries, using unchecked mask writes.
261 ///
262 /// # Safety
263 /// Caller must ensure there are no data races across threads on this mask.
264 #[inline(always)]
265 unsafe fn push_nulls_unchecked(&mut self, n: usize) {
266 let start = self.len();
267 let end = start + n;
268
269 self.resize(end, Self::LogicalType::default());
270
271 if let Some(mask) = self.null_mask_mut() {
272 mask.resize(end, true);
273 for i in 0..n {
274 unsafe { mask.set_unchecked(start + i, false) };
275 }
276 } else {
277 let mut m = Bitmask::new_set_all(end, true);
278 for i in start..end {
279 unsafe { m.set_unchecked(i, false) };
280 }
281 self.set_null_mask(Some(m));
282 }
283 }
284
285 /// Appends all values (and null mask if present) from `other` to `self`.
286 ///
287 /// The appended array must be of the same concrete type and element type.
288 ///
289 /// If this array is wrapped in a `FieldArray`, it will not be possible to
290 /// mutate the array without reconstructing first, and a `ChunkedArray`
291 /// is an alternative option.
292 fn append_array(&mut self, other: &Self);
293
294 /// Appends rows `[offset..offset+len)` from another array into self.
295 ///
296 /// Like `append_array` but for a sub-range. Data and null masks are
297 /// extended from the source range. The destination grows via its
298 /// backing allocator.
299 fn append_range(
300 &mut self,
301 other: &Self,
302 offset: usize,
303 len: usize,
304 ) -> Result<(), MinarrowError>;
305
306 /// Inserts all values (and null mask if present) from `other` into `self` at the specified index.
307 ///
308 /// The inserted array must be of the same concrete type and element type.
309 /// Elements at and after `index` are shifted to make room for the inserted values.
310 ///
311 /// # Performance
312 /// This is an **O(n)** operation, where n is the number of elements that need to be shifted.
313 /// For appending to the end, prefer `append_array` which is more efficient.
314 ///
315 /// # Arguments
316 /// * `index` - Position before which to insert (0 = prepend, self.len() = append)
317 /// * `other` - Array to insert
318 ///
319 /// # Errors
320 /// Returns an error if:
321 /// * `index > self.len()` (out of bounds)
322 /// * Type mismatch between self and other (for enum variants)
323 ///
324 /// # Example
325 /// ```ignore
326 /// let mut arr = IntegerArray::from(vec![1, 2, 5, 6]);
327 /// let insert = IntegerArray::from(vec![3, 4]);
328 /// arr.insert_rows(2, &insert)?; // Now: [1, 2, 3, 4, 5, 6]
329 /// ```
330 fn insert_rows(&mut self, index: usize, other: &Self) -> Result<(), MinarrowError>;
331
332 /// Splits this array at the specified index, consuming self and returning two arrays.
333 ///
334 /// The original array's memory is split between the two resulting arrays without copying.
335 /// The first array contains elements [0..index), the second contains [index..len).
336 ///
337 /// # Arguments
338 /// * `index` - Position to split at (0 < index < len)
339 ///
340 /// # Errors
341 /// - Returns an error if index == 0 or index >= len()
342 ///
343 /// # Returns
344 /// A tuple of (before, after) where before contains [0..index) and after contains [index..len)
345 fn split(self, index: usize) -> Result<(Self, Self), MinarrowError>
346 where
347 Self: Sized;
348
349 /// Extends the array from an iterator with pre-allocated capacity.
350 ///
351 /// Pre-allocates the specified additional capacity to avoid reallocations during bulk insertion,
352 /// providing optimal performance for large datasets where the final size is known in advance.
353 fn extend_from_iter_with_capacity<I>(&mut self, iter: I, additional_capacity: usize)
354 where
355 I: Iterator<Item = Self::LogicalType>;
356
357 /// Extends the array from a slice of values.
358 ///
359 /// More efficient than individual `push` operations as it pre-allocates capacity
360 /// and can use bulk copy operations for compatible data types. For variable-length
361 /// types like strings, calculates total byte requirements upfront.
362 fn extend_from_slice(&mut self, slice: &[Self::LogicalType]);
363
364 /// Creates a new array filled with the specified value repeated `count` times.
365 ///
366 /// Pre-allocates exact capacity to avoid reallocations and uses
367 /// efficient bulk operations where possible. For string types,
368 /// calculates total byte requirements to minimise memory overhead.
369 fn fill(value: Self::LogicalType, count: usize) -> Self;
370}