Skip to main content

minarrow/traits/
masked_array.rs

1// Copyright 2025 Peter Garfield Bower
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15//! # **MaskedArray Module** - *Standardises all Inner Array types and null handling in Minarrow*
16//!
17//! Defines the `MaskedArray` trait - the common interface for all nullable array types in Minarrow.
18//!
19//! This module standardises how arrays store and manage optional null bitmasks,
20//! ensuring consistent null-handling behaviour across fixed-width and variable-length arrays.
21//! It also provides default implementations for common mask operations to reduce duplication.
22
23use crate::{Bitmask, Length, Offset, enums::error::MinarrowError};
24
25/// # MaskedArray
26///
27/// MaskedArray is implemented by all inner, nullable arrays.
28///
29/// ### Purpose
30/// - MaskedArray ensures interface consistency across `BooleanArray`,
31/// `CategoricalArray`, `DatetimeArray`, `FloatArray`, `IntegerArray`
32/// and `StringArray`.
33/// - It avoids repeition through default boilerplate implementations,
34/// focusing on null value handling.
35/// - This serves to enforce the base pattern contract, and is either overriden
36/// on non-fixed width types (e.g., `BooleanArray`, `StringArray`), or, for fixed
37/// width types (e.g., `FloatArray`, `IntegerArray`), is supported by macros.
38pub trait MaskedArray {
39    /// The element type (e.g. `f32`, `bool`, etc.)
40    /// Or, utility type e.g., `Offsets` for cases
41    /// like `String`
42    type T: Default + PartialEq + Clone + Copy;
43
44    /// The backing store (e.g. `Vec64<Self::Elem>` or `Bitmask`)
45    type Container;
46
47    /// The logical type that the data carries
48    type LogicalType: Default;
49
50    /// The type that implements `Copy` (e.g., &str)
51    type CopyType: Default;
52
53    /// **************************************************
54    /// The below methods differ for the Boolean (bit-packed),
55    /// and String (variable-length) variants and thus are
56    /// implemented via macros for the standard variants,
57    /// and then implemented on those types directly
58    /// *************************************************
59
60    /// Returns the number of elements in the array.
61    fn len(&self) -> usize;
62
63    /// Returns true if the array is empty.
64    #[inline]
65    fn is_empty(&self) -> bool {
66        self.len() == 0
67    }
68
69    /// Returns a reference to the underlying data.
70    fn data(&self) -> &Self::Container;
71
72    /// Returns a mutable reference to the underlying data.
73    fn data_mut(&mut self) -> &mut Self::Container;
74
75    /// Retrieves the value at the given index, or None if null or beyond length.
76    fn get(&self, idx: usize) -> Option<Self::CopyType>;
77
78    /// Sets the value at the given index, updating the null‐mask.
79    fn set(&mut self, idx: usize, value: Self::LogicalType);
80
81    /// Like `get`, but skips the `idx >= len()` check.
82    unsafe fn get_unchecked(&self, idx: usize) -> Option<Self::CopyType>;
83
84    /// Like `set`, but skips bounds checks.
85    unsafe fn set_unchecked(&mut self, idx: usize, value: Self::LogicalType);
86
87    /// Low-level accessor for when working directly with
88    /// mutable array variants.
89    ///
90    /// Borrows with window parameters as a tuple,
91    /// for 'DIY' window access, retaining access to the whole original array.
92    ///
93    /// `Offset` and `Length` are `usize` aliases.
94    ///
95    /// For the standard zero-copy accessors, see the `View` trait.
96    fn tuple_ref(&self, offset: usize, len: usize) -> (&Self, Offset, Length) {
97        (&self, offset, len)
98    }
99
100    /// Returns an iterator over the T values in this array.
101    fn iter(&self) -> impl Iterator<Item = Self::CopyType> + '_;
102
103    /// Returns an iterator over the T values, as `Option<Self::T>`.
104    fn iter_opt(&self) -> impl Iterator<Item = Option<Self::CopyType>> + '_;
105
106    /// Returns an iterator over a range of T values in this array.
107    fn iter_range(&self, offset: usize, len: usize) -> impl Iterator<Item = Self::CopyType> + '_;
108
109    /// Returns an iterator over a range of T values, as `Option<T>`.
110    fn iter_opt_range(
111        &self,
112        offset: usize,
113        len: usize,
114    ) -> impl Iterator<Item = Option<Self::CopyType>> + '_;
115
116    /// Appends a value to the array, updating masks if present.
117    fn push(&mut self, value: Self::LogicalType);
118
119    /// Appends a value to the array, updating masks if present,
120    /// without bounds checks.
121    ///
122    /// # Safety
123    /// The caller must make sure there is enough pre-allocated
124    /// size in the array, and no thread contention.
125    unsafe fn push_unchecked(&mut self, value: Self::LogicalType);
126
127    /// Returns a logical slice of the MaskedArray<Self::T> [offset, offset+len)
128    /// as a new MaskedArray<Self::T> object via clone.
129    ///
130    /// Prefer `View` trait slicers for zero-copy.
131    fn slice_clone(&self, offset: usize, len: usize) -> Self;
132
133    /// Resizes the array to contain `n` elements, via a call into self.data.resize().
134    ///
135    /// If `n` is greater than the current length, new elements are added using `T::default()`.
136    /// If `n` is smaller, the array is truncated. This only affects the data buffer,
137    /// not the null mask.
138    fn resize(&mut self, n: usize, value: Self::LogicalType);
139
140    /// **************************************************
141    ///  We handle null masks consistently across all variants
142    /// and thus their implementation sits on the trait, other
143    /// than trait methods that need to access data state.
144    /// **************************************************
145
146    /// Returns a reference to the optional null mask.
147    fn null_mask(&self) -> Option<&Bitmask>;
148
149    /// Returns true if the value at the given index is null.
150    #[inline]
151    fn is_null(&self, idx: usize) -> bool {
152        match &self.null_mask() {
153            Some(mask) => !mask.get(idx),
154            None => false,
155        }
156    }
157
158    /// Checks if the array has a null bitmask.
159    fn is_nullable(&self) -> bool {
160        self.null_mask().is_some()
161    }
162
163    /// Returns the total number of nulls.
164    fn null_count(&self) -> usize {
165        match self.null_mask().as_ref() {
166            Some(mask) => mask.count_zeros(),
167            None => 0,
168        }
169    }
170
171    /// Append a null value to the array, creating mask if needed
172    #[inline]
173    fn push_null(&mut self) {
174        self.push(Self::LogicalType::default());
175        let i = self.len() - 1;
176        match self.null_mask_mut() {
177            Some(m) => m.set(i, false),
178            None => {
179                let mut m = Bitmask::new_set_all(self.len(), true);
180                m.set(i, false);
181                self.set_null_mask(Some(m));
182            }
183        }
184    }
185
186    /// Returns a mutable reference to the optional null mask.
187    fn null_mask_mut(&mut self) -> Option<&mut Bitmask>;
188
189    /// Sets the null mask.
190    fn set_null_mask(&mut self, mask: Option<Bitmask>);
191
192    /// Appends a null value _without_ any bounds‐checks on the mask.
193    ///
194    /// # Safety
195    /// You must ensure that after `push`, the data and mask (if present)
196    /// have capacity for the new index, or you risk OOB on either.
197    #[inline(always)]
198    unsafe fn push_null_unchecked(&mut self) {
199        // first, append a default element
200        let idx = self.len();
201        unsafe { self.set_unchecked(idx, Self::LogicalType::default()) };
202
203        if let Some(mask) = self.null_mask_mut() {
204            // mark null
205            unsafe { mask.set_unchecked(idx, false) };
206        } else {
207            // initialise a new mask and mark this slot null
208            let mut m = Bitmask::new_set_all(idx, true);
209            unsafe { m.set_unchecked(idx, false) };
210            self.set_null_mask(Some(m));
211        }
212    }
213
214    /// Marks the value at the given index as null.
215    #[inline]
216    fn set_null(&mut self, idx: usize) {
217        if let Some(nmask) = &mut self.null_mask_mut() {
218            if nmask.len() <= idx {
219                nmask.resize(idx + 1, true);
220            }
221            nmask.set(idx, false);
222        } else {
223            let mut m = Bitmask::new_set_all(self.len(), true);
224            m.set(idx, false);
225            self.set_null_mask(Some(m));
226        }
227    }
228
229    /// Like `set_null`, but skips bounds checks.
230    #[inline(always)]
231    unsafe fn set_null_unchecked(&mut self, idx: usize) {
232        if let Some(mask) = self.null_mask_mut() {
233            mask.set(idx, false);
234        } else {
235            let mut m = Bitmask::new_set_all(self.len(), true);
236            m.set(idx, false);
237            self.set_null_mask(Some(m));
238        }
239    }
240
241    /// Bulk-extend this array with `n` null entries
242    #[inline]
243    fn push_nulls(&mut self, n: usize) {
244        let start = self.len();
245        let end = start + n;
246
247        self.resize(end, Self::LogicalType::default());
248
249        if let Some(mask) = self.null_mask_mut() {
250            mask.resize(end, false);
251        } else {
252            let mut m = Bitmask::new_set_all(end, true);
253            for i in start..end {
254                m.set(i, false);
255            }
256            self.set_null_mask(Some(m));
257        }
258    }
259
260    /// Bulk-extend this array with `n` null entries, using unchecked mask writes.
261    ///
262    /// # Safety
263    /// Caller must ensure there are no data races across threads on this mask.
264    #[inline(always)]
265    unsafe fn push_nulls_unchecked(&mut self, n: usize) {
266        let start = self.len();
267        let end = start + n;
268
269        self.resize(end, Self::LogicalType::default());
270
271        if let Some(mask) = self.null_mask_mut() {
272            mask.resize(end, true);
273            for i in 0..n {
274                unsafe { mask.set_unchecked(start + i, false) };
275            }
276        } else {
277            let mut m = Bitmask::new_set_all(end, true);
278            for i in start..end {
279                unsafe { m.set_unchecked(i, false) };
280            }
281            self.set_null_mask(Some(m));
282        }
283    }
284
285    /// Appends all values (and null mask if present) from `other` to `self`.
286    ///
287    /// The appended array must be of the same concrete type and element type.
288    ///
289    /// If this array is wrapped in a `FieldArray`, it will not be possible to
290    /// mutate the array without reconstructing first, and a `ChunkedArray`
291    /// is an alternative option.
292    fn append_array(&mut self, other: &Self);
293
294    /// Appends rows `[offset..offset+len)` from another array into self.
295    ///
296    /// Like `append_array` but for a sub-range. Data and null masks are
297    /// extended from the source range. The destination grows via its
298    /// backing allocator.
299    fn append_range(
300        &mut self,
301        other: &Self,
302        offset: usize,
303        len: usize,
304    ) -> Result<(), MinarrowError>;
305
306    /// Inserts all values (and null mask if present) from `other` into `self` at the specified index.
307    ///
308    /// The inserted array must be of the same concrete type and element type.
309    /// Elements at and after `index` are shifted to make room for the inserted values.
310    ///
311    /// # Performance
312    /// This is an **O(n)** operation, where n is the number of elements that need to be shifted.
313    /// For appending to the end, prefer `append_array` which is more efficient.
314    ///
315    /// # Arguments
316    /// * `index` - Position before which to insert (0 = prepend, self.len() = append)
317    /// * `other` - Array to insert
318    ///
319    /// # Errors
320    /// Returns an error if:
321    /// * `index > self.len()` (out of bounds)
322    /// * Type mismatch between self and other (for enum variants)
323    ///
324    /// # Example
325    /// ```ignore
326    /// let mut arr = IntegerArray::from(vec![1, 2, 5, 6]);
327    /// let insert = IntegerArray::from(vec![3, 4]);
328    /// arr.insert_rows(2, &insert)?; // Now: [1, 2, 3, 4, 5, 6]
329    /// ```
330    fn insert_rows(&mut self, index: usize, other: &Self) -> Result<(), MinarrowError>;
331
332    /// Splits this array at the specified index, consuming self and returning two arrays.
333    ///
334    /// The original array's memory is split between the two resulting arrays without copying.
335    /// The first array contains elements [0..index), the second contains [index..len).
336    ///
337    /// # Arguments
338    /// * `index` - Position to split at (0 < index < len)
339    ///
340    /// # Errors
341    /// - Returns an error if index == 0 or index >= len()
342    ///
343    /// # Returns
344    /// A tuple of (before, after) where before contains [0..index) and after contains [index..len)
345    fn split(self, index: usize) -> Result<(Self, Self), MinarrowError>
346    where
347        Self: Sized;
348
349    /// Extends the array from an iterator with pre-allocated capacity.
350    ///
351    /// Pre-allocates the specified additional capacity to avoid reallocations during bulk insertion,
352    /// providing optimal performance for large datasets where the final size is known in advance.
353    fn extend_from_iter_with_capacity<I>(&mut self, iter: I, additional_capacity: usize)
354    where
355        I: Iterator<Item = Self::LogicalType>;
356
357    /// Extends the array from a slice of values.
358    ///
359    /// More efficient than individual `push` operations as it pre-allocates capacity
360    /// and can use bulk copy operations for compatible data types. For variable-length
361    /// types like strings, calculates total byte requirements upfront.
362    fn extend_from_slice(&mut self, slice: &[Self::LogicalType]);
363
364    /// Creates a new array filled with the specified value repeated `count` times.
365    ///
366    /// Pre-allocates exact capacity to avoid reallocations and uses
367    /// efficient bulk operations where possible. For string types,
368    /// calculates total byte requirements to minimise memory overhead.
369    fn fill(value: Self::LogicalType, count: usize) -> Self;
370}