vortex_array/builders/primitive.rs
1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4use std::any::Any;
5use std::mem::MaybeUninit;
6
7use vortex_buffer::BufferMut;
8use vortex_dtype::{DType, NativePType, Nullability};
9use vortex_mask::Mask;
10
11use crate::arrays::PrimitiveArray;
12use crate::builders::{ArrayBuilder, DEFAULT_BUILDER_CAPACITY, LazyNullBufferBuilder};
13use crate::canonical::{Canonical, ToCanonical};
14use crate::{Array, ArrayRef, IntoArray};
15
16/// The builder for building a [`PrimitiveArray`], parametrized by the `PType`.
17pub struct PrimitiveBuilder<T> {
18 dtype: DType,
19 values: BufferMut<T>,
20 nulls: LazyNullBufferBuilder,
21}
22
23impl<T: NativePType> PrimitiveBuilder<T> {
24 /// Creates a new `PrimitiveBuilder` with a capacity of [`DEFAULT_BUILDER_CAPACITY`].
25 pub fn new(nullability: Nullability) -> Self {
26 Self::with_capacity(nullability, DEFAULT_BUILDER_CAPACITY)
27 }
28
29 /// Creates a new `PrimitiveBuilder` with the given `capacity`.
30 pub fn with_capacity(nullability: Nullability, capacity: usize) -> Self {
31 Self {
32 values: BufferMut::with_capacity(capacity),
33 nulls: LazyNullBufferBuilder::new(capacity),
34 dtype: DType::Primitive(T::PTYPE, nullability),
35 }
36 }
37
38 /// Appends a primitive `value` to the builder.
39 pub fn append_value(&mut self, value: T) {
40 self.values.push(value);
41 self.nulls.append_non_null();
42 }
43
44 /// Appends an optional primitive value to the builder.
45 ///
46 /// If the value is `Some`, it appends the primitive value. If the value is `None`, it appends a
47 /// null.
48 ///
49 /// # Panics
50 ///
51 /// This method will panic if the input is `None` and the builder is non-nullable.
52 pub(crate) fn append_option(&mut self, value: Option<T>) {
53 match value {
54 Some(value) => self.append_value(value),
55 None => self.append_null(),
56 }
57 }
58
59 /// Returns the raw primitive values in this builder as a slice.
60 pub fn values(&self) -> &[T] {
61 self.values.as_ref()
62 }
63
64 /// Create a new handle to the next `len` uninitialized values in the builder.
65 ///
66 /// All reads/writes through the handle to the values buffer or the validity buffer will operate
67 /// on indices relative to the start of the range.
68 ///
69 /// # Panics
70 ///
71 /// Panics if `len` is 0 or if the current length of the builder plus `len` would exceed the
72 /// capacity of the builder's memory.
73 ///
74 /// ## Example
75 ///
76 /// ```
77 /// use std::mem::MaybeUninit;
78 /// use vortex_array::builders::{ArrayBuilder, PrimitiveBuilder};
79 /// use vortex_dtype::Nullability;
80 ///
81 /// // Create a new builder.
82 /// let mut builder: PrimitiveBuilder<i32> =
83 /// PrimitiveBuilder::with_capacity(Nullability::NonNullable, 5);
84 ///
85 /// // Populate the values.
86 /// let mut uninit_range = builder.uninit_range(5);
87 /// uninit_range.copy_from_slice(0, &[0, 1, 2, 3, 4]);
88 ///
89 /// // SAFETY: We have initialized all 5 values in the range, and since the array builder is
90 /// // non-nullable, we don't need to set any null bits.
91 /// unsafe { uninit_range.finish(); }
92 ///
93 /// let built = builder.finish_into_primitive();
94 ///
95 /// assert_eq!(built.as_slice::<i32>(), &[0i32, 1, 2, 3, 4]);
96 /// ```
97 pub fn uninit_range(&mut self, len: usize) -> UninitRange<'_, T> {
98 assert_ne!(0, len, "cannot create an uninit range of length 0");
99
100 let current_len = self.values.len();
101 assert!(
102 current_len + len <= self.values.capacity(),
103 "uninit_range of len {len} exceeds builder capacity {}",
104 self.values.capacity()
105 );
106
107 UninitRange { len, builder: self }
108 }
109
110 /// Finishes the builder directly into a [`PrimitiveArray`].
111 pub fn finish_into_primitive(&mut self) -> PrimitiveArray {
112 let validity = self
113 .nulls
114 .finish_with_nullability(self.dtype().nullability());
115
116 PrimitiveArray::new(std::mem::take(&mut self.values).freeze(), validity)
117 }
118
119 /// Extends the primitive array with an iterator.
120 pub fn extend_with_iterator(&mut self, iter: impl IntoIterator<Item = T>, mask: Mask) {
121 self.values.extend(iter);
122 self.nulls.append_validity_mask(mask);
123 }
124}
125
126impl<T: NativePType> ArrayBuilder for PrimitiveBuilder<T> {
127 fn as_any(&self) -> &dyn Any {
128 self
129 }
130
131 fn as_any_mut(&mut self) -> &mut dyn Any {
132 self
133 }
134
135 fn dtype(&self) -> &DType {
136 &self.dtype
137 }
138
139 fn len(&self) -> usize {
140 self.values.len()
141 }
142
143 fn append_zeros(&mut self, n: usize) {
144 self.values.push_n(T::default(), n);
145 self.nulls.append_n_non_nulls(n);
146 }
147
148 unsafe fn append_nulls_unchecked(&mut self, n: usize) {
149 self.values.push_n(T::default(), n);
150 self.nulls.append_n_nulls(n);
151 }
152
153 unsafe fn extend_from_array_unchecked(&mut self, array: &dyn Array) {
154 let array = array.to_primitive();
155
156 // This should be checked in `extend_from_array` but we can check it again.
157 debug_assert_eq!(
158 array.ptype(),
159 T::PTYPE,
160 "Cannot extend from array with different ptype"
161 );
162
163 self.values.extend_from_slice(array.as_slice::<T>());
164 self.nulls.append_validity_mask(array.validity_mask());
165 }
166
167 fn ensure_capacity(&mut self, capacity: usize) {
168 if capacity > self.values.capacity() {
169 self.values.reserve(capacity - self.values.len());
170 self.nulls.ensure_capacity(capacity);
171 }
172 }
173
174 fn set_validity(&mut self, validity: Mask) {
175 self.nulls = LazyNullBufferBuilder::new(validity.len());
176 self.nulls.append_validity_mask(validity);
177 }
178
179 fn finish(&mut self) -> ArrayRef {
180 self.finish_into_primitive().into_array()
181 }
182
183 fn finish_into_canonical(&mut self) -> Canonical {
184 Canonical::Primitive(self.finish_into_primitive())
185 }
186}
187
188/// A range of uninitialized values in the primitive builder that can be filled.
189pub struct UninitRange<'a, T> {
190 /// The length of the uninitialized range.
191 ///
192 /// This is guaranteed to be within the memory capacity of the builder.
193 len: usize,
194
195 /// A mutable reference to the builder.
196 ///
197 /// Since this is a mutable reference, we can guarantee that nothing else can modify the builder
198 /// while this `UninitRange` exists.
199 builder: &'a mut PrimitiveBuilder<T>,
200}
201
202impl<T> UninitRange<'_, T> {
203 /// Returns the length of this uninitialized range.
204 #[inline]
205 pub fn len(&self) -> usize {
206 self.len
207 }
208
209 /// Returns true if this range has zero length.
210 #[inline]
211 pub fn is_empty(&self) -> bool {
212 self.len == 0
213 }
214
215 /// Set a value at the given index within this range.
216 ///
217 /// # Panics
218 ///
219 /// Panics if the index is out of bounds.
220 #[inline]
221 pub fn set_value(&mut self, index: usize, value: T) {
222 assert!(index < self.len, "index out of bounds");
223 let spare = self.builder.values.spare_capacity_mut();
224 spare[index] = MaybeUninit::new(value);
225 }
226
227 /// Append a [`Mask`] to this builder's null buffer.
228 ///
229 /// # Panics
230 ///
231 /// Panics if the mask length is not equal to the the length of the current `UninitRange`.
232 ///
233 /// # Safety
234 ///
235 /// - The caller must ensure that they safely initialize `mask.len()` primitive values via
236 /// [`UninitRange::copy_from_slice`].
237 /// - The caller must also ensure that they only call this method once.
238 pub unsafe fn append_mask(&mut self, mask: Mask) {
239 assert_eq!(
240 mask.len(),
241 self.len,
242 "Tried to append a mask to an `UninitRange` that was beyond the allowed range"
243 );
244
245 // TODO(connor): Ideally, we would call this function `set_mask` and directly set all of the
246 // bits (so that we can call this multiple times), but the underlying `BooleanBuffer` does
247 // not have an easy way to do this correctly.
248
249 self.builder.nulls.append_validity_mask(mask);
250 }
251
252 /// Set a validity bit at the given index.
253 ///
254 /// The index is relative to the start of this range (not relative to the values already in the
255 /// builder).
256 ///
257 /// Note that this will have no effect if the builder is non-nullable.
258 pub fn set_bit(&mut self, index: usize, v: bool) {
259 assert!(index < self.len, "set_bit index out of bounds");
260 // Note that this won't panic because we can only create an `UninitRange` within the
261 // capacity of the builder (it will not automatically resize).
262 let absolute_index = self.builder.values.len() + index;
263 self.builder.nulls.set_bit(absolute_index, v);
264 }
265
266 /// Set values from an initialized range.
267 ///
268 /// Note that the input `offset` should be an offset relative to the local `UninitRange`, not
269 /// the entire `PrimitiveBuilder`.
270 pub fn copy_from_slice(&mut self, local_offset: usize, src: &[T])
271 where
272 T: Copy,
273 {
274 debug_assert!(
275 local_offset + src.len() <= self.len,
276 "tried to copy a slice into a `UninitRange` past its boundary"
277 );
278
279 // SAFETY: &[T] and &[MaybeUninit<T>] have the same layout.
280 let uninit_src: &[MaybeUninit<T>] = unsafe { std::mem::transmute(src) };
281
282 // Note: spare_capacity_mut() returns the spare capacity starting from the current length,
283 // so we just use local_offset directly.
284 let dst =
285 &mut self.builder.values.spare_capacity_mut()[local_offset..local_offset + src.len()];
286 dst.copy_from_slice(uninit_src);
287 }
288
289 /// Get a mutable slice of uninitialized memory at the specified offset within this range.
290 ///
291 /// Note that the offsets are relative to this local range, not to the values already in the
292 /// builder.
293 ///
294 /// # Safety
295 ///
296 /// The caller must ensure that they properly initialize the returned memory before calling
297 /// `finish()` on this range.
298 ///
299 /// # Panics
300 ///
301 /// Panics if `offset + len` exceeds the range bounds.
302 pub unsafe fn slice_uninit_mut(&mut self, offset: usize, len: usize) -> &mut [MaybeUninit<T>] {
303 assert!(
304 offset + len <= self.len,
305 "slice_uninit_mut: offset {} + len {} exceeds range length {}",
306 offset,
307 len,
308 self.len
309 );
310 &mut self.builder.values.spare_capacity_mut()[offset..offset + len]
311 }
312
313 /// Finish building this range, marking it as initialized and advancing the length of the
314 /// underlying values buffer.
315 ///
316 /// # Safety
317 ///
318 /// The caller must ensure that they have safely initialized all `len` values via
319 /// [`UninitRange::copy_from_slice`] as well as correctly set all of the null bits via
320 /// [`set_bit`] or [`append_mask`] if the builder is nullable.
321 ///
322 /// [`set_bit`]: UninitRange::set_bit
323 /// [`append_mask`]: UninitRange::append_mask
324 pub unsafe fn finish(self) {
325 // SAFETY: constructor enforces that current length + len does not exceed the capacity of the array.
326 let new_len = self.builder.values.len() + self.len;
327 unsafe { self.builder.values.set_len(new_len) };
328 }
329}
330
#[cfg(test)]
mod tests {
    use super::*;

    /// REGRESSION TEST: consecutive uninit ranges must each begin where the previous one ended.
    ///
    /// This would have caught the `Deref` bug where it always returned from the start of the
    /// buffer.
    #[test]
    fn test_multiple_uninit_ranges_correct_offsets() {
        let mut builder = PrimitiveBuilder::<i32>::with_capacity(Nullability::NonNullable, 10);

        // Fill and commit the first range.
        let mut first = builder.uninit_range(3);
        first.copy_from_slice(0, &[1, 2, 3]);
        // SAFETY: all 3 values of the range were initialized above.
        unsafe { first.finish() };

        // The builder must now expose exactly those values.
        assert_eq!(builder.values(), &[1, 2, 3]);

        // Fill and commit a second range - this would fail with the old Deref implementation.
        let mut second = builder.uninit_range(2);
        second.copy_from_slice(0, &[4, 5]);
        // SAFETY: both values of the range were initialized above.
        unsafe { second.finish() };

        // All 5 values must be visible, in order.
        assert_eq!(builder.values(), &[1, 2, 3, 4, 5]);

        let built = builder.finish_into_primitive();
        assert_eq!(built.as_slice::<i32>(), &[1, 2, 3, 4, 5]);
    }

    /// REGRESSION TEST: `append_mask` lives on `UninitRange` (it used to be on
    /// `PrimitiveBuilder`, which was confusing when used together with ranges).
    ///
    /// This test ensures the new API works correctly.
    #[test]
    fn test_append_mask_on_uninit_range() {
        let mut builder = PrimitiveBuilder::<i32>::with_capacity(Nullability::Nullable, 5);
        let mut range = builder.uninit_range(3);

        // A validity mask covering exactly the 3 values of the range.
        let validity = Mask::from_iter([true, false, true]);

        // SAFETY: the matching values are initialized right below.
        unsafe { range.append_mask(validity) };

        range.copy_from_slice(0, &[10, 20, 30]);

        // SAFETY: all values are initialized and the mask has been appended.
        unsafe { range.finish() };

        let built = builder.finish_into_primitive();
        assert_eq!(built.len(), 3);

        // `scalar_at(i).is_null()` is true exactly where the mask was false.
        for (index, expected_null) in [false, true, false].into_iter().enumerate() {
            assert_eq!(built.scalar_at(index).is_null(), expected_null);
        }
    }

    /// REGRESSION TEST: `append_mask` must reject a mask whose length differs from the range.
    ///
    /// This ensures that masks can only be appended if they match the range length.
    #[test]
    #[should_panic(
        expected = "Tried to append a mask to an `UninitRange` that was beyond the allowed range"
    )]
    fn test_append_mask_wrong_length_panics() {
        let mut builder = PrimitiveBuilder::<i32>::with_capacity(Nullability::Nullable, 10);
        let mut range = builder.uninit_range(5);

        // The range holds 5 values but this mask only covers 3.
        let too_short = Mask::from_iter([true, false, true]);

        // SAFETY: the call is expected to panic on the length mismatch before doing anything.
        unsafe { range.append_mask(too_short) };
    }

    /// `copy_from_slice` must honor the range-relative offset of each call.
    ///
    /// This verifies the new simplified API without the redundant `len` parameter.
    #[test]
    fn test_copy_from_slice_with_offsets() {
        let mut builder = PrimitiveBuilder::<i32>::with_capacity(Nullability::NonNullable, 10);
        let mut range = builder.uninit_range(6);

        // Three back-to-back copies, each at its own offset.
        for (offset, chunk) in [(0, [1, 2]), (2, [3, 4]), (4, [5, 6])] {
            range.copy_from_slice(offset, &chunk);
        }

        // SAFETY: the three copies above cover all 6 values.
        unsafe { range.finish() };

        let built = builder.finish_into_primitive();
        assert_eq!(built.as_slice::<i32>(), &[1, 2, 3, 4, 5, 6]);
    }

    /// `set_bit` must index relative to the start of the range.
    ///
    /// Note: `set_bit` requires the null buffer to already be initialized, so we first
    /// use `append_mask` to set up the buffer, then demonstrate that `set_bit` can
    /// modify individual bits with relative indexing.
    #[test]
    fn test_set_bit_relative_indexing() {
        let mut builder = PrimitiveBuilder::<i32>::with_capacity(Nullability::Nullable, 10);

        // Pre-populate the builder so the range does not start at absolute index 0.
        for value in [100, 200] {
            builder.append_value(value);
        }

        let mut range = builder.uninit_range(3);

        // Start with every slot of the range marked null.
        let all_null = Mask::from_iter([false; 3]);
        // SAFETY: the matching values are initialized below.
        unsafe { range.append_mask(all_null) };

        // Flip the first and third slots to valid; the middle slot stays null.
        range.set_bit(0, true);
        range.set_bit(2, true);

        range.copy_from_slice(0, &[10, 20, 30]);

        // SAFETY: all 3 values are initialized and their validity is set.
        unsafe { range.finish() };

        let built = builder.finish_into_primitive();

        assert_eq!(built.len(), 5);
        assert_eq!(built.as_slice::<i32>(), &[100, 200, 10, 20, 30]);

        // Indices 0-1 come from `append_value` (valid); indices 2-4 are the range, whose middle
        // element was left null.
        for (index, expected_null) in [false, false, false, true, false].into_iter().enumerate() {
            assert_eq!(built.scalar_at(index).is_null(), expected_null);
        }
    }

    /// Requesting a zero-length uninit range must panic.
    #[test]
    #[should_panic(expected = "cannot create an uninit range of length 0")]
    fn test_zero_length_uninit_range_panics() {
        let mut builder = PrimitiveBuilder::<i32>::new(Nullability::NonNullable);
        let _unused = builder.uninit_range(0);
    }

    /// Requesting an uninit range larger than the builder's capacity must panic.
    #[test]
    #[should_panic(expected = "uninit_range of len 10 exceeds builder capacity")]
    fn test_uninit_range_exceeds_capacity_panics() {
        let mut builder = PrimitiveBuilder::<i32>::with_capacity(Nullability::NonNullable, 5);
        let _unused = builder.uninit_range(10);
    }

    /// `copy_from_slice` debug-asserts on writes past the end of the range.
    ///
    /// Note: This only panics in debug mode due to `debug_assert!`.
    #[test]
    #[cfg(debug_assertions)]
    #[should_panic(expected = "tried to copy a slice into a `UninitRange` past its boundary")]
    fn test_copy_from_slice_out_of_bounds() {
        let mut builder = PrimitiveBuilder::<i32>::with_capacity(Nullability::NonNullable, 10);
        let mut range = builder.uninit_range(3);

        // 3 elements starting at offset 1 would need 4 slots, one more than the range has.
        range.copy_from_slice(1, &[1, 2, 3]);
    }

    /// Demonstrates correct usage of the unsafe `finish` method: set validity, initialize every
    /// value, then finish.
    #[test]
    fn test_finish_unsafe_contract() {
        let mut builder = PrimitiveBuilder::<i32>::with_capacity(Nullability::Nullable, 5);
        let mut range = builder.uninit_range(3);

        let validity = Mask::from_iter([true, true, false]);
        // SAFETY: the matching number of values is initialized right below.
        unsafe { range.append_mask(validity) };

        range.copy_from_slice(0, &[10, 20, 30]);

        // SAFETY: all 3 values are initialized and their validity is set.
        unsafe { range.finish() };

        let built = builder.finish_into_primitive();
        assert_eq!(built.len(), 3);
        assert_eq!(built.as_slice::<i32>(), &[10, 20, 30]);
    }
}
558}