vortex_array/builders/mod.rs
1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4//! Builders for Vortex arrays.
5//!
6//! Every logical type in Vortex has a canonical (uncompressed) in-memory encoding. This module
7//! provides pre-allocated builders to construct new canonical arrays.
8//!
9//! ## Example:
10//!
11//! ```
12//! use vortex_array::builders::{builder_with_capacity, ArrayBuilder};
13//! use vortex_array::dtype::{DType, Nullability};
14//!
15//! // Create a new builder for string data.
16//! let mut builder = builder_with_capacity(&DType::Utf8(Nullability::NonNullable), 4);
17//!
18//! builder.append_scalar(&"a".into()).unwrap();
19//! builder.append_scalar(&"b".into()).unwrap();
20//! builder.append_scalar(&"c".into()).unwrap();
21//! builder.append_scalar(&"d".into()).unwrap();
22//!
23//! let strings = builder.finish();
24//!
25//! assert_eq!(strings.scalar_at(0).unwrap(), "a".into());
26//! assert_eq!(strings.scalar_at(1).unwrap(), "b".into());
27//! assert_eq!(strings.scalar_at(2).unwrap(), "c".into());
28//! assert_eq!(strings.scalar_at(3).unwrap(), "d".into());
29//! ```
30
31use std::any::Any;
32use std::sync::Arc;
33
34use vortex_error::VortexResult;
35use vortex_error::vortex_panic;
36use vortex_mask::Mask;
37
38use crate::ArrayRef;
39use crate::canonical::Canonical;
40use crate::dtype::DType;
41use crate::match_each_decimal_value_type;
42use crate::match_each_native_ptype;
43use crate::memory::HostAllocatorRef;
44use crate::scalar::Scalar;
45
46mod lazy_null_builder;
47pub(crate) use lazy_null_builder::LazyBitBufferBuilder;
48
49mod bool;
50mod decimal;
51pub mod dict;
52mod extension;
53mod fixed_size_list;
54mod list;
55mod listview;
56mod null;
57mod primitive;
58mod struct_;
59mod varbinview;
60
61pub use bool::*;
62pub use decimal::*;
63pub use extension::*;
64pub use fixed_size_list::*;
65pub use list::*;
66pub use listview::*;
67pub use null::*;
68pub use primitive::*;
69pub use struct_::*;
70pub use varbinview::*;
71
72#[cfg(test)]
73mod tests;
74
75/// The default capacity for builders.
76///
/// This matches the default capacity used by Arrow's array builders.
78pub const DEFAULT_BUILDER_CAPACITY: usize = 1024;
79
80pub trait ArrayBuilder: Send {
81 fn as_any(&self) -> &dyn Any;
82
83 fn as_any_mut(&mut self) -> &mut dyn Any;
84
85 fn dtype(&self) -> &DType;
86
87 fn len(&self) -> usize;
88
89 fn is_empty(&self) -> bool {
90 self.len() == 0
91 }
92
93 /// Append a "zero" value to the array.
94 ///
95 /// Zero values are generally determined by [`Scalar::default_value`].
96 fn append_zero(&mut self) {
97 self.append_zeros(1)
98 }
99
100 /// Appends n "zero" values to the array.
101 ///
102 /// Zero values are generally determined by [`Scalar::default_value`].
103 fn append_zeros(&mut self, n: usize);
104
105 /// Append a "null" value to the array.
106 ///
107 /// Implementors should panic if this method is called on a non-nullable [`ArrayBuilder`].
108 fn append_null(&mut self) {
109 self.append_nulls(1)
110 }
111
112 /// The inner part of `append_nulls`.
113 ///
114 /// # Safety
115 ///
116 /// The array builder must be nullable.
117 unsafe fn append_nulls_unchecked(&mut self, n: usize);
118
119 /// Appends n "null" values to the array.
120 ///
121 /// Implementors should panic if this method is called on a non-nullable [`ArrayBuilder`].
122 fn append_nulls(&mut self, n: usize) {
123 assert!(
124 self.dtype().is_nullable(),
125 "tried to append {n} nulls to a non-nullable array builder"
126 );
127
128 // SAFETY: We check above that the array builder is nullable.
129 unsafe {
130 self.append_nulls_unchecked(n);
131 }
132 }
133
134 /// Appends a default value to the array.
135 fn append_default(&mut self) {
136 self.append_defaults(1)
137 }
138
139 /// Appends n default values to the array.
140 ///
141 /// If the array builder is nullable, then this has the behavior of `self.append_nulls(n)`.
142 /// If the array builder is non-nullable, then it has the behavior of `self.append_zeros(n)`.
143 fn append_defaults(&mut self, n: usize) {
144 if self.dtype().is_nullable() {
145 self.append_nulls(n);
146 } else {
147 self.append_zeros(n);
148 }
149 }
150
151 /// A generic function to append a scalar to the builder.
152 fn append_scalar(&mut self, scalar: &Scalar) -> VortexResult<()>;
153
154 /// The inner part of `extend_from_array`.
155 ///
156 /// # Safety
157 ///
158 /// The array that must have an equal [`DType`] to the array builder's `DType` (with nullability
159 /// superset semantics).
160 unsafe fn extend_from_array_unchecked(&mut self, array: &ArrayRef);
161
162 /// Extends the array with the provided array, canonicalizing if necessary.
163 ///
164 /// Implementors must validate that the passed in [`ArrayRef`] has the correct [`DType`].
165 fn extend_from_array(&mut self, array: &ArrayRef) {
166 if !self.dtype().eq_with_nullability_superset(array.dtype()) {
167 vortex_panic!(
168 "tried to extend a builder with `DType` {} with an array with `DType {}",
169 self.dtype(),
170 array.dtype()
171 );
172 }
173
174 // SAFETY: We checked that the array had a valid `DType` above.
175 unsafe { self.extend_from_array_unchecked(array) }
176 }
177
178 /// Allocate space for extra `additional` items
179 fn reserve_exact(&mut self, additional: usize);
180
181 /// Override builders validity with the one provided.
182 ///
183 /// Note that this will have no effect on the final array if the array builder is non-nullable.
184 fn set_validity(&mut self, validity: Mask) {
185 if !self.dtype().is_nullable() {
186 return;
187 }
188 assert_eq!(self.len(), validity.len());
189 unsafe { self.set_validity_unchecked(validity) }
190 }
191
192 /// override validity with the one provided, without checking lengths
193 ///
194 /// # Safety
195 ///
196 /// Given validity must have an equal length to [`self.len()`](Self::len).
197 unsafe fn set_validity_unchecked(&mut self, validity: Mask);
198
199 /// Constructs an Array from the builder components.
200 ///
201 /// # Panics
202 ///
203 /// This function may panic if the builder's methods are called with invalid arguments. If only
204 /// the methods on this interface are used, the builder should not panic. However, specific
205 /// builders have interfaces that may be misused. For example, if the number of values in a
206 /// [PrimitiveBuilder]'s [vortex_buffer::BufferMut] does not match the number of validity bits,
207 /// the PrimitiveBuilder's [Self::finish] will panic.
208 fn finish(&mut self) -> ArrayRef;
209
210 /// Constructs a canonical array directly from the builder.
211 ///
212 /// This method provides a default implementation that creates an [`ArrayRef`] via `finish` and
213 /// then converts it to canonical form. Specific builders can override this with optimized
214 /// implementations that avoid the intermediate [`ArrayRef`] creation.
215 fn finish_into_canonical(&mut self) -> Canonical;
216}
217
218/// Construct a new canonical builder for the given [`DType`].
219///
220///
221/// # Example
222///
223/// ```
224/// use vortex_array::builders::{builder_with_capacity, ArrayBuilder};
225/// use vortex_array::dtype::{DType, Nullability};
226///
227/// // Create a new builder for string data.
228/// let mut builder = builder_with_capacity(&DType::Utf8(Nullability::NonNullable), 4);
229///
230/// builder.append_scalar(&"a".into()).unwrap();
231/// builder.append_scalar(&"b".into()).unwrap();
232/// builder.append_scalar(&"c".into()).unwrap();
233/// builder.append_scalar(&"d".into()).unwrap();
234///
235/// let strings = builder.finish();
236///
237/// assert_eq!(strings.scalar_at(0).unwrap(), "a".into());
238/// assert_eq!(strings.scalar_at(1).unwrap(), "b".into());
239/// assert_eq!(strings.scalar_at(2).unwrap(), "c".into());
240/// assert_eq!(strings.scalar_at(3).unwrap(), "d".into());
241/// ```
242pub fn builder_with_capacity(dtype: &DType, capacity: usize) -> Box<dyn ArrayBuilder> {
243 match dtype {
244 DType::Null => Box::new(NullBuilder::new()),
245 DType::Bool(n) => Box::new(BoolBuilder::with_capacity(*n, capacity)),
246 DType::Primitive(ptype, n) => {
247 match_each_native_ptype!(ptype, |P| {
248 Box::new(PrimitiveBuilder::<P>::with_capacity(*n, capacity))
249 })
250 }
251 DType::Decimal(decimal_type, n) => {
252 match_each_decimal_value_type!(
253 DecimalType::smallest_decimal_value_type(decimal_type),
254 |D| {
255 Box::new(DecimalBuilder::with_capacity::<D>(
256 capacity,
257 *decimal_type,
258 *n,
259 ))
260 }
261 )
262 }
263 DType::Utf8(n) => Box::new(VarBinViewBuilder::with_capacity(DType::Utf8(*n), capacity)),
264 DType::Binary(n) => Box::new(VarBinViewBuilder::with_capacity(
265 DType::Binary(*n),
266 capacity,
267 )),
268 DType::Struct(struct_dtype, n) => Box::new(StructBuilder::with_capacity(
269 struct_dtype.clone(),
270 *n,
271 capacity,
272 )),
273 DType::List(dtype, n) => Box::new(ListViewBuilder::<u64, u64>::with_capacity(
274 Arc::clone(dtype),
275 *n,
276 2 * capacity, // Arbitrarily choose 2 times the `offsets` capacity here.
277 capacity,
278 )),
279 DType::FixedSizeList(elem_dtype, list_size, null) => {
280 Box::new(FixedSizeListBuilder::with_capacity(
281 Arc::clone(elem_dtype),
282 *list_size,
283 *null,
284 capacity,
285 ))
286 }
287 DType::Extension(ext_dtype) => {
288 Box::new(ExtensionBuilder::with_capacity(ext_dtype.clone(), capacity))
289 }
290 DType::Variant(_) => {
291 unimplemented!()
292 }
293 }
294}
295
296/// Construct a new canonical builder for the given [`DType`] using a host
297/// [`crate::memory::HostAllocator`].
298pub fn builder_with_capacity_in(
299 allocator: HostAllocatorRef,
300 dtype: &DType,
301 capacity: usize,
302) -> Box<dyn ArrayBuilder> {
303 let _allocator = allocator;
304 builder_with_capacity(dtype, capacity)
305}