vortex_array/builders/mod.rs
1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4//! Builders for Vortex arrays.
5//!
6//! Every logical type in Vortex has a canonical (uncompressed) in-memory encoding. This module
7//! provides pre-allocated builders to construct new canonical arrays.
8//!
9//! ## Example:
10//!
11//! ```
12//! use vortex_array::builders::{builder_with_capacity, ArrayBuilder};
13//! use vortex_array::dtype::{DType, Nullability};
14//!
15//! // Create a new builder for string data.
16//! let mut builder = builder_with_capacity(&DType::Utf8(Nullability::NonNullable), 4);
17//!
18//! builder.append_scalar(&"a".into()).unwrap();
19//! builder.append_scalar(&"b".into()).unwrap();
20//! builder.append_scalar(&"c".into()).unwrap();
21//! builder.append_scalar(&"d".into()).unwrap();
22//!
23//! let strings = builder.finish();
24//!
25//! assert_eq!(strings.scalar_at(0).unwrap(), "a".into());
26//! assert_eq!(strings.scalar_at(1).unwrap(), "b".into());
27//! assert_eq!(strings.scalar_at(2).unwrap(), "c".into());
28//! assert_eq!(strings.scalar_at(3).unwrap(), "d".into());
29//! ```
30
31use std::any::Any;
32
33use vortex_error::VortexResult;
34use vortex_error::vortex_panic;
35use vortex_mask::Mask;
36
37use crate::ArrayRef;
38use crate::canonical::Canonical;
39use crate::dtype::DType;
40use crate::match_each_decimal_value_type;
41use crate::match_each_native_ptype;
42use crate::scalar::Scalar;
43
44mod lazy_null_builder;
45pub(crate) use lazy_null_builder::LazyBitBufferBuilder;
46
47mod bool;
48mod decimal;
49pub mod dict;
50mod extension;
51mod fixed_size_list;
52mod list;
53mod listview;
54mod null;
55mod primitive;
56mod struct_;
57mod varbinview;
58
59pub use bool::*;
60pub use decimal::*;
61pub use extension::*;
62pub use fixed_size_list::*;
63pub use list::*;
64pub use listview::*;
65pub use null::*;
66pub use primitive::*;
67pub use struct_::*;
68pub use varbinview::*;
69
70#[cfg(test)]
71mod tests;
72
/// Default number of elements a builder reserves space for (`1024`).
///
/// This is equal to the default capacity for Arrow Arrays.
pub const DEFAULT_BUILDER_CAPACITY: usize = 1 << 10;
77
78pub trait ArrayBuilder: Send {
79 fn as_any(&self) -> &dyn Any;
80
81 fn as_any_mut(&mut self) -> &mut dyn Any;
82
83 fn dtype(&self) -> &DType;
84
85 fn len(&self) -> usize;
86
87 fn is_empty(&self) -> bool {
88 self.len() == 0
89 }
90
91 /// Append a "zero" value to the array.
92 ///
93 /// Zero values are generally determined by [`Scalar::default_value`].
94 fn append_zero(&mut self) {
95 self.append_zeros(1)
96 }
97
98 /// Appends n "zero" values to the array.
99 ///
100 /// Zero values are generally determined by [`Scalar::default_value`].
101 fn append_zeros(&mut self, n: usize);
102
103 /// Append a "null" value to the array.
104 ///
105 /// Implementors should panic if this method is called on a non-nullable [`ArrayBuilder`].
106 fn append_null(&mut self) {
107 self.append_nulls(1)
108 }
109
110 /// The inner part of `append_nulls`.
111 ///
112 /// # Safety
113 ///
114 /// The array builder must be nullable.
115 unsafe fn append_nulls_unchecked(&mut self, n: usize);
116
117 /// Appends n "null" values to the array.
118 ///
119 /// Implementors should panic if this method is called on a non-nullable [`ArrayBuilder`].
120 fn append_nulls(&mut self, n: usize) {
121 assert!(
122 self.dtype().is_nullable(),
123 "tried to append {n} nulls to a non-nullable array builder"
124 );
125
126 // SAFETY: We check above that the array builder is nullable.
127 unsafe {
128 self.append_nulls_unchecked(n);
129 }
130 }
131
132 /// Appends a default value to the array.
133 fn append_default(&mut self) {
134 self.append_defaults(1)
135 }
136
137 /// Appends n default values to the array.
138 ///
139 /// If the array builder is nullable, then this has the behavior of `self.append_nulls(n)`.
140 /// If the array builder is non-nullable, then it has the behavior of `self.append_zeros(n)`.
141 fn append_defaults(&mut self, n: usize) {
142 if self.dtype().is_nullable() {
143 self.append_nulls(n);
144 } else {
145 self.append_zeros(n);
146 }
147 }
148
149 /// A generic function to append a scalar to the builder.
150 fn append_scalar(&mut self, scalar: &Scalar) -> VortexResult<()>;
151
152 /// The inner part of `extend_from_array`.
153 ///
154 /// # Safety
155 ///
156 /// The array that must have an equal [`DType`] to the array builder's `DType` (with nullability
157 /// superset semantics).
158 unsafe fn extend_from_array_unchecked(&mut self, array: &ArrayRef);
159
160 /// Extends the array with the provided array, canonicalizing if necessary.
161 ///
162 /// Implementors must validate that the passed in [`ArrayRef`] has the correct [`DType`].
163 fn extend_from_array(&mut self, array: &ArrayRef) {
164 if !self.dtype().eq_with_nullability_superset(array.dtype()) {
165 vortex_panic!(
166 "tried to extend a builder with `DType` {} with an array with `DType {}",
167 self.dtype(),
168 array.dtype()
169 );
170 }
171
172 // SAFETY: We checked that the array had a valid `DType` above.
173 unsafe { self.extend_from_array_unchecked(array) }
174 }
175
176 /// Allocate space for extra `additional` items
177 fn reserve_exact(&mut self, additional: usize);
178
179 /// Override builders validity with the one provided.
180 ///
181 /// Note that this will have no effect on the final array if the array builder is non-nullable.
182 fn set_validity(&mut self, validity: Mask) {
183 if !self.dtype().is_nullable() {
184 return;
185 }
186 assert_eq!(self.len(), validity.len());
187 unsafe { self.set_validity_unchecked(validity) }
188 }
189
190 /// override validity with the one provided, without checking lengths
191 ///
192 /// # Safety
193 ///
194 /// Given validity must have an equal length to [`self.len()`](Self::len).
195 unsafe fn set_validity_unchecked(&mut self, validity: Mask);
196
197 /// Constructs an Array from the builder components.
198 ///
199 /// # Panics
200 ///
201 /// This function may panic if the builder's methods are called with invalid arguments. If only
202 /// the methods on this interface are used, the builder should not panic. However, specific
203 /// builders have interfaces that may be misused. For example, if the number of values in a
204 /// [PrimitiveBuilder]'s [vortex_buffer::BufferMut] does not match the number of validity bits,
205 /// the PrimitiveBuilder's [Self::finish] will panic.
206 fn finish(&mut self) -> ArrayRef;
207
208 /// Constructs a canonical array directly from the builder.
209 ///
210 /// This method provides a default implementation that creates an [`ArrayRef`] via `finish` and
211 /// then converts it to canonical form. Specific builders can override this with optimized
212 /// implementations that avoid the intermediate [`ArrayRef`] creation.
213 fn finish_into_canonical(&mut self) -> Canonical;
214}
215
216/// Construct a new canonical builder for the given [`DType`].
217///
218///
219/// # Example
220///
221/// ```
222/// use vortex_array::builders::{builder_with_capacity, ArrayBuilder};
223/// use vortex_array::dtype::{DType, Nullability};
224///
225/// // Create a new builder for string data.
226/// let mut builder = builder_with_capacity(&DType::Utf8(Nullability::NonNullable), 4);
227///
228/// builder.append_scalar(&"a".into()).unwrap();
229/// builder.append_scalar(&"b".into()).unwrap();
230/// builder.append_scalar(&"c".into()).unwrap();
231/// builder.append_scalar(&"d".into()).unwrap();
232///
233/// let strings = builder.finish();
234///
235/// assert_eq!(strings.scalar_at(0).unwrap(), "a".into());
236/// assert_eq!(strings.scalar_at(1).unwrap(), "b".into());
237/// assert_eq!(strings.scalar_at(2).unwrap(), "c".into());
238/// assert_eq!(strings.scalar_at(3).unwrap(), "d".into());
239/// ```
240pub fn builder_with_capacity(dtype: &DType, capacity: usize) -> Box<dyn ArrayBuilder> {
241 match dtype {
242 DType::Null => Box::new(NullBuilder::new()),
243 DType::Bool(n) => Box::new(BoolBuilder::with_capacity(*n, capacity)),
244 DType::Primitive(ptype, n) => {
245 match_each_native_ptype!(ptype, |P| {
246 Box::new(PrimitiveBuilder::<P>::with_capacity(*n, capacity))
247 })
248 }
249 DType::Decimal(decimal_type, n) => {
250 match_each_decimal_value_type!(
251 DecimalType::smallest_decimal_value_type(decimal_type),
252 |D| {
253 Box::new(DecimalBuilder::with_capacity::<D>(
254 capacity,
255 *decimal_type,
256 *n,
257 ))
258 }
259 )
260 }
261 DType::Utf8(n) => Box::new(VarBinViewBuilder::with_capacity(DType::Utf8(*n), capacity)),
262 DType::Binary(n) => Box::new(VarBinViewBuilder::with_capacity(
263 DType::Binary(*n),
264 capacity,
265 )),
266 DType::Struct(struct_dtype, n) => Box::new(StructBuilder::with_capacity(
267 struct_dtype.clone(),
268 *n,
269 capacity,
270 )),
271 DType::List(dtype, n) => Box::new(ListViewBuilder::<u64, u64>::with_capacity(
272 dtype.clone(),
273 *n,
274 2 * capacity, // Arbitrarily choose 2 times the `offsets` capacity here.
275 capacity,
276 )),
277 DType::FixedSizeList(elem_dtype, list_size, null) => Box::new(
278 FixedSizeListBuilder::with_capacity(elem_dtype.clone(), *list_size, *null, capacity),
279 ),
280 DType::Extension(ext_dtype) => {
281 Box::new(ExtensionBuilder::with_capacity(ext_dtype.clone(), capacity))
282 }
283 }
284}