vortex_array/arrays/varbinview/array.rs
1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4use std::sync::Arc;
5
6use vortex_buffer::{Buffer, ByteBuffer};
7use vortex_dtype::{DType, Nullability};
8use vortex_error::{
9 VortexExpect, VortexResult, vortex_bail, vortex_ensure, vortex_err, vortex_panic,
10};
11
12use crate::arrays::binary_view::BinaryView;
13use crate::builders::{ArrayBuilder, VarBinViewBuilder};
14use crate::stats::ArrayStats;
15use crate::validity::Validity;
16
17/// A variable-length binary view array that stores strings and binary data efficiently.
18///
19/// This mirrors the Apache Arrow StringView/BinaryView array encoding and provides
20/// an optimized representation for variable-length data with excellent performance
21/// characteristics for both short and long strings.
22///
23/// ## Data Layout
24///
25/// The array uses a hybrid storage approach with two main components:
26/// - **Views buffer**: Array of 16-byte `BinaryView` entries (one per logical element)
27/// - **Data buffers**: Shared backing storage for strings longer than 12 bytes
28///
29/// ## View Structure
30///
31/// Commonly referred to as "German Strings", each 16-byte view entry contains either:
32/// - **Inlined data**: For strings ≤ 12 bytes, the entire string is stored directly in the view
33/// - **Reference data**: For strings > 12 bytes, contains:
34/// - String length (4 bytes)
35/// - First 4 bytes of string as prefix (4 bytes)
36/// - Buffer index and offset (8 bytes total)
37///
38/// The following ASCII graphic is reproduced verbatim from the Arrow documentation:
39///
40/// ```text
41/// ┌──────┬────────────────────────┐
42/// │length│ string value │
43/// Strings (len <= 12) │ │ (padded with 0) │
44/// └──────┴────────────────────────┘
45/// 0 31 127
46///
47/// ┌───────┬───────┬───────┬───────┐
48/// │length │prefix │ buf │offset │
49/// Strings (len > 12) │ │ │ index │ │
50/// └───────┴───────┴───────┴───────┘
51/// 0 31 63 95 127
52/// ```
53///
54/// # Examples
55///
56/// ```
57/// use vortex_array::arrays::VarBinViewArray;
58/// use vortex_dtype::{DType, Nullability};
59/// use vortex_array::IntoArray;
60///
61/// // Create from an Iterator<Item = &str>
62/// let array = VarBinViewArray::from_iter_str([
63/// "inlined",
64/// "this string is outlined"
65/// ]);
66///
67/// assert_eq!(array.len(), 2);
68///
69/// // Access individual strings
70/// let first = array.bytes_at(0);
71/// assert_eq!(first.as_slice(), b"inlined"); // "short"
72///
73/// let second = array.bytes_at(1);
74/// assert_eq!(second.as_slice(), b"this string is outlined"); // Long string
75/// ```
76#[derive(Clone, Debug)]
77pub struct VarBinViewArray {
78 pub(super) dtype: DType,
79 pub(super) buffers: Arc<[ByteBuffer]>,
80 pub(super) views: Buffer<BinaryView>,
81 pub(super) validity: Validity,
82 pub(super) stats_set: ArrayStats,
83}
84
85impl VarBinViewArray {
86 /// Creates a new [`VarBinViewArray`].
87 ///
88 /// # Panics
89 ///
90 /// Panics if the provided components do not satisfy the invariants documented
91 /// in [`VarBinViewArray::new_unchecked`].
92 pub fn new(
93 views: Buffer<BinaryView>,
94 buffers: Arc<[ByteBuffer]>,
95 dtype: DType,
96 validity: Validity,
97 ) -> Self {
98 Self::try_new(views, buffers, dtype, validity)
99 .vortex_expect("VarBinViewArray construction failed")
100 }
101
102 /// Constructs a new `VarBinViewArray`.
103 ///
104 /// See [`VarBinViewArray::new_unchecked`] for more information.
105 ///
106 /// # Errors
107 ///
108 /// Returns an error if the provided components do not satisfy the invariants documented in
109 /// [`VarBinViewArray::new_unchecked`].
110 pub fn try_new(
111 views: Buffer<BinaryView>,
112 buffers: Arc<[ByteBuffer]>,
113 dtype: DType,
114 validity: Validity,
115 ) -> VortexResult<Self> {
116 Self::validate(&views, &buffers, &dtype, &validity)?;
117
118 // SAFETY: validate ensures all invariants are met.
119 Ok(unsafe { Self::new_unchecked(views, buffers, dtype, validity) })
120 }
121
122 /// Creates a new [`VarBinViewArray`] without validation from these components:
123 ///
124 /// * `views` is a buffer of 16-byte view entries (one per logical element).
125 /// * `buffers` contains the backing storage for strings longer than 12 bytes.
126 /// * `dtype` specifies whether this contains UTF-8 strings or binary data.
127 /// * `validity` holds the null values.
128 ///
129 /// # Safety
130 ///
131 /// The caller must ensure all of the following invariants are satisfied:
132 ///
133 /// ## View Requirements
134 ///
135 /// - Views must be properly formatted 16-byte [`BinaryView`] entries.
136 /// - Inlined views (length ≤ 12) must have valid data in the first `length` bytes.
137 /// - Reference views (length > 12) must:
138 /// - Have a valid buffer index < `buffers.len()`.
139 /// - Have valid offsets that don't exceed the referenced buffer's bounds.
140 /// - Have a 4-byte prefix that matches the actual data at the referenced location.
141 ///
142 /// ## Type Requirements
143 ///
144 /// - `dtype` must be either [`DType::Utf8`] or [`DType::Binary`].
145 /// - For [`DType::Utf8`], all string data (both inlined and referenced) must be valid UTF-8.
146 ///
147 /// ## Validity Requirements
148 ///
149 /// - The validity must have the same nullability as the dtype.
150 /// - If validity is an array, its length must match `views.len()`.
151 pub unsafe fn new_unchecked(
152 views: Buffer<BinaryView>,
153 buffers: Arc<[ByteBuffer]>,
154 dtype: DType,
155 validity: Validity,
156 ) -> Self {
157 #[cfg(debug_assertions)]
158 Self::validate(&views, &buffers, &dtype, &validity)
159 .vortex_expect("[Debug Assertion]: Invalid `VarBinViewArray` parameters");
160
161 Self {
162 dtype,
163 buffers,
164 views,
165 validity,
166 stats_set: Default::default(),
167 }
168 }
169
170 /// Validates the components that would be used to create a [`VarBinViewArray`].
171 ///
172 /// This function checks all the invariants required by [`VarBinViewArray::new_unchecked`].
173 pub fn validate(
174 views: &Buffer<BinaryView>,
175 buffers: &Arc<[ByteBuffer]>,
176 dtype: &DType,
177 validity: &Validity,
178 ) -> VortexResult<()> {
179 vortex_ensure!(
180 validity.nullability() == dtype.nullability(),
181 "validity {:?} incompatible with nullability {:?}",
182 validity,
183 dtype.nullability()
184 );
185
186 match dtype {
187 DType::Utf8(_) => Self::validate_views(views, buffers, validity, |string| {
188 simdutf8::basic::from_utf8(string).is_ok()
189 })?,
190 DType::Binary(_) => Self::validate_views(views, buffers, validity, |_| true)?,
191 _ => vortex_bail!("invalid DType {dtype} for `VarBinViewArray`"),
192 }
193
194 Ok(())
195 }
196
197 fn validate_views<F>(
198 views: &Buffer<BinaryView>,
199 buffers: &Arc<[ByteBuffer]>,
200 validity: &Validity,
201 validator: F,
202 ) -> VortexResult<()>
203 where
204 F: Fn(&[u8]) -> bool,
205 {
206 for (idx, &view) in views.iter().enumerate() {
207 if validity.is_null(idx) {
208 continue;
209 }
210
211 if view.is_inlined() {
212 // Validate the inline bytestring
213 let bytes = &unsafe { view.inlined }.data[..view.len() as usize];
214 vortex_ensure!(
215 validator(bytes),
216 "view at index {idx}: inlined bytes failed utf-8 validation"
217 );
218 } else {
219 // Validate the view pointer
220 let view = view.as_view();
221 let buf_index = view.buffer_index as usize;
222 let start_offset = view.offset as usize;
223 let end_offset = start_offset.saturating_add(view.size as usize);
224
225 let buf = buffers.get(buf_index).ok_or_else(||
226 vortex_err!("view at index {idx} references invalid buffer: {buf_index} out of bounds for VarBinViewArray with {} buffers",
227 buffers.len()))?;
228
229 vortex_ensure!(
230 start_offset < buf.len(),
231 "start offset {start_offset} out of bounds for buffer {buf_index} with size {}",
232 buf.len(),
233 );
234
235 vortex_ensure!(
236 end_offset <= buf.len(),
237 "end offset {end_offset} out of bounds for buffer {buf_index} with size {}",
238 buf.len(),
239 );
240
241 // Make sure the prefix data matches the buffer data.
242 let bytes = &buf[start_offset..end_offset];
243 vortex_ensure!(
244 view.prefix == bytes[..4],
245 "VarBinView prefix does not match full string"
246 );
247
248 // Validate the full string
249 vortex_ensure!(
250 validator(bytes),
251 "view at index {idx}: outlined bytes fails utf-8 validation"
252 );
253 }
254 }
255
256 Ok(())
257 }
258
259 /// Number of raw string data buffers held by this array.
260 pub fn nbuffers(&self) -> usize {
261 self.buffers.len()
262 }
263
264 /// Access to the primitive views buffer.
265 ///
266 /// Variable-sized binary view buffer contain a "view" child array, with 16-byte entries that
267 /// contain either a pointer into one of the array's owned `buffer`s OR an inlined copy of
268 /// the string (if the string has 12 bytes or fewer).
269 #[inline]
270 pub fn views(&self) -> &Buffer<BinaryView> {
271 &self.views
272 }
273
274 /// Access value bytes at a given index
275 ///
276 /// Will return a `ByteBuffer` containing the data without performing a copy.
277 #[inline]
278 pub fn bytes_at(&self, index: usize) -> ByteBuffer {
279 let views = self.views();
280 let view = &views[index];
281 // Expect this to be the common case: strings > 12 bytes.
282 if !view.is_inlined() {
283 let view_ref = view.as_view();
284 self.buffer(view_ref.buffer_index() as usize)
285 .slice(view_ref.as_range())
286 } else {
287 // Return access to the range of bytes around it.
288 views
289 .clone()
290 .into_byte_buffer()
291 .slice_ref(view.as_inlined().value())
292 }
293 }
294
295 /// Access one of the backing data buffers.
296 ///
297 /// # Panics
298 ///
299 /// This method panics if the provided index is out of bounds for the set of buffers provided
300 /// at construction time.
301 #[inline]
302 pub fn buffer(&self, idx: usize) -> &ByteBuffer {
303 if idx >= self.nbuffers() {
304 vortex_panic!(
305 "{idx} buffer index out of bounds, there are {} buffers",
306 self.nbuffers()
307 );
308 }
309 &self.buffers[idx]
310 }
311
312 /// Iterate over the underlying raw data buffers, not including the views buffer.
313 #[inline]
314 pub fn buffers(&self) -> &Arc<[ByteBuffer]> {
315 &self.buffers
316 }
317
318 /// Accumulate an iterable set of values into our type here.
319 #[allow(clippy::same_name_method)]
320 pub fn from_iter<T: AsRef<[u8]>, I: IntoIterator<Item = Option<T>>>(
321 iter: I,
322 dtype: DType,
323 ) -> Self {
324 let iter = iter.into_iter();
325 let mut builder = VarBinViewBuilder::with_capacity(dtype, iter.size_hint().0);
326
327 for item in iter {
328 match item {
329 None => builder.append_null(),
330 Some(v) => builder.append_value(v),
331 }
332 }
333
334 builder.finish_into_varbinview()
335 }
336
337 pub fn from_iter_str<T: AsRef<str>, I: IntoIterator<Item = T>>(iter: I) -> Self {
338 let iter = iter.into_iter();
339 let mut builder = VarBinViewBuilder::with_capacity(
340 DType::Utf8(Nullability::NonNullable),
341 iter.size_hint().0,
342 );
343
344 for item in iter {
345 builder.append_value(item.as_ref());
346 }
347
348 builder.finish_into_varbinview()
349 }
350
351 pub fn from_iter_nullable_str<T: AsRef<str>, I: IntoIterator<Item = Option<T>>>(
352 iter: I,
353 ) -> Self {
354 let iter = iter.into_iter();
355 let mut builder = VarBinViewBuilder::with_capacity(
356 DType::Utf8(Nullability::Nullable),
357 iter.size_hint().0,
358 );
359
360 for item in iter {
361 match item {
362 None => builder.append_null(),
363 Some(v) => builder.append_value(v.as_ref()),
364 }
365 }
366
367 builder.finish_into_varbinview()
368 }
369
370 pub fn from_iter_bin<T: AsRef<[u8]>, I: IntoIterator<Item = T>>(iter: I) -> Self {
371 let iter = iter.into_iter();
372 let mut builder = VarBinViewBuilder::with_capacity(
373 DType::Binary(Nullability::NonNullable),
374 iter.size_hint().0,
375 );
376
377 for item in iter {
378 builder.append_value(item.as_ref());
379 }
380
381 builder.finish_into_varbinview()
382 }
383
384 pub fn from_iter_nullable_bin<T: AsRef<[u8]>, I: IntoIterator<Item = Option<T>>>(
385 iter: I,
386 ) -> Self {
387 let iter = iter.into_iter();
388 let mut builder = VarBinViewBuilder::with_capacity(
389 DType::Binary(Nullability::Nullable),
390 iter.size_hint().0,
391 );
392
393 for item in iter {
394 match item {
395 None => builder.append_null(),
396 Some(v) => builder.append_value(v.as_ref()),
397 }
398 }
399
400 builder.finish_into_varbinview()
401 }
402}
403
404impl<'a> FromIterator<Option<&'a [u8]>> for VarBinViewArray {
405 fn from_iter<T: IntoIterator<Item = Option<&'a [u8]>>>(iter: T) -> Self {
406 Self::from_iter_nullable_bin(iter)
407 }
408}
409
410impl FromIterator<Option<Vec<u8>>> for VarBinViewArray {
411 fn from_iter<T: IntoIterator<Item = Option<Vec<u8>>>>(iter: T) -> Self {
412 Self::from_iter_nullable_bin(iter)
413 }
414}
415
416impl FromIterator<Option<String>> for VarBinViewArray {
417 fn from_iter<T: IntoIterator<Item = Option<String>>>(iter: T) -> Self {
418 Self::from_iter_nullable_str(iter)
419 }
420}
421
422impl<'a> FromIterator<Option<&'a str>> for VarBinViewArray {
423 fn from_iter<T: IntoIterator<Item = Option<&'a str>>>(iter: T) -> Self {
424 Self::from_iter_nullable_str(iter)
425 }
426}