vortex_array/arrays/varbinview/array.rs
1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4use std::sync::Arc;
5
6use vortex_buffer::Buffer;
7use vortex_buffer::ByteBuffer;
8use vortex_dtype::DType;
9use vortex_dtype::Nullability;
10use vortex_error::VortexExpect;
11use vortex_error::VortexResult;
12use vortex_error::vortex_bail;
13use vortex_error::vortex_ensure;
14use vortex_error::vortex_err;
15use vortex_error::vortex_panic;
16use vortex_vector::binaryview::BinaryView;
17
18use crate::builders::ArrayBuilder;
19use crate::builders::VarBinViewBuilder;
20use crate::stats::ArrayStats;
21use crate::validity::Validity;
22
/// A variable-length binary view array that stores strings and binary data efficiently.
///
/// This mirrors the Apache Arrow StringView/BinaryView array encoding and provides
/// an optimized representation for variable-length data with excellent performance
/// characteristics for both short and long strings.
///
/// ## Data Layout
///
/// The array uses a hybrid storage approach with two main components:
/// - **Views buffer**: Array of 16-byte `BinaryView` entries (one per logical element)
/// - **Data buffers**: Shared backing storage for strings longer than 12 bytes
///
/// ## View Structure
///
/// Commonly referred to as "German Strings", each 16-byte view entry contains either:
/// - **Inlined data**: For strings ≤ 12 bytes, the entire string is stored directly in the view
/// - **Reference data**: For strings > 12 bytes, contains:
///   - String length (4 bytes)
///   - First 4 bytes of string as prefix (4 bytes)
///   - Buffer index and offset (8 bytes total)
///
/// The following ASCII graphic is reproduced verbatim from the Arrow documentation:
///
/// ```text
///                         ┌──────┬────────────────────────┐
///                         │length│      string value      │
///    Strings (len <= 12)  │      │    (padded with 0)     │
///                         └──────┴────────────────────────┘
///                          0    31                      127
///
///                         ┌───────┬───────┬───────┬───────┐
///                         │length │prefix │  buf  │offset │
///    Strings (len > 12)   │       │       │ index │       │
///                         └───────┴───────┴───────┴───────┘
///                          0    31       63      95    127
/// ```
///
/// # Examples
///
/// ```
/// use vortex_array::arrays::VarBinViewArray;
/// use vortex_dtype::{DType, Nullability};
/// use vortex_array::IntoArray;
///
/// // Create from an Iterator<Item = &str>
/// let array = VarBinViewArray::from_iter_str([
///     "inlined",
///     "this string is outlined"
/// ]);
///
/// assert_eq!(array.len(), 2);
///
/// // Access individual strings
/// let first = array.bytes_at(0);
/// assert_eq!(first.as_slice(), b"inlined"); // Short string (≤ 12 bytes), stored inline
///
/// let second = array.bytes_at(1);
/// assert_eq!(second.as_slice(), b"this string is outlined"); // Long string
/// ```
#[derive(Clone, Debug)]
pub struct VarBinViewArray {
    /// Logical type of the array; validation accepts only `DType::Utf8` or `DType::Binary`.
    pub(super) dtype: DType,
    /// Backing storage for values longer than 12 bytes, addressed by buffer index from a view.
    pub(super) buffers: Arc<[ByteBuffer]>,
    /// One 16-byte view entry per logical element.
    pub(super) views: Buffer<BinaryView>,
    /// Null information for the elements; its nullability must agree with `dtype`.
    pub(super) validity: Validity,
    /// Statistics holder for this array, default-initialized at construction.
    pub(super) stats_set: ArrayStats,
}
90
91impl VarBinViewArray {
92 /// Creates a new [`VarBinViewArray`].
93 ///
94 /// # Panics
95 ///
96 /// Panics if the provided components do not satisfy the invariants documented
97 /// in [`VarBinViewArray::new_unchecked`].
98 pub fn new(
99 views: Buffer<BinaryView>,
100 buffers: Arc<[ByteBuffer]>,
101 dtype: DType,
102 validity: Validity,
103 ) -> Self {
104 Self::try_new(views, buffers, dtype, validity)
105 .vortex_expect("VarBinViewArray construction failed")
106 }
107
108 /// Constructs a new `VarBinViewArray`.
109 ///
110 /// See [`VarBinViewArray::new_unchecked`] for more information.
111 ///
112 /// # Errors
113 ///
114 /// Returns an error if the provided components do not satisfy the invariants documented in
115 /// [`VarBinViewArray::new_unchecked`].
116 pub fn try_new(
117 views: Buffer<BinaryView>,
118 buffers: Arc<[ByteBuffer]>,
119 dtype: DType,
120 validity: Validity,
121 ) -> VortexResult<Self> {
122 Self::validate(&views, &buffers, &dtype, &validity)?;
123
124 // SAFETY: validate ensures all invariants are met.
125 Ok(unsafe { Self::new_unchecked(views, buffers, dtype, validity) })
126 }
127
128 /// Creates a new [`VarBinViewArray`] without validation from these components:
129 ///
130 /// * `views` is a buffer of 16-byte view entries (one per logical element).
131 /// * `buffers` contains the backing storage for strings longer than 12 bytes.
132 /// * `dtype` specifies whether this contains UTF-8 strings or binary data.
133 /// * `validity` holds the null values.
134 ///
135 /// # Safety
136 ///
137 /// The caller must ensure all of the following invariants are satisfied:
138 ///
139 /// ## View Requirements
140 ///
141 /// - Views must be properly formatted 16-byte [`BinaryView`] entries.
142 /// - Inlined views (length ≤ 12) must have valid data in the first `length` bytes.
143 /// - Reference views (length > 12) must:
144 /// - Have a valid buffer index < `buffers.len()`.
145 /// - Have valid offsets that don't exceed the referenced buffer's bounds.
146 /// - Have a 4-byte prefix that matches the actual data at the referenced location.
147 ///
148 /// ## Type Requirements
149 ///
150 /// - `dtype` must be either [`DType::Utf8`] or [`DType::Binary`].
151 /// - For [`DType::Utf8`], all string data (both inlined and referenced) must be valid UTF-8.
152 ///
153 /// ## Validity Requirements
154 ///
155 /// - The validity must have the same nullability as the dtype.
156 /// - If validity is an array, its length must match `views.len()`.
157 pub unsafe fn new_unchecked(
158 views: Buffer<BinaryView>,
159 buffers: Arc<[ByteBuffer]>,
160 dtype: DType,
161 validity: Validity,
162 ) -> Self {
163 #[cfg(debug_assertions)]
164 Self::validate(&views, &buffers, &dtype, &validity)
165 .vortex_expect("[Debug Assertion]: Invalid `VarBinViewArray` parameters");
166
167 Self {
168 dtype,
169 buffers,
170 views,
171 validity,
172 stats_set: Default::default(),
173 }
174 }
175
176 /// Validates the components that would be used to create a [`VarBinViewArray`].
177 ///
178 /// This function checks all the invariants required by [`VarBinViewArray::new_unchecked`].
179 pub fn validate(
180 views: &Buffer<BinaryView>,
181 buffers: &Arc<[ByteBuffer]>,
182 dtype: &DType,
183 validity: &Validity,
184 ) -> VortexResult<()> {
185 vortex_ensure!(
186 validity.nullability() == dtype.nullability(),
187 "validity {:?} incompatible with nullability {:?}",
188 validity,
189 dtype.nullability()
190 );
191
192 match dtype {
193 DType::Utf8(_) => Self::validate_views(views, buffers, validity, |string| {
194 simdutf8::basic::from_utf8(string).is_ok()
195 })?,
196 DType::Binary(_) => Self::validate_views(views, buffers, validity, |_| true)?,
197 _ => vortex_bail!("invalid DType {dtype} for `VarBinViewArray`"),
198 }
199
200 Ok(())
201 }
202
203 fn validate_views<F>(
204 views: &Buffer<BinaryView>,
205 buffers: &Arc<[ByteBuffer]>,
206 validity: &Validity,
207 validator: F,
208 ) -> VortexResult<()>
209 where
210 F: Fn(&[u8]) -> bool,
211 {
212 for (idx, &view) in views.iter().enumerate() {
213 if validity.is_null(idx) {
214 continue;
215 }
216
217 if view.is_inlined() {
218 // Validate the inline bytestring
219 let bytes = &view.as_inlined().data[..view.len() as usize];
220 vortex_ensure!(
221 validator(bytes),
222 "view at index {idx}: inlined bytes failed utf-8 validation"
223 );
224 } else {
225 // Validate the view pointer
226 let view = view.as_view();
227 let buf_index = view.buffer_index as usize;
228 let start_offset = view.offset as usize;
229 let end_offset = start_offset.saturating_add(view.size as usize);
230
231 let buf = buffers.get(buf_index).ok_or_else(||
232 vortex_err!("view at index {idx} references invalid buffer: {buf_index} out of bounds for VarBinViewArray with {} buffers",
233 buffers.len()))?;
234
235 vortex_ensure!(
236 start_offset < buf.len(),
237 "start offset {start_offset} out of bounds for buffer {buf_index} with size {}",
238 buf.len(),
239 );
240
241 vortex_ensure!(
242 end_offset <= buf.len(),
243 "end offset {end_offset} out of bounds for buffer {buf_index} with size {}",
244 buf.len(),
245 );
246
247 // Make sure the prefix data matches the buffer data.
248 let bytes = &buf[start_offset..end_offset];
249 vortex_ensure!(
250 view.prefix == bytes[..4],
251 "VarBinView prefix does not match full string"
252 );
253
254 // Validate the full string
255 vortex_ensure!(
256 validator(bytes),
257 "view at index {idx}: outlined bytes fails utf-8 validation"
258 );
259 }
260 }
261
262 Ok(())
263 }
264
265 /// Number of raw string data buffers held by this array.
266 pub fn nbuffers(&self) -> usize {
267 self.buffers.len()
268 }
269
270 /// Access to the primitive views buffer.
271 ///
272 /// Variable-sized binary view buffer contain a "view" child array, with 16-byte entries that
273 /// contain either a pointer into one of the array's owned `buffer`s OR an inlined copy of
274 /// the string (if the string has 12 bytes or fewer).
275 #[inline]
276 pub fn views(&self) -> &Buffer<BinaryView> {
277 &self.views
278 }
279
280 /// Access value bytes at a given index
281 ///
282 /// Will return a `ByteBuffer` containing the data without performing a copy.
283 #[inline]
284 pub fn bytes_at(&self, index: usize) -> ByteBuffer {
285 let views = self.views();
286 let view = &views[index];
287 // Expect this to be the common case: strings > 12 bytes.
288 if !view.is_inlined() {
289 let view_ref = view.as_view();
290 self.buffer(view_ref.buffer_index as usize)
291 .slice(view_ref.as_range())
292 } else {
293 // Return access to the range of bytes around it.
294 views
295 .clone()
296 .into_byte_buffer()
297 .slice_ref(view.as_inlined().value())
298 }
299 }
300
301 /// Access one of the backing data buffers.
302 ///
303 /// # Panics
304 ///
305 /// This method panics if the provided index is out of bounds for the set of buffers provided
306 /// at construction time.
307 #[inline]
308 pub fn buffer(&self, idx: usize) -> &ByteBuffer {
309 if idx >= self.nbuffers() {
310 vortex_panic!(
311 "{idx} buffer index out of bounds, there are {} buffers",
312 self.nbuffers()
313 );
314 }
315 &self.buffers[idx]
316 }
317
318 /// Iterate over the underlying raw data buffers, not including the views buffer.
319 #[inline]
320 pub fn buffers(&self) -> &Arc<[ByteBuffer]> {
321 &self.buffers
322 }
323
324 /// Accumulate an iterable set of values into our type here.
325 #[expect(
326 clippy::same_name_method,
327 reason = "intentionally named from_iter like Iterator::from_iter"
328 )]
329 pub fn from_iter<T: AsRef<[u8]>, I: IntoIterator<Item = Option<T>>>(
330 iter: I,
331 dtype: DType,
332 ) -> Self {
333 let iter = iter.into_iter();
334 let mut builder = VarBinViewBuilder::with_capacity(dtype, iter.size_hint().0);
335
336 for item in iter {
337 match item {
338 None => builder.append_null(),
339 Some(v) => builder.append_value(v),
340 }
341 }
342
343 builder.finish_into_varbinview()
344 }
345
346 pub fn from_iter_str<T: AsRef<str>, I: IntoIterator<Item = T>>(iter: I) -> Self {
347 let iter = iter.into_iter();
348 let mut builder = VarBinViewBuilder::with_capacity(
349 DType::Utf8(Nullability::NonNullable),
350 iter.size_hint().0,
351 );
352
353 for item in iter {
354 builder.append_value(item.as_ref());
355 }
356
357 builder.finish_into_varbinview()
358 }
359
360 pub fn from_iter_nullable_str<T: AsRef<str>, I: IntoIterator<Item = Option<T>>>(
361 iter: I,
362 ) -> Self {
363 let iter = iter.into_iter();
364 let mut builder = VarBinViewBuilder::with_capacity(
365 DType::Utf8(Nullability::Nullable),
366 iter.size_hint().0,
367 );
368
369 for item in iter {
370 match item {
371 None => builder.append_null(),
372 Some(v) => builder.append_value(v.as_ref()),
373 }
374 }
375
376 builder.finish_into_varbinview()
377 }
378
379 pub fn from_iter_bin<T: AsRef<[u8]>, I: IntoIterator<Item = T>>(iter: I) -> Self {
380 let iter = iter.into_iter();
381 let mut builder = VarBinViewBuilder::with_capacity(
382 DType::Binary(Nullability::NonNullable),
383 iter.size_hint().0,
384 );
385
386 for item in iter {
387 builder.append_value(item.as_ref());
388 }
389
390 builder.finish_into_varbinview()
391 }
392
393 pub fn from_iter_nullable_bin<T: AsRef<[u8]>, I: IntoIterator<Item = Option<T>>>(
394 iter: I,
395 ) -> Self {
396 let iter = iter.into_iter();
397 let mut builder = VarBinViewBuilder::with_capacity(
398 DType::Binary(Nullability::Nullable),
399 iter.size_hint().0,
400 );
401
402 for item in iter {
403 match item {
404 None => builder.append_null(),
405 Some(v) => builder.append_value(v.as_ref()),
406 }
407 }
408
409 builder.finish_into_varbinview()
410 }
411}
412
413impl<'a> FromIterator<Option<&'a [u8]>> for VarBinViewArray {
414 fn from_iter<T: IntoIterator<Item = Option<&'a [u8]>>>(iter: T) -> Self {
415 Self::from_iter_nullable_bin(iter)
416 }
417}
418
419impl FromIterator<Option<Vec<u8>>> for VarBinViewArray {
420 fn from_iter<T: IntoIterator<Item = Option<Vec<u8>>>>(iter: T) -> Self {
421 Self::from_iter_nullable_bin(iter)
422 }
423}
424
425impl FromIterator<Option<String>> for VarBinViewArray {
426 fn from_iter<T: IntoIterator<Item = Option<String>>>(iter: T) -> Self {
427 Self::from_iter_nullable_str(iter)
428 }
429}
430
431impl<'a> FromIterator<Option<&'a str>> for VarBinViewArray {
432 fn from_iter<T: IntoIterator<Item = Option<&'a str>>>(iter: T) -> Self {
433 Self::from_iter_nullable_str(iter)
434 }
435}