vortex_vector/binaryview/
vector.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4//! Variable-length binary vector implementation.
5
6use std::fmt::Debug;
7use std::ops::BitAnd;
8use std::ops::RangeBounds;
9use std::sync::Arc;
10
11use vortex_buffer::Alignment;
12use vortex_buffer::Buffer;
13use vortex_buffer::ByteBuffer;
14use vortex_error::VortexExpect;
15use vortex_error::VortexResult;
16use vortex_error::vortex_ensure;
17use vortex_mask::Mask;
18
19use crate::VectorOps;
20use crate::binaryview::BinaryViewScalar;
21use crate::binaryview::BinaryViewType;
22use crate::binaryview::vector_mut::BinaryViewVectorMut;
23use crate::binaryview::view::BinaryView;
24use crate::binaryview::view::validate_views;
25
26/// A variable-length binary vector.
27///
28/// This is the core vector for string and binary data.
29#[derive(Debug, Clone)]
30pub struct BinaryViewVector<T: BinaryViewType> {
31    /// Views into the binary data.
32    views: Buffer<BinaryView>,
33    /// Buffers holding the referenced binary data.
34    buffers: Arc<Box<[ByteBuffer]>>,
35    /// Validity mask for the vector.
36    validity: Mask,
37    /// Marker trait for the [`BinaryViewType`].
38    _marker: std::marker::PhantomData<T>,
39}
40
41impl<T: BinaryViewType> PartialEq for BinaryViewVector<T> {
42    fn eq(&self, other: &Self) -> bool {
43        if self.views.len() != other.views.len() {
44            return false;
45        }
46        // Validity patterns must match
47        if self.validity != other.validity {
48            return false;
49        }
50        // Compare all views, OR with !validity to ignore invalid positions
51        self.views
52            .iter()
53            .zip(other.views.iter())
54            .enumerate()
55            .all(|(i, (self_view, other_view))| {
56                // If invalid, treat as equal
57                if !self.validity.value(i) {
58                    return true;
59                }
60                // For valid elements, compare the actual byte content via the view
61                let self_bytes: &[u8] = if self_view.is_inlined() {
62                    self_view.as_inlined().value()
63                } else {
64                    let view_ref = self_view.as_view();
65                    let buffer = &self.buffers[view_ref.buffer_index as usize];
66                    &buffer[view_ref.as_range()]
67                };
68
69                let other_bytes: &[u8] = if other_view.is_inlined() {
70                    other_view.as_inlined().value()
71                } else {
72                    let view_ref = other_view.as_view();
73                    let buffer = &other.buffers[view_ref.buffer_index as usize];
74                    &buffer[view_ref.as_range()]
75                };
76
77                self_bytes == other_bytes
78            })
79    }
80}
81
82impl<T: BinaryViewType> Eq for BinaryViewVector<T> {}
83
84impl<T: BinaryViewType> BinaryViewVector<T> {
85    /// Creates a new [`BinaryViewVector`] from the provided components.
86    ///
87    /// # Safety
88    ///
89    /// This function is unsafe because it does not validate the consistency of the provided
90    /// components.
91    ///
92    /// The caller must uphold all validation that would otherwise be validated by
93    /// the [safe constructor](Self::try_new).
94    pub unsafe fn new_unchecked(
95        views: Buffer<BinaryView>,
96        buffers: Arc<Box<[ByteBuffer]>>,
97        validity: Mask,
98    ) -> Self {
99        if cfg!(debug_assertions) {
100            Self::new(views, buffers, validity)
101        } else {
102            Self {
103                views,
104                validity,
105                buffers,
106                _marker: std::marker::PhantomData,
107            }
108        }
109    }
110
111    /// Create a new `BinaryViewVector` from its components, panicking if validation fails.
112    ///
113    /// # Errors
114    ///
115    /// This function will panic if any of the validation checks performed by
116    /// [`try_new`](Self::try_new) fails.
117    pub fn new(views: Buffer<BinaryView>, buffers: Arc<Box<[ByteBuffer]>>, validity: Mask) -> Self {
118        Self::try_new(views, buffers, validity).vortex_expect("Failed to create `BinaryViewVector`")
119    }
120
121    /// Create a new [`BinaryViewVector`] from the provided components with validation.
122    ///
123    /// # Errors
124    ///
125    /// This function will return an error if any of the following validation checks fails:
126    ///
127    /// 1. The length of the `views` does not match the length of the provided `validity`
128    /// 2. Any non-null `views` point to invalid `buffers` or buffer offset ranges
129    /// 3. Any data stored inlined or in the `buffers` and referenced by the `views` does not
130    ///    conform to the [validation constraints][BinaryViewType::validate] of this view type.
131    pub fn try_new(
132        views: Buffer<BinaryView>,
133        buffers: Arc<Box<[ByteBuffer]>>,
134        validity: Mask,
135    ) -> VortexResult<Self> {
136        vortex_ensure!(
137            views.len() == validity.len(),
138            "views buffer length {} != validity length {}",
139            views.len(),
140            validity.len()
141        );
142
143        validate_views(
144            &views,
145            &*buffers,
146            |index| validity.value(index),
147            T::validate,
148        )?;
149
150        Ok(Self {
151            views,
152            buffers,
153            validity,
154            _marker: std::marker::PhantomData,
155        })
156    }
157
158    /// Decomposes the vector into its constituent parts.
159    pub fn into_parts(self) -> (Buffer<BinaryView>, Arc<Box<[ByteBuffer]>>, Mask) {
160        (self.views, self.buffers, self.validity)
161    }
162
163    /// Get the `index` item from the vector as an owned `Scalar` type with zero-copy.
164    ///
165    /// This function will panic is `index` is out of range for the vector's length.
166    pub fn get(&self, index: usize) -> Option<T::Scalar> {
167        if !self.validity.value(index) {
168            return None;
169        }
170
171        let view = &self.views[index];
172        if view.is_inlined() {
173            let view = view.as_inlined();
174
175            // We find the occurrence of the inlined data in the views buffer.
176            let buffer = self
177                .views
178                .clone()
179                .into_byte_buffer()
180                .aligned(Alignment::none())
181                .slice_ref(&view.data[..view.size as usize]);
182
183            // SAFETY: validation that the string data contained in this vector is performed
184            //  at construction time, either in the constructor for safe construction, or by
185            //  the caller (when using the unchecked constructor).
186            Some(unsafe { T::scalar_from_buffer_unchecked(buffer) })
187        } else {
188            // Get a pointer into the buffer range
189            let view_ref = view.as_view();
190            let buffer = &self.buffers[view_ref.buffer_index as usize];
191
192            let start = view_ref.offset as usize;
193            let length = view_ref.size as usize;
194            let buffer_slice = buffer.slice(start..start + length);
195
196            // SAFETY: validation that the string data contained in this vector is performed
197            //  at construction time, either in the constructor for safe construction, or by
198            //  the caller (when using the unchecked constructor).
199            Some(unsafe { T::scalar_from_buffer_unchecked(buffer_slice) })
200        }
201    }
202
203    /// Get the `index` item from the vector as a native `Slice` type.
204    ///
205    /// This function will panic is `index` is out of range for the vector's length.
206    pub fn get_ref(&self, index: usize) -> Option<&T::Slice> {
207        if !self.validity.value(index) {
208            return None;
209        }
210
211        Some(unsafe { T::from_bytes_unchecked(self.get_ref_unchecked(index)) })
212    }
213
214    /// Get the `index` item from the vector as a [u8]
215    ///
216    /// ## SAFETY
217    ///
218    /// This function is unsafe since the validity of the vector is ignored.
219    pub unsafe fn get_ref_unchecked(&self, index: usize) -> &[u8] {
220        let view = &self.views[index];
221        if view.is_inlined() {
222            let view = view.as_inlined();
223            // SAFETY: validation that the string data contained in this vector is performed
224            //  at construction time, either in the constructor for safe construction, or by
225            //  the caller (when using the unchecked constructor).
226            &view.data[..view.size as usize]
227        } else {
228            // Get a pointer into the buffer range
229            let view_ref = view.as_view();
230            let buffer = &self.buffers[view_ref.buffer_index as usize];
231
232            let start = view_ref.offset as usize;
233            let length = view_ref.size as usize;
234
235            // SAFETY: validation that the string data contained in this vector is performed
236            //  at construction time, either in the constructor for safe construction, or by
237            //  the caller (when using the unchecked constructor).
238            &buffer.as_bytes()[start..start + length]
239        }
240    }
241
242    /// Buffers
243    pub fn buffers(&self) -> &Arc<Box<[ByteBuffer]>> {
244        &self.buffers
245    }
246
247    /// Views
248    pub fn views(&self) -> &Buffer<BinaryView> {
249        &self.views
250    }
251}
252
253impl<T: BinaryViewType> VectorOps for BinaryViewVector<T> {
254    type Mutable = BinaryViewVectorMut<T>;
255    type Scalar = BinaryViewScalar<T>;
256
257    fn len(&self) -> usize {
258        self.views.len()
259    }
260
261    fn validity(&self) -> &Mask {
262        &self.validity
263    }
264
265    fn mask_validity(&mut self, mask: &Mask) {
266        self.validity = self.validity.bitand(mask);
267    }
268
269    fn scalar_at(&self, index: usize) -> BinaryViewScalar<T> {
270        assert!(index < self.len());
271        BinaryViewScalar::<T>::new(self.get(index))
272    }
273
274    fn slice(&self, range: impl RangeBounds<usize> + Clone + Debug) -> Self {
275        BinaryViewVector {
276            views: self.views.slice(range.clone()),
277            buffers: self.buffers().clone(),
278            validity: self.validity.slice(range),
279            _marker: self._marker,
280        }
281    }
282
283    fn clear(&mut self) {
284        self.views.clear();
285        self.validity = Mask::new_true(0);
286        self.buffers = Arc::new(Box::new([]));
287    }
288
289    fn try_into_mut(self) -> Result<BinaryViewVectorMut<T>, Self> {
290        let views_mut = match self.views.try_into_mut() {
291            Ok(views_mut) => views_mut,
292            Err(views) => {
293                return Err(Self {
294                    views,
295                    validity: self.validity,
296                    buffers: self.buffers,
297                    _marker: std::marker::PhantomData,
298                });
299            }
300        };
301
302        let validity_mut = match self.validity.try_into_mut() {
303            Ok(validity_mut) => validity_mut,
304            Err(validity) => {
305                return Err(Self {
306                    views: views_mut.freeze(),
307                    validity,
308                    buffers: self.buffers,
309                    _marker: std::marker::PhantomData,
310                });
311            }
312        };
313
314        let buffers_mut = match Arc::try_unwrap(self.buffers) {
315            Ok(buffers) => buffers.into_vec(),
316            Err(buffers) => {
317                // Backup: collect a new Vec with clones of each buffer
318                buffers.iter().cloned().collect()
319            }
320        };
321
322        // SAFETY: the BinaryViewVector maintains the same invariants that are
323        //  otherwise checked in the safe BinaryViewVectorMut constructor.
324        unsafe {
325            Ok(BinaryViewVectorMut::new_unchecked(
326                views_mut,
327                validity_mut,
328                buffers_mut,
329            ))
330        }
331    }
332
333    fn into_mut(self) -> BinaryViewVectorMut<T> {
334        let views_mut = self.views.into_mut();
335        let validity_mut = self.validity.into_mut();
336
337        // If someone else has a strong reference to the `Arc`, clone the underlying data (which is
338        // just a **different** reference count increment).
339        let buffers_mut = Arc::try_unwrap(self.buffers)
340            .unwrap_or_else(|arc| (*arc).clone())
341            .into_vec();
342
343        // SAFETY: The BinaryViewVector maintains the exact same invariants as the immutable
344        // version, so all invariants are still upheld.
345        unsafe { BinaryViewVectorMut::new_unchecked(views_mut, validity_mut, buffers_mut) }
346    }
347}
348
349#[cfg(test)]
350mod tests {
351    use std::sync::Arc;
352
353    use vortex_buffer::ByteBuffer;
354    use vortex_buffer::buffer;
355    use vortex_mask::Mask;
356
357    use crate::VectorMutOps;
358    use crate::VectorOps;
359    use crate::binaryview::StringVector;
360    use crate::binaryview::StringVectorMut;
361    use crate::binaryview::view::BinaryView;
362
363    #[test]
364    #[should_panic(expected = "views buffer length 1 != validity length 100")]
365    fn test_try_new_mismatch_validity_len() {
366        StringVector::try_new(
367            buffer![BinaryView::new_inlined(b"inlined")],
368            Arc::new(Box::new([])),
369            Mask::new_true(100),
370        )
371        .unwrap();
372    }
373
374    #[test]
375    #[should_panic(
376        expected = "view at index 0 references invalid buffer: 100 out of bounds for BinaryViewVector with 0 buffers"
377    )]
378    fn test_try_new_invalid_buffer_offset() {
379        StringVector::try_new(
380            buffer![BinaryView::make_view(b"bad buffer ptr", 100, 0)],
381            Arc::new(Box::new([])),
382            Mask::new_true(1),
383        )
384        .unwrap();
385    }
386
387    #[test]
388    #[should_panic(expected = "start offset 4294967295 out of bounds for buffer 0 with size 19")]
389    fn test_try_new_invalid_length() {
390        StringVector::try_new(
391            buffer![BinaryView::make_view(b"bad buffer ptr", 0, u32::MAX)],
392            Arc::new(Box::new([ByteBuffer::copy_from(b"a very short buffer")])),
393            Mask::new_true(1),
394        )
395        .unwrap();
396    }
397
398    #[test]
399    #[should_panic(expected = "view at index 0: inlined bytes failed utf-8 validation")]
400    fn test_try_new_invalid_utf8_inlined() {
401        StringVector::try_new(
402            buffer![BinaryView::new_inlined(b"\x80")],
403            Arc::new(Box::new([])),
404            Mask::new_true(1),
405        )
406        .unwrap();
407    }
408
409    #[test]
410    #[should_panic(expected = "view at index 0: outlined bytes failed utf-8 validation")]
411    fn test_try_new_invalid_utf8_outlined() {
412        // 0xFF is never valid in UTF-8
413        let sequence = b"\xff".repeat(13);
414        StringVector::try_new(
415            buffer![BinaryView::make_view(&sequence, 0, 0)],
416            Arc::new(Box::new([ByteBuffer::copy_from(sequence)])),
417            Mask::new_true(1),
418        )
419        .unwrap();
420    }
421
422    #[test]
423    fn test_try_into_mut() {
424        let mut shared_vec = StringVectorMut::with_capacity(5);
425        shared_vec.append_nulls(2);
426        shared_vec.append_values("an example value", 2);
427        shared_vec.append_values("another example value", 1);
428
429        let shared_vec = shared_vec.freeze();
430
431        // Making a copy aliases the vector, preventing us from converting it back into mutable
432        let shared_vec2 = shared_vec.clone();
433
434        // The Err variant is returned, because the aliasing borrow from shared_vec2 is blocking us
435        // from taking unique ownership of the memory.
436        let shared_vec = shared_vec.try_into_mut().unwrap_err();
437
438        // Dropping the aliasing borrow makes it possible to cast the unique reference to mut
439        drop(shared_vec2);
440
441        assert!(shared_vec.try_into_mut().is_ok());
442    }
443
444    #[test]
445    fn test_binaryview_eq_identical_inlined() {
446        // Test equality with inlined strings (<=12 bytes).
447        let mut v1 = StringVectorMut::with_capacity(3);
448        v1.append_values("hello", 1);
449        v1.append_values("world", 1);
450        v1.append_values("test", 1);
451        let v1 = v1.freeze();
452
453        let mut v2 = StringVectorMut::with_capacity(3);
454        v2.append_values("hello", 1);
455        v2.append_values("world", 1);
456        v2.append_values("test", 1);
457        let v2 = v2.freeze();
458
459        assert_eq!(v1, v2);
460    }
461
462    #[test]
463    fn test_binaryview_eq_identical_outlined() {
464        // Test equality with outlined strings (>12 bytes).
465        let mut v1 = StringVectorMut::with_capacity(2);
466        v1.append_values("this is a longer string that won't be inlined", 1);
467        v1.append_values("another long string for testing purposes", 1);
468        let v1 = v1.freeze();
469
470        let mut v2 = StringVectorMut::with_capacity(2);
471        v2.append_values("this is a longer string that won't be inlined", 1);
472        v2.append_values("another long string for testing purposes", 1);
473        let v2 = v2.freeze();
474
475        assert_eq!(v1, v2);
476    }
477
478    #[test]
479    fn test_binaryview_eq_different_length() {
480        let mut v1 = StringVectorMut::with_capacity(3);
481        v1.append_values("a", 1);
482        v1.append_values("b", 1);
483        v1.append_values("c", 1);
484        let v1 = v1.freeze();
485
486        let mut v2 = StringVectorMut::with_capacity(2);
487        v2.append_values("a", 1);
488        v2.append_values("b", 1);
489        let v2 = v2.freeze();
490
491        assert_ne!(v1, v2);
492    }
493
494    #[test]
495    fn test_binaryview_eq_different_validity() {
496        let mut v1 = StringVectorMut::with_capacity(3);
497        v1.append_values("a", 1);
498        v1.append_values("b", 1);
499        v1.append_values("c", 1);
500        let v1 = v1.freeze();
501
502        let mut v2 = StringVectorMut::with_capacity(3);
503        v2.append_values("a", 1);
504        v2.append_nulls(1);
505        v2.append_values("c", 1);
506        let v2 = v2.freeze();
507
508        assert_ne!(v1, v2);
509    }
510
511    #[test]
512    fn test_binaryview_eq_different_values() {
513        let mut v1 = StringVectorMut::with_capacity(3);
514        v1.append_values("hello", 1);
515        v1.append_values("world", 1);
516        v1.append_values("test", 1);
517        let v1 = v1.freeze();
518
519        let mut v2 = StringVectorMut::with_capacity(3);
520        v2.append_values("hello", 1);
521        v2.append_values("DIFFERENT", 1);
522        v2.append_values("test", 1);
523        let v2 = v2.freeze();
524
525        assert_ne!(v1, v2);
526    }
527
528    #[test]
529    fn test_binaryview_eq_ignores_invalid_positions_inlined() {
530        // Two vectors with different values at invalid positions should be equal.
531        let mut v1 = StringVectorMut::with_capacity(3);
532        v1.append_values("hello", 1);
533        v1.append_values("value_a", 1); // This will be masked as invalid
534        v1.append_values("test", 1);
535        let mut v1 = v1.freeze();
536        // Mask position 1 as invalid
537        v1.mask_validity(&Mask::from_iter([true, false, true]));
538
539        let mut v2 = StringVectorMut::with_capacity(3);
540        v2.append_values("hello", 1);
541        v2.append_values("value_b", 1); // Different value at invalid position
542        v2.append_values("test", 1);
543        let mut v2 = v2.freeze();
544        v2.mask_validity(&Mask::from_iter([true, false, true]));
545
546        assert_eq!(v1, v2);
547    }
548
549    #[test]
550    fn test_binaryview_eq_ignores_invalid_positions_outlined() {
551        // Test with outlined strings at invalid positions.
552        let mut v1 = StringVectorMut::with_capacity(3);
553        v1.append_values("this is a very long string that will be outlined", 1);
554        v1.append_values("another long value that differs between vectors A", 1);
555        v1.append_values("yet another long string for the test", 1);
556        let mut v1 = v1.freeze();
557        v1.mask_validity(&Mask::from_iter([true, false, true]));
558
559        let mut v2 = StringVectorMut::with_capacity(3);
560        v2.append_values("this is a very long string that will be outlined", 1);
561        v2.append_values("different long value at the invalid position B", 1);
562        v2.append_values("yet another long string for the test", 1);
563        let mut v2 = v2.freeze();
564        v2.mask_validity(&Mask::from_iter([true, false, true]));
565
566        assert_eq!(v1, v2);
567    }
568
569    #[test]
570    fn test_binaryview_eq_empty() {
571        let v1 = StringVectorMut::with_capacity(0).freeze();
572        let v2 = StringVectorMut::with_capacity(0).freeze();
573
574        assert_eq!(v1, v2);
575    }
576
577    #[test]
578    fn test_binaryview_eq_all_nulls() {
579        let mut v1 = StringVectorMut::with_capacity(3);
580        v1.append_nulls(3);
581        let v1 = v1.freeze();
582
583        let mut v2 = StringVectorMut::with_capacity(3);
584        v2.append_nulls(3);
585        let v2 = v2.freeze();
586
587        assert_eq!(v1, v2);
588    }
589}