vortex_vector/binaryview/
vector.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4//! Variable-length binary vector implementation.
5
6use std::fmt::Debug;
7use std::ops::RangeBounds;
8use std::sync::Arc;
9
10use vortex_buffer::{Alignment, Buffer, ByteBuffer};
11use vortex_error::{VortexExpect, VortexResult, vortex_ensure};
12use vortex_mask::Mask;
13
14use crate::binaryview::vector_mut::BinaryViewVectorMut;
15use crate::binaryview::view::{BinaryView, validate_views};
16use crate::binaryview::{BinaryViewScalar, BinaryViewType};
17use crate::{Scalar, VectorOps};
18
/// A variable-length binary vector.
///
/// This is the core vector for string and binary data.
///
/// Every element is described by a [`BinaryView`]. A view either carries its bytes inline
/// or refers to a byte range (buffer index, offset and size) inside one of the `buffers`.
#[derive(Debug, Clone)]
pub struct BinaryViewVector<T: BinaryViewType> {
    /// Views into the binary data, one per element.
    views: Buffer<BinaryView>,
    /// Buffers holding the binary data referenced by non-inlined views.
    buffers: Arc<Box<[ByteBuffer]>>,
    /// Validity mask for the vector; the safe constructor requires it to have the same
    /// length as `views`.
    validity: Mask,
    /// Zero-sized marker binding the vector to its [`BinaryViewType`].
    _marker: std::marker::PhantomData<T>,
}
33
34impl<T: BinaryViewType> BinaryViewVector<T> {
35    /// Creates a new [`BinaryViewVector`] from the provided components.
36    ///
37    /// # Safety
38    ///
39    /// This function is unsafe because it does not validate the consistency of the provided
40    /// components.
41    ///
42    /// The caller must uphold all validation that would otherwise be validated by
43    /// the [safe constructor](Self::try_new).
44    pub unsafe fn new_unchecked(
45        views: Buffer<BinaryView>,
46        buffers: Arc<Box<[ByteBuffer]>>,
47        validity: Mask,
48    ) -> Self {
49        if cfg!(debug_assertions) {
50            Self::new(views, buffers, validity)
51        } else {
52            Self {
53                views,
54                validity,
55                buffers,
56                _marker: std::marker::PhantomData,
57            }
58        }
59    }
60
61    /// Create a new `BinaryViewVector` from its components, panicking if validation fails.
62    ///
63    /// # Errors
64    ///
65    /// This function will panic if any of the validation checks performed by
66    /// [`try_new`](Self::try_new) fails.
67    pub fn new(views: Buffer<BinaryView>, buffers: Arc<Box<[ByteBuffer]>>, validity: Mask) -> Self {
68        Self::try_new(views, buffers, validity).vortex_expect("Failed to create `BinaryViewVector`")
69    }
70
71    /// Create a new [`BinaryViewVector`] from the provided components with validation.
72    ///
73    /// # Errors
74    ///
75    /// This function will return an error if any of the following validation checks fails:
76    ///
77    /// 1. The length of the `views` does not match the length of the provided `validity`
78    /// 2. Any non-null `views` point to invalid `buffers` or buffer offset ranges
79    /// 3. Any data stored inlined or in the `buffers` and referenced by the `views` does not
80    ///    conform to the [validation constraints][BinaryViewType::validate] of this view type.
81    pub fn try_new(
82        views: Buffer<BinaryView>,
83        buffers: Arc<Box<[ByteBuffer]>>,
84        validity: Mask,
85    ) -> VortexResult<Self> {
86        vortex_ensure!(
87            views.len() == validity.len(),
88            "views buffer length {} != validity length {}",
89            views.len(),
90            validity.len()
91        );
92
93        validate_views(
94            &views,
95            &*buffers,
96            |index| validity.value(index),
97            T::validate,
98        )?;
99
100        Ok(Self {
101            views,
102            buffers,
103            validity,
104            _marker: std::marker::PhantomData,
105        })
106    }
107
108    /// Decomposes the vector into its constituent parts.
109    pub fn into_parts(self) -> (Buffer<BinaryView>, Arc<Box<[ByteBuffer]>>, Mask) {
110        (self.views, self.buffers, self.validity)
111    }
112
113    /// Get the `index` item from the vector as an owned `Scalar` type with zero-copy.
114    ///
115    /// This function will panic is `index` is out of range for the vector's length.
116    pub fn get(&self, index: usize) -> Option<T::Scalar> {
117        if !self.validity.value(index) {
118            return None;
119        }
120
121        let view = &self.views[index];
122        if view.is_inlined() {
123            let view = view.as_inlined();
124
125            // We find the occurrence of the inlined data in the views buffer.
126            let buffer = self
127                .views
128                .clone()
129                .into_byte_buffer()
130                .aligned(Alignment::none())
131                .slice_ref(&view.data[..view.size as usize]);
132
133            // SAFETY: validation that the string data contained in this vector is performed
134            //  at construction time, either in the constructor for safe construction, or by
135            //  the caller (when using the unchecked constructor).
136            Some(unsafe { T::scalar_from_buffer_unchecked(buffer) })
137        } else {
138            // Get a pointer into the buffer range
139            let view_ref = view.as_view();
140            let buffer = &self.buffers[view_ref.buffer_index as usize];
141
142            let start = view_ref.offset as usize;
143            let length = view_ref.size as usize;
144            let buffer_slice = buffer.slice(start..start + length);
145
146            // SAFETY: validation that the string data contained in this vector is performed
147            //  at construction time, either in the constructor for safe construction, or by
148            //  the caller (when using the unchecked constructor).
149            Some(unsafe { T::scalar_from_buffer_unchecked(buffer_slice) })
150        }
151    }
152
153    /// Get the `index` item from the vector as a native `Slice` type.
154    ///
155    /// This function will panic is `index` is out of range for the vector's length.
156    pub fn get_ref(&self, index: usize) -> Option<&T::Slice> {
157        if !self.validity.value(index) {
158            return None;
159        }
160
161        let view = &self.views[index];
162        if view.is_inlined() {
163            let view = view.as_inlined();
164            // SAFETY: validation that the string data contained in this vector is performed
165            //  at construction time, either in the constructor for safe construction, or by
166            //  the caller (when using the unchecked constructor).
167            Some(unsafe { T::from_bytes_unchecked(&view.data[..view.size as usize]) })
168        } else {
169            // Get a pointer into the buffer range
170            let view_ref = view.as_view();
171            let buffer = &self.buffers[view_ref.buffer_index as usize];
172
173            let start = view_ref.offset as usize;
174            let length = view_ref.size as usize;
175
176            // SAFETY: validation that the string data contained in this vector is performed
177            //  at construction time, either in the constructor for safe construction, or by
178            //  the caller (when using the unchecked constructor).
179            Some(unsafe { T::from_bytes_unchecked(&buffer.as_bytes()[start..start + length]) })
180        }
181    }
182
183    /// Buffers
184    pub fn buffers(&self) -> &Arc<Box<[ByteBuffer]>> {
185        &self.buffers
186    }
187
188    /// Views
189    pub fn views(&self) -> &Buffer<BinaryView> {
190        &self.views
191    }
192}
193
194impl<T: BinaryViewType> VectorOps for BinaryViewVector<T> {
195    type Mutable = BinaryViewVectorMut<T>;
196
197    fn len(&self) -> usize {
198        self.views.len()
199    }
200
201    fn validity(&self) -> &Mask {
202        &self.validity
203    }
204
205    fn scalar_at(&self, index: usize) -> Scalar {
206        assert!(index < self.len());
207        BinaryViewScalar::<T>::from(self.get(index)).into()
208    }
209
    /// Slicing is not implemented yet for this vector type.
    ///
    /// Calling this method currently always panics via `todo!()`; the range argument is
    /// ignored.
    fn slice(&self, _range: impl RangeBounds<usize> + Clone + Debug) -> Self {
        todo!()
    }
213
214    fn try_into_mut(self) -> Result<BinaryViewVectorMut<T>, Self> {
215        let views_mut = match self.views.try_into_mut() {
216            Ok(views_mut) => views_mut,
217            Err(views) => {
218                return Err(Self {
219                    views,
220                    validity: self.validity,
221                    buffers: self.buffers,
222                    _marker: std::marker::PhantomData,
223                });
224            }
225        };
226
227        let validity_mut = match self.validity.try_into_mut() {
228            Ok(validity_mut) => validity_mut,
229            Err(validity) => {
230                return Err(Self {
231                    views: views_mut.freeze(),
232                    validity,
233                    buffers: self.buffers,
234                    _marker: std::marker::PhantomData,
235                });
236            }
237        };
238
239        let buffers_mut = match Arc::try_unwrap(self.buffers) {
240            Ok(buffers) => buffers.into_vec(),
241            Err(buffers) => {
242                // Backup: collect a new Vec with clones of each buffer
243                buffers.iter().cloned().collect()
244            }
245        };
246
247        // SAFETY: the BinaryViewVector maintains the same invariants that are
248        //  otherwise checked in the safe BinaryViewVectorMut constructor.
249        unsafe {
250            Ok(BinaryViewVectorMut::new_unchecked(
251                views_mut,
252                validity_mut,
253                buffers_mut,
254            ))
255        }
256    }
257
258    fn into_mut(self) -> BinaryViewVectorMut<T> {
259        let views_mut = self.views.into_mut();
260        let validity_mut = self.validity.into_mut();
261
262        // If someone else has a strong reference to the `Arc`, clone the underlying data (which is
263        // just a **different** reference count increment).
264        let buffers_mut = Arc::try_unwrap(self.buffers)
265            .unwrap_or_else(|arc| (*arc).clone())
266            .into_vec();
267
268        // SAFETY: The BinaryViewVector maintains the exact same invariants as the immutable
269        // version, so all invariants are still upheld.
270        unsafe { BinaryViewVectorMut::new_unchecked(views_mut, validity_mut, buffers_mut) }
271    }
272}
273
#[cfg(test)]
mod tests {
    use std::sync::Arc;

    use vortex_buffer::{ByteBuffer, buffer};
    use vortex_mask::Mask;

    use crate::binaryview::view::BinaryView;
    use crate::binaryview::{StringVector, StringVectorMut};
    use crate::{VectorMutOps, VectorOps};

    /// A single view paired with a validity mask of length 100 must be rejected.
    #[test]
    #[should_panic(expected = "views buffer length 1 != validity length 100")]
    fn test_try_new_mismatch_validity_len() {
        StringVector::try_new(
            buffer![BinaryView::new_inlined(b"inlined")],
            Arc::new(Box::new([])),
            Mask::new_true(100),
        )
        .unwrap();
    }

    /// A view naming buffer index 100 must be rejected when no buffers are provided.
    #[test]
    #[should_panic(
        expected = "view at index 0 references invalid buffer: 100 out of bounds for BinaryViewVector with 0 buffers"
    )]
    fn test_try_new_invalid_buffer_offset() {
        StringVector::try_new(
            buffer![BinaryView::make_view(b"bad buffer ptr", 100, 0)],
            Arc::new(Box::new([])),
            Mask::new_true(1),
        )
        .unwrap();
    }

    /// A view whose start offset is `u32::MAX` must be rejected for a 19-byte buffer.
    #[test]
    #[should_panic(expected = "start offset 4294967295 out of bounds for buffer 0 with size 19")]
    fn test_try_new_invalid_length() {
        StringVector::try_new(
            buffer![BinaryView::make_view(b"bad buffer ptr", 0, u32::MAX)],
            Arc::new(Box::new([ByteBuffer::copy_from(b"a very short buffer")])),
            Mask::new_true(1),
        )
        .unwrap();
    }

    /// Inlined bytes that are not valid UTF-8 must be rejected by a string vector.
    #[test]
    #[should_panic(expected = "view at index 0: inlined bytes failed utf-8 validation")]
    fn test_try_new_invalid_utf8_inlined() {
        StringVector::try_new(
            buffer![BinaryView::new_inlined(b"\x80")],
            Arc::new(Box::new([])),
            Mask::new_true(1),
        )
        .unwrap();
    }

    /// Buffer-backed (outlined) bytes that are not valid UTF-8 must be rejected as well.
    #[test]
    #[should_panic(expected = "view at index 0: outlined bytes failed utf-8 validation")]
    fn test_try_new_invalid_utf8_outlined() {
        // 0xFF is never valid in UTF-8
        let sequence = b"\xff".repeat(13);
        StringVector::try_new(
            buffer![BinaryView::make_view(&sequence, 0, 0)],
            Arc::new(Box::new([ByteBuffer::copy_from(sequence)])),
            Mask::new_true(1),
        )
        .unwrap();
    }

    /// `try_into_mut` must fail while a clone of the vector is alive and succeed once that
    /// clone has been dropped.
    #[test]
    fn test_try_into_mut() {
        let mut shared_vec = StringVectorMut::with_capacity(5);
        shared_vec.append_nulls(2);
        shared_vec.append_values("an example value", 2);
        shared_vec.append_values("another example value", 1);

        let shared_vec = shared_vec.freeze();

        // Making a copy aliases the vector, preventing us from converting it back into mutable
        let shared_vec2 = shared_vec.clone();

        // The Err variant is returned, because the aliasing clone held by shared_vec2 is
        // blocking us from taking unique ownership of the memory.
        let shared_vec = shared_vec.try_into_mut().unwrap_err();

        // Dropping the aliasing clone makes it possible to convert the now-unique vector
        // into its mutable form
        drop(shared_vec2);

        assert!(shared_vec.try_into_mut().is_ok());
    }
}
365}