vortex_vector/binaryview/
vector.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4//! Variable-length binary vector implementation.
5
6use std::fmt::Debug;
7use std::ops::BitAnd;
8use std::ops::RangeBounds;
9use std::sync::Arc;
10
11use vortex_buffer::Alignment;
12use vortex_buffer::Buffer;
13use vortex_buffer::ByteBuffer;
14use vortex_error::VortexExpect;
15use vortex_error::VortexResult;
16use vortex_error::vortex_ensure;
17use vortex_mask::Mask;
18
19use crate::VectorOps;
20use crate::binaryview::BinaryViewScalar;
21use crate::binaryview::BinaryViewType;
22use crate::binaryview::vector_mut::BinaryViewVectorMut;
23use crate::binaryview::view::BinaryView;
24use crate::binaryview::view::validate_views;
25
/// A variable-length binary vector.
///
/// This is the core vector for string and binary data.
///
/// Every element is described by a [`BinaryView`]. A view either carries its bytes inline or
/// refers to a byte range (buffer index, offset and size) inside one of the `buffers`.
#[derive(Debug, Clone)]
pub struct BinaryViewVector<T: BinaryViewType> {
    /// One view per element; a view either inlines its bytes or points into `buffers`.
    views: Buffer<BinaryView>,
    /// Data buffers referenced by the non-inlined views, shared behind an [`Arc`].
    buffers: Arc<Box<[ByteBuffer]>>,
    /// Validity mask for the vector. The checked constructors require it to have the same
    /// length as `views`.
    validity: Mask,
    /// Zero-sized marker that ties the vector to its [`BinaryViewType`] without storing a `T`.
    _marker: std::marker::PhantomData<T>,
}
40
41impl<T: BinaryViewType> BinaryViewVector<T> {
42    /// Creates a new [`BinaryViewVector`] from the provided components.
43    ///
44    /// # Safety
45    ///
46    /// This function is unsafe because it does not validate the consistency of the provided
47    /// components.
48    ///
49    /// The caller must uphold all validation that would otherwise be validated by
50    /// the [safe constructor](Self::try_new).
51    pub unsafe fn new_unchecked(
52        views: Buffer<BinaryView>,
53        buffers: Arc<Box<[ByteBuffer]>>,
54        validity: Mask,
55    ) -> Self {
56        if cfg!(debug_assertions) {
57            Self::new(views, buffers, validity)
58        } else {
59            Self {
60                views,
61                validity,
62                buffers,
63                _marker: std::marker::PhantomData,
64            }
65        }
66    }
67
68    /// Create a new `BinaryViewVector` from its components, panicking if validation fails.
69    ///
70    /// # Errors
71    ///
72    /// This function will panic if any of the validation checks performed by
73    /// [`try_new`](Self::try_new) fails.
74    pub fn new(views: Buffer<BinaryView>, buffers: Arc<Box<[ByteBuffer]>>, validity: Mask) -> Self {
75        Self::try_new(views, buffers, validity).vortex_expect("Failed to create `BinaryViewVector`")
76    }
77
78    /// Create a new [`BinaryViewVector`] from the provided components with validation.
79    ///
80    /// # Errors
81    ///
82    /// This function will return an error if any of the following validation checks fails:
83    ///
84    /// 1. The length of the `views` does not match the length of the provided `validity`
85    /// 2. Any non-null `views` point to invalid `buffers` or buffer offset ranges
86    /// 3. Any data stored inlined or in the `buffers` and referenced by the `views` does not
87    ///    conform to the [validation constraints][BinaryViewType::validate] of this view type.
88    pub fn try_new(
89        views: Buffer<BinaryView>,
90        buffers: Arc<Box<[ByteBuffer]>>,
91        validity: Mask,
92    ) -> VortexResult<Self> {
93        vortex_ensure!(
94            views.len() == validity.len(),
95            "views buffer length {} != validity length {}",
96            views.len(),
97            validity.len()
98        );
99
100        validate_views(
101            &views,
102            &*buffers,
103            |index| validity.value(index),
104            T::validate,
105        )?;
106
107        Ok(Self {
108            views,
109            buffers,
110            validity,
111            _marker: std::marker::PhantomData,
112        })
113    }
114
115    /// Decomposes the vector into its constituent parts.
116    pub fn into_parts(self) -> (Buffer<BinaryView>, Arc<Box<[ByteBuffer]>>, Mask) {
117        (self.views, self.buffers, self.validity)
118    }
119
120    /// Get the `index` item from the vector as an owned `Scalar` type with zero-copy.
121    ///
122    /// This function will panic is `index` is out of range for the vector's length.
123    pub fn get(&self, index: usize) -> Option<T::Scalar> {
124        if !self.validity.value(index) {
125            return None;
126        }
127
128        let view = &self.views[index];
129        if view.is_inlined() {
130            let view = view.as_inlined();
131
132            // We find the occurrence of the inlined data in the views buffer.
133            let buffer = self
134                .views
135                .clone()
136                .into_byte_buffer()
137                .aligned(Alignment::none())
138                .slice_ref(&view.data[..view.size as usize]);
139
140            // SAFETY: validation that the string data contained in this vector is performed
141            //  at construction time, either in the constructor for safe construction, or by
142            //  the caller (when using the unchecked constructor).
143            Some(unsafe { T::scalar_from_buffer_unchecked(buffer) })
144        } else {
145            // Get a pointer into the buffer range
146            let view_ref = view.as_view();
147            let buffer = &self.buffers[view_ref.buffer_index as usize];
148
149            let start = view_ref.offset as usize;
150            let length = view_ref.size as usize;
151            let buffer_slice = buffer.slice(start..start + length);
152
153            // SAFETY: validation that the string data contained in this vector is performed
154            //  at construction time, either in the constructor for safe construction, or by
155            //  the caller (when using the unchecked constructor).
156            Some(unsafe { T::scalar_from_buffer_unchecked(buffer_slice) })
157        }
158    }
159
160    /// Get the `index` item from the vector as a native `Slice` type.
161    ///
162    /// This function will panic is `index` is out of range for the vector's length.
163    pub fn get_ref(&self, index: usize) -> Option<&T::Slice> {
164        if !self.validity.value(index) {
165            return None;
166        }
167
168        let view = &self.views[index];
169        if view.is_inlined() {
170            let view = view.as_inlined();
171            // SAFETY: validation that the string data contained in this vector is performed
172            //  at construction time, either in the constructor for safe construction, or by
173            //  the caller (when using the unchecked constructor).
174            Some(unsafe { T::from_bytes_unchecked(&view.data[..view.size as usize]) })
175        } else {
176            // Get a pointer into the buffer range
177            let view_ref = view.as_view();
178            let buffer = &self.buffers[view_ref.buffer_index as usize];
179
180            let start = view_ref.offset as usize;
181            let length = view_ref.size as usize;
182
183            // SAFETY: validation that the string data contained in this vector is performed
184            //  at construction time, either in the constructor for safe construction, or by
185            //  the caller (when using the unchecked constructor).
186            Some(unsafe { T::from_bytes_unchecked(&buffer.as_bytes()[start..start + length]) })
187        }
188    }
189
    /// Returns a reference to the shared data buffers that non-inlined views index into.
    pub fn buffers(&self) -> &Arc<Box<[ByteBuffer]>> {
        &self.buffers
    }
194
    /// Returns a reference to the buffer of views, one [`BinaryView`] per element.
    pub fn views(&self) -> &Buffer<BinaryView> {
        &self.views
    }
199}
200
201impl<T: BinaryViewType> VectorOps for BinaryViewVector<T> {
202    type Mutable = BinaryViewVectorMut<T>;
203    type Scalar = BinaryViewScalar<T>;
204
    /// Returns the number of elements, which is the number of views.
    fn len(&self) -> usize {
        self.views.len()
    }
208
    /// Returns a reference to the validity mask of this vector.
    fn validity(&self) -> &Mask {
        &self.validity
    }
212
213    fn mask_validity(&mut self, mask: &Mask) {
214        self.validity = self.validity.bitand(mask);
215    }
216
217    fn scalar_at(&self, index: usize) -> BinaryViewScalar<T> {
218        assert!(index < self.len());
219        BinaryViewScalar::<T>::new(self.get(index))
220    }
221
    /// Slicing is not implemented yet for this vector type.
    ///
    /// # Panics
    ///
    /// Always panics, because the body is still a `todo!()` placeholder.
    fn slice(&self, _range: impl RangeBounds<usize> + Clone + Debug) -> Self {
        todo!()
    }
225
226    fn clear(&mut self) {
227        self.views.clear();
228        self.validity = Mask::new_true(0);
229        self.buffers = Arc::new(Box::new([]));
230    }
231
232    fn try_into_mut(self) -> Result<BinaryViewVectorMut<T>, Self> {
233        let views_mut = match self.views.try_into_mut() {
234            Ok(views_mut) => views_mut,
235            Err(views) => {
236                return Err(Self {
237                    views,
238                    validity: self.validity,
239                    buffers: self.buffers,
240                    _marker: std::marker::PhantomData,
241                });
242            }
243        };
244
245        let validity_mut = match self.validity.try_into_mut() {
246            Ok(validity_mut) => validity_mut,
247            Err(validity) => {
248                return Err(Self {
249                    views: views_mut.freeze(),
250                    validity,
251                    buffers: self.buffers,
252                    _marker: std::marker::PhantomData,
253                });
254            }
255        };
256
257        let buffers_mut = match Arc::try_unwrap(self.buffers) {
258            Ok(buffers) => buffers.into_vec(),
259            Err(buffers) => {
260                // Backup: collect a new Vec with clones of each buffer
261                buffers.iter().cloned().collect()
262            }
263        };
264
265        // SAFETY: the BinaryViewVector maintains the same invariants that are
266        //  otherwise checked in the safe BinaryViewVectorMut constructor.
267        unsafe {
268            Ok(BinaryViewVectorMut::new_unchecked(
269                views_mut,
270                validity_mut,
271                buffers_mut,
272            ))
273        }
274    }
275
276    fn into_mut(self) -> BinaryViewVectorMut<T> {
277        let views_mut = self.views.into_mut();
278        let validity_mut = self.validity.into_mut();
279
280        // If someone else has a strong reference to the `Arc`, clone the underlying data (which is
281        // just a **different** reference count increment).
282        let buffers_mut = Arc::try_unwrap(self.buffers)
283            .unwrap_or_else(|arc| (*arc).clone())
284            .into_vec();
285
286        // SAFETY: The BinaryViewVector maintains the exact same invariants as the immutable
287        // version, so all invariants are still upheld.
288        unsafe { BinaryViewVectorMut::new_unchecked(views_mut, validity_mut, buffers_mut) }
289    }
290}
291
292#[cfg(test)]
293mod tests {
294    use std::sync::Arc;
295
296    use vortex_buffer::ByteBuffer;
297    use vortex_buffer::buffer;
298    use vortex_mask::Mask;
299
300    use crate::VectorMutOps;
301    use crate::VectorOps;
302    use crate::binaryview::StringVector;
303    use crate::binaryview::StringVectorMut;
304    use crate::binaryview::view::BinaryView;
305
306    #[test]
307    #[should_panic(expected = "views buffer length 1 != validity length 100")]
308    fn test_try_new_mismatch_validity_len() {
309        StringVector::try_new(
310            buffer![BinaryView::new_inlined(b"inlined")],
311            Arc::new(Box::new([])),
312            Mask::new_true(100),
313        )
314        .unwrap();
315    }
316
317    #[test]
318    #[should_panic(
319        expected = "view at index 0 references invalid buffer: 100 out of bounds for BinaryViewVector with 0 buffers"
320    )]
321    fn test_try_new_invalid_buffer_offset() {
322        StringVector::try_new(
323            buffer![BinaryView::make_view(b"bad buffer ptr", 100, 0)],
324            Arc::new(Box::new([])),
325            Mask::new_true(1),
326        )
327        .unwrap();
328    }
329
330    #[test]
331    #[should_panic(expected = "start offset 4294967295 out of bounds for buffer 0 with size 19")]
332    fn test_try_new_invalid_length() {
333        StringVector::try_new(
334            buffer![BinaryView::make_view(b"bad buffer ptr", 0, u32::MAX)],
335            Arc::new(Box::new([ByteBuffer::copy_from(b"a very short buffer")])),
336            Mask::new_true(1),
337        )
338        .unwrap();
339    }
340
341    #[test]
342    #[should_panic(expected = "view at index 0: inlined bytes failed utf-8 validation")]
343    fn test_try_new_invalid_utf8_inlined() {
344        StringVector::try_new(
345            buffer![BinaryView::new_inlined(b"\x80")],
346            Arc::new(Box::new([])),
347            Mask::new_true(1),
348        )
349        .unwrap();
350    }
351
352    #[test]
353    #[should_panic(expected = "view at index 0: outlined bytes failed utf-8 validation")]
354    fn test_try_new_invalid_utf8_outlined() {
355        // 0xFF is never valid in UTF-8
356        let sequence = b"\xff".repeat(13);
357        StringVector::try_new(
358            buffer![BinaryView::make_view(&sequence, 0, 0)],
359            Arc::new(Box::new([ByteBuffer::copy_from(sequence)])),
360            Mask::new_true(1),
361        )
362        .unwrap();
363    }
364
365    #[test]
366    fn test_try_into_mut() {
367        let mut shared_vec = StringVectorMut::with_capacity(5);
368        shared_vec.append_nulls(2);
369        shared_vec.append_values("an example value", 2);
370        shared_vec.append_values("another example value", 1);
371
372        let shared_vec = shared_vec.freeze();
373
374        // Making a copy aliases the vector, preventing us from converting it back into mutable
375        let shared_vec2 = shared_vec.clone();
376
377        // The Err variant is returned, because the aliasing borrow from shared_vec2 is blocking us
378        // from taking unique ownership of the memory.
379        let shared_vec = shared_vec.try_into_mut().unwrap_err();
380
381        // Dropping the aliasing borrow makes it possible to cast the unique reference to mut
382        drop(shared_vec2);
383
384        assert!(shared_vec.try_into_mut().is_ok());
385    }
386}