vortex_array/arrays/varbinview/
mod.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4use std::fmt::{Debug, Formatter};
5use std::ops::Range;
6
7use static_assertions::{assert_eq_align, assert_eq_size};
8use vortex_buffer::{Alignment, Buffer, ByteBuffer};
9use vortex_dtype::{DType, Nullability};
10use vortex_error::{VortexResult, VortexUnwrap, vortex_bail, vortex_panic};
11
12use crate::builders::{ArrayBuilder, VarBinViewBuilder};
13use crate::stats::{ArrayStats, StatsSetRef};
14use crate::validity::Validity;
15use crate::vtable::{
16    ArrayVTable, CanonicalVTable, NotSupported, VTable, ValidityHelper,
17    ValidityVTableFromValidityHelper,
18};
19use crate::{Canonical, EncodingId, EncodingRef, vtable};
20
21mod accessor;
22mod compact;
23mod compute;
24mod ops;
25mod serde;
26
27pub use compact::*;
28
29#[derive(Clone, Copy, Debug, PartialEq, Eq)]
30#[repr(C, align(8))]
31pub struct Inlined {
32    size: u32,
33    data: [u8; BinaryView::MAX_INLINED_SIZE],
34}
35
36impl Inlined {
37    fn new<const N: usize>(value: &[u8]) -> Self {
38        let mut inlined = Self {
39            size: N.try_into().vortex_unwrap(),
40            data: [0u8; BinaryView::MAX_INLINED_SIZE],
41        };
42        inlined.data[..N].copy_from_slice(&value[..N]);
43        inlined
44    }
45
46    #[inline]
47    pub fn value(&self) -> &[u8] {
48        &self.data[0..(self.size as usize)]
49    }
50}
51
/// Out-of-line form of a binary view: the value lives in one of the array's data buffers and
/// the view records where to find it.
///
/// Layout (`repr(C)`): four consecutive 4-byte fields, 16 bytes in total.
#[derive(Clone, Copy, Debug)]
#[repr(C, align(8))]
pub struct Ref {
    /// Length of the referenced value in bytes.
    size: u32,
    /// The first four bytes of the referenced value.
    prefix: [u8; 4],
    /// Index of the data buffer that holds the value.
    buffer_index: u32,
    /// Byte offset of the value inside that data buffer.
    offset: u32,
}

impl Ref {
    /// Creates a reference view from its four components.
    pub fn new(size: u32, prefix: [u8; 4], buffer_index: u32, offset: u32) -> Self {
        Self {
            size,
            prefix,
            buffer_index,
            offset,
        }
    }

    /// Index of the data buffer that holds the value.
    #[inline]
    pub fn buffer_index(&self) -> u32 {
        self.buffer_index
    }

    /// Byte offset of the value inside its data buffer.
    #[inline]
    pub fn offset(&self) -> u32 {
        self.offset
    }

    /// The first four bytes of the value.
    #[inline]
    pub fn prefix(&self) -> &[u8; 4] {
        &self.prefix
    }

    /// Byte range `offset..offset + size` of the value inside its data buffer.
    #[inline]
    pub fn to_range(&self) -> Range<usize> {
        // Widen before adding: `offset + size` can exceed `u32::MAX` for values stored near
        // the end of a buffer approaching 4 GiB, which would overflow if summed as `u32`.
        let start = self.offset as usize;
        let len = self.size as usize;
        start..start + len
    }
}
91
92#[derive(Clone, Copy)]
93#[repr(C, align(16))]
94pub union BinaryView {
95    // Numeric representation. This is logically `u128`, but we split it into the high and low
96    // bits to preserve the alignment.
97    le_bytes: [u8; 16],
98
99    // Inlined representation: strings <= 12 bytes
100    inlined: Inlined,
101
102    // Reference type: strings > 12 bytes.
103    _ref: Ref,
104}
105
106assert_eq_size!(BinaryView, [u8; 16]);
107assert_eq_size!(Inlined, [u8; 16]);
108assert_eq_size!(Ref, [u8; 16]);
109assert_eq_align!(BinaryView, u128);
110
111impl BinaryView {
112    pub const MAX_INLINED_SIZE: usize = 12;
113
114    /// Create a view from a value, block and offset
115    ///
116    /// Depending on the length of the provided value either a new inlined
117    /// or a reference view will be constructed.
118    ///
119    /// Adapted from arrow-rs <https://github.com/apache/arrow-rs/blob/f4fde769ab6e1a9b75f890b7f8b47bc22800830b/arrow-array/src/builder/generic_bytes_view_builder.rs#L524>
120    /// Explicitly enumerating inlined view produces code that avoids calling generic `ptr::copy_non_interleave` that's slower than explicit stores
121    #[inline(never)]
122    pub fn make_view(value: &[u8], block: u32, offset: u32) -> Self {
123        match value.len() {
124            0 => Self {
125                inlined: Inlined::new::<0>(value),
126            },
127            1 => Self {
128                inlined: Inlined::new::<1>(value),
129            },
130            2 => Self {
131                inlined: Inlined::new::<2>(value),
132            },
133            3 => Self {
134                inlined: Inlined::new::<3>(value),
135            },
136            4 => Self {
137                inlined: Inlined::new::<4>(value),
138            },
139            5 => Self {
140                inlined: Inlined::new::<5>(value),
141            },
142            6 => Self {
143                inlined: Inlined::new::<6>(value),
144            },
145            7 => Self {
146                inlined: Inlined::new::<7>(value),
147            },
148            8 => Self {
149                inlined: Inlined::new::<8>(value),
150            },
151            9 => Self {
152                inlined: Inlined::new::<9>(value),
153            },
154            10 => Self {
155                inlined: Inlined::new::<10>(value),
156            },
157            11 => Self {
158                inlined: Inlined::new::<11>(value),
159            },
160            12 => Self {
161                inlined: Inlined::new::<12>(value),
162            },
163            _ => Self {
164                _ref: Ref::new(
165                    u32::try_from(value.len()).vortex_unwrap(),
166                    value[0..4].try_into().vortex_unwrap(),
167                    block,
168                    offset,
169                ),
170            },
171        }
172    }
173
174    /// Create a new empty view
175    #[inline]
176    pub fn empty_view() -> Self {
177        Self::new_inlined(&[])
178    }
179
180    /// Create a new inlined binary view
181    #[inline]
182    pub fn new_inlined(value: &[u8]) -> Self {
183        assert!(
184            value.len() <= Self::MAX_INLINED_SIZE,
185            "expected inlined value to be <= 12 bytes, was {}",
186            value.len()
187        );
188
189        Self::make_view(value, 0, 0)
190    }
191
192    #[inline]
193    pub fn len(&self) -> u32 {
194        unsafe { self.inlined.size }
195    }
196
197    #[inline]
198    pub fn is_empty(&self) -> bool {
199        self.len() > 0
200    }
201
202    #[inline]
203    #[allow(clippy::cast_possible_truncation)]
204    pub fn is_inlined(&self) -> bool {
205        self.len() <= (Self::MAX_INLINED_SIZE as u32)
206    }
207
208    pub fn as_inlined(&self) -> &Inlined {
209        unsafe { &self.inlined }
210    }
211
212    pub fn as_view(&self) -> &Ref {
213        unsafe { &self._ref }
214    }
215
216    pub fn as_u128(&self) -> u128 {
217        // SAFETY: binary view always safe to read as u128 LE bytes
218        unsafe { u128::from_le_bytes(self.le_bytes) }
219    }
220
221    /// Shifts the buffer reference by the view by a given offset, useful when merging many
222    /// varbinview arrays into one.
223    #[inline(always)]
224    pub fn offset_view(self, offset: u32) -> Self {
225        if self.is_inlined() {
226            self
227        } else {
228            // Referencing views must have their buffer_index adjusted with new offsets
229            let view_ref = self.as_view();
230            Self {
231                _ref: Ref::new(
232                    self.len(),
233                    *view_ref.prefix(),
234                    offset + view_ref.buffer_index(),
235                    view_ref.offset(),
236                ),
237            }
238        }
239    }
240}
241
242impl From<u128> for BinaryView {
243    fn from(value: u128) -> Self {
244        BinaryView {
245            le_bytes: value.to_le_bytes(),
246        }
247    }
248}
249
250impl Debug for BinaryView {
251    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
252        let mut s = f.debug_struct("BinaryView");
253        if self.is_inlined() {
254            s.field("inline", &"i".to_string());
255        } else {
256            s.field("ref", &"r".to_string());
257        }
258        s.finish()
259    }
260}
261
262vtable!(VarBinView);
263
264impl VTable for VarBinViewVTable {
265    type Array = VarBinViewArray;
266    type Encoding = VarBinViewEncoding;
267
268    type ArrayVTable = Self;
269    type CanonicalVTable = Self;
270    type OperationsVTable = Self;
271    type ValidityVTable = ValidityVTableFromValidityHelper;
272    type VisitorVTable = Self;
273    type ComputeVTable = NotSupported;
274    type EncodeVTable = NotSupported;
275    type SerdeVTable = Self;
276
277    fn id(_encoding: &Self::Encoding) -> EncodingId {
278        EncodingId::new_ref("vortex.varbinview")
279    }
280
281    fn encoding(_array: &Self::Array) -> EncodingRef {
282        EncodingRef::new_ref(VarBinViewEncoding.as_ref())
283    }
284}
285
286#[derive(Clone, Debug)]
287pub struct VarBinViewArray {
288    dtype: DType,
289    buffers: Vec<ByteBuffer>,
290    views: Buffer<BinaryView>,
291    validity: Validity,
292    stats_set: ArrayStats,
293}
294
/// Marker type for the varbinview encoding (identifier `"vortex.varbinview"`).
#[derive(Clone, Debug)]
pub struct VarBinViewEncoding;
297
298impl VarBinViewArray {
299    pub fn try_new(
300        views: Buffer<BinaryView>,
301        buffers: Vec<ByteBuffer>,
302        dtype: DType,
303        validity: Validity,
304    ) -> VortexResult<Self> {
305        if views.alignment() != Alignment::of::<BinaryView>() {
306            vortex_bail!("Views must be aligned to a 128 bits");
307        }
308
309        if !matches!(dtype, DType::Binary(_) | DType::Utf8(_)) {
310            vortex_bail!(MismatchedTypes: "utf8 or binary", dtype);
311        }
312
313        if dtype.is_nullable() == (validity == Validity::NonNullable) {
314            vortex_bail!("incorrect validity {:?}", validity);
315        }
316
317        Ok(Self {
318            dtype,
319            buffers,
320            views,
321            validity,
322            stats_set: Default::default(),
323        })
324    }
325
326    /// Number of raw string data buffers held by this array.
327    pub fn nbuffers(&self) -> usize {
328        self.buffers.len()
329    }
330
331    /// Access to the primitive views buffer.
332    ///
333    /// Variable-sized binary view buffer contain a "view" child array, with 16-byte entries that
334    /// contain either a pointer into one of the array's owned `buffer`s OR an inlined copy of
335    /// the string (if the string has 12 bytes or fewer).
336    #[inline]
337    pub fn views(&self) -> &Buffer<BinaryView> {
338        &self.views
339    }
340
341    /// Access value bytes at a given index
342    ///
343    /// Will return a bytebuffer pointing to the underlying data without performing a copy
344    #[inline]
345    pub fn bytes_at(&self, index: usize) -> ByteBuffer {
346        let views = self.views();
347        let view = &views[index];
348        // Expect this to be the common case: strings > 12 bytes.
349        if !view.is_inlined() {
350            let view_ref = view.as_view();
351            self.buffer(view_ref.buffer_index() as usize)
352                .slice(view_ref.to_range())
353        } else {
354            // Return access to the range of bytes around it.
355            views
356                .clone()
357                .into_byte_buffer()
358                .slice_ref(view.as_inlined().value())
359        }
360    }
361
362    /// Access one of the backing data buffers.
363    ///
364    /// # Panics
365    ///
366    /// This method panics if the provided index is out of bounds for the set of buffers provided
367    /// at construction time.
368    #[inline]
369    pub fn buffer(&self, idx: usize) -> &ByteBuffer {
370        if idx >= self.nbuffers() {
371            vortex_panic!(
372                "{idx} buffer index out of bounds, there are {} buffers",
373                self.nbuffers()
374            );
375        }
376        &self.buffers[idx]
377    }
378
379    /// Iterate over the underlying raw data buffers, not including the views buffer.
380    #[inline]
381    pub fn buffers(&self) -> &[ByteBuffer] {
382        &self.buffers
383    }
384
385    /// Accumulate an iterable set of values into our type here.
386    #[allow(clippy::same_name_method)]
387    pub fn from_iter<T: AsRef<[u8]>, I: IntoIterator<Item = Option<T>>>(
388        iter: I,
389        dtype: DType,
390    ) -> Self {
391        let iter = iter.into_iter();
392        let mut builder = VarBinViewBuilder::with_capacity(dtype, iter.size_hint().0);
393
394        for item in iter {
395            match item {
396                None => builder.append_null(),
397                Some(v) => builder.append_value(v),
398            }
399        }
400
401        builder.finish_into_varbinview()
402    }
403
404    pub fn from_iter_str<T: AsRef<str>, I: IntoIterator<Item = T>>(iter: I) -> Self {
405        let iter = iter.into_iter();
406        let mut builder = VarBinViewBuilder::with_capacity(
407            DType::Utf8(Nullability::NonNullable),
408            iter.size_hint().0,
409        );
410
411        for item in iter {
412            builder.append_value(item.as_ref());
413        }
414
415        builder.finish_into_varbinview()
416    }
417
418    pub fn from_iter_nullable_str<T: AsRef<str>, I: IntoIterator<Item = Option<T>>>(
419        iter: I,
420    ) -> Self {
421        let iter = iter.into_iter();
422        let mut builder = VarBinViewBuilder::with_capacity(
423            DType::Utf8(Nullability::Nullable),
424            iter.size_hint().0,
425        );
426
427        for item in iter {
428            match item {
429                None => builder.append_null(),
430                Some(v) => builder.append_value(v.as_ref()),
431            }
432        }
433
434        builder.finish_into_varbinview()
435    }
436
437    pub fn from_iter_bin<T: AsRef<[u8]>, I: IntoIterator<Item = T>>(iter: I) -> Self {
438        let iter = iter.into_iter();
439        let mut builder = VarBinViewBuilder::with_capacity(
440            DType::Binary(Nullability::NonNullable),
441            iter.size_hint().0,
442        );
443
444        for item in iter {
445            builder.append_value(item.as_ref());
446        }
447
448        builder.finish_into_varbinview()
449    }
450
451    pub fn from_iter_nullable_bin<T: AsRef<[u8]>, I: IntoIterator<Item = Option<T>>>(
452        iter: I,
453    ) -> Self {
454        let iter = iter.into_iter();
455        let mut builder = VarBinViewBuilder::with_capacity(
456            DType::Binary(Nullability::Nullable),
457            iter.size_hint().0,
458        );
459
460        for item in iter {
461            match item {
462                None => builder.append_null(),
463                Some(v) => builder.append_value(v.as_ref()),
464            }
465        }
466
467        builder.finish_into_varbinview()
468    }
469}
470
471impl ArrayVTable<VarBinViewVTable> for VarBinViewVTable {
472    fn len(array: &VarBinViewArray) -> usize {
473        array.views.len()
474    }
475
476    fn dtype(array: &VarBinViewArray) -> &DType {
477        &array.dtype
478    }
479
480    fn stats(array: &VarBinViewArray) -> StatsSetRef<'_> {
481        array.stats_set.to_ref(array.as_ref())
482    }
483}
484
485impl ValidityHelper for VarBinViewArray {
486    fn validity(&self) -> &Validity {
487        &self.validity
488    }
489}
490
491impl CanonicalVTable<VarBinViewVTable> for VarBinViewVTable {
492    fn canonicalize(array: &VarBinViewArray) -> VortexResult<Canonical> {
493        Ok(Canonical::VarBinView(array.clone()))
494    }
495
496    fn append_to_builder(
497        array: &VarBinViewArray,
498        builder: &mut dyn ArrayBuilder,
499    ) -> VortexResult<()> {
500        builder.extend_from_array(array.as_ref())
501    }
502}
503
504impl<'a> FromIterator<Option<&'a [u8]>> for VarBinViewArray {
505    fn from_iter<T: IntoIterator<Item = Option<&'a [u8]>>>(iter: T) -> Self {
506        Self::from_iter_nullable_bin(iter)
507    }
508}
509
510impl FromIterator<Option<Vec<u8>>> for VarBinViewArray {
511    fn from_iter<T: IntoIterator<Item = Option<Vec<u8>>>>(iter: T) -> Self {
512        Self::from_iter_nullable_bin(iter)
513    }
514}
515
516impl FromIterator<Option<String>> for VarBinViewArray {
517    fn from_iter<T: IntoIterator<Item = Option<String>>>(iter: T) -> Self {
518        Self::from_iter_nullable_str(iter)
519    }
520}
521
522impl<'a> FromIterator<Option<&'a str>> for VarBinViewArray {
523    fn from_iter<T: IntoIterator<Item = Option<&'a str>>>(iter: T) -> Self {
524        Self::from_iter_nullable_str(iter)
525    }
526}
527
#[cfg(test)]
mod test {
    use vortex_scalar::Scalar;

    use crate::arrays::varbinview::{BinaryView, VarBinViewArray};
    use crate::{Array, Canonical, IntoArray};

    /// 11 bytes: short enough to be stored inline in the view.
    const SHORT: &str = "hello world";
    /// 33 bytes: too long to inline, so it is stored in a data buffer.
    const LONG: &str = "hello world this is a long string";

    /// Both inlined and referenced values are readable through `scalar_at`.
    #[test]
    pub fn varbin_view() {
        let array = VarBinViewArray::from_iter_str([SHORT, LONG]);

        assert_eq!(array.len(), 2);
        assert_eq!(array.scalar_at(0).unwrap(), Scalar::from(SHORT));
        assert_eq!(array.scalar_at(1).unwrap(), Scalar::from(LONG));
    }

    /// Slicing off the first element leaves the long value at index 0.
    #[test]
    pub fn slice_array() {
        let full = VarBinViewArray::from_iter_str([SHORT, LONG]);
        let sliced = full.slice(1, 2).unwrap();

        assert_eq!(sliced.scalar_at(0).unwrap(), Scalar::from(LONG));
    }

    /// Canonicalising yields the `VarBinView` variant with the same values.
    #[test]
    pub fn flatten_array() {
        let array = VarBinViewArray::from_iter_str(["string1", "string2"]);

        let canonical = array.to_canonical().unwrap();
        assert!(matches!(canonical, Canonical::VarBinView(_)));

        let round_tripped = canonical.into_varbinview().unwrap().into_array();
        assert_eq!(round_tripped.scalar_at(0).unwrap(), Scalar::from("string1"));
        assert_eq!(round_tripped.scalar_at(1).unwrap(), Scalar::from("string2"));
    }

    /// A view is exactly 16 bytes with 16-byte alignment.
    #[test]
    pub fn binary_view_size_and_alignment() {
        assert_eq!(std::mem::size_of::<BinaryView>(), 16);
        assert_eq!(std::mem::align_of::<BinaryView>(), 16);
    }
}