1use std::fmt::{Debug, Formatter};
5use std::hash::{Hash, Hasher};
6use std::ops::Range;
7use std::sync::Arc;
8
9use static_assertions::{assert_eq_align, assert_eq_size};
10use vortex_buffer::{Buffer, ByteBuffer};
11use vortex_dtype::{DType, Nullability};
12use vortex_error::{
13    VortexExpect, VortexResult, VortexUnwrap, vortex_bail, vortex_ensure, vortex_err, vortex_panic,
14};
15
16use crate::builders::{ArrayBuilder, VarBinViewBuilder};
17use crate::stats::{ArrayStats, StatsSetRef};
18use crate::validity::Validity;
19use crate::vtable::{
20    ArrayVTable, CanonicalVTable, NotSupported, VTable, ValidityHelper,
21    ValidityVTableFromValidityHelper,
22};
23use crate::{Canonical, EncodingId, EncodingRef, vtable};
24
25mod accessor;
26mod compact;
27mod compute;
28mod ops;
29mod serde;
30
31#[derive(Clone, Copy, Debug, PartialEq, Eq)]
32#[repr(C, align(8))]
33pub struct Inlined {
34    size: u32,
35    data: [u8; BinaryView::MAX_INLINED_SIZE],
36}
37
38impl Inlined {
39    fn new<const N: usize>(value: &[u8]) -> Self {
40        let mut inlined = Self {
41            size: N.try_into().vortex_unwrap(),
42            data: [0u8; BinaryView::MAX_INLINED_SIZE],
43        };
44        inlined.data[..N].copy_from_slice(&value[..N]);
45        inlined
46    }
47
48    #[inline]
49    pub fn value(&self) -> &[u8] {
50        &self.data[0..(self.size as usize)]
51    }
52}
53
/// A view that points at bytes stored in a separate data buffer.
///
/// The layout is `repr(C)`: four consecutive 4-byte fields, 16 bytes in total.
#[derive(Clone, Copy, Debug)]
#[repr(C, align(8))]
pub struct Ref {
    /// Length of the referenced value in bytes.
    size: u32,
    /// Copy of the first four bytes of the referenced value.
    prefix: [u8; 4],
    /// Index of the data buffer that holds the value.
    buffer_index: u32,
    /// Byte offset of the value inside that buffer.
    offset: u32,
}

impl Ref {
    /// Creates a reference view from its four components.
    pub fn new(size: u32, prefix: [u8; 4], buffer_index: u32, offset: u32) -> Self {
        Self {
            size,
            prefix,
            buffer_index,
            offset,
        }
    }

    /// Returns the index of the data buffer that holds the value.
    #[inline]
    pub fn buffer_index(&self) -> u32 {
        self.buffer_index
    }

    /// Returns the byte offset of the value inside its data buffer.
    #[inline]
    pub fn offset(&self) -> u32 {
        self.offset
    }

    /// Returns the stored four-byte prefix of the value.
    #[inline]
    pub fn prefix(&self) -> &[u8; 4] {
        &self.prefix
    }

    /// Returns the byte range `offset..offset + size` occupied by the value.
    ///
    /// The end is computed in `usize` rather than `u32`, so an offset and size
    /// whose sum exceeds `u32::MAX` no longer overflows (which panicked in
    /// debug builds and wrapped in release builds). The addition saturates at
    /// `usize::MAX`, which only matters on targets where `usize` is 32 bits.
    #[inline]
    pub fn to_range(&self) -> Range<usize> {
        let start = self.offset as usize;
        start..start.saturating_add(self.size as usize)
    }
}
93
/// A 16-byte view over a variable-length binary value.
///
/// The three fields are alternative interpretations of the same 16 bytes.
/// Both [`Inlined`] and [`Ref`] are `repr(C)` and begin with a `u32` size, so
/// the length can be read through either interpretation.
#[derive(Clone, Copy)]
#[repr(C, align(16))]
pub union BinaryView {
    // Raw bytes of the view; used when converting to and from `u128`.
    le_bytes: [u8; 16],

    // Interpretation used for values of at most `MAX_INLINED_SIZE` bytes.
    inlined: Inlined,

    // Interpretation used for longer values, which live in a separate buffer.
    _ref: Ref,
}

// Compile-time layout checks: every interpretation is exactly 16 bytes wide and
// the union has the same alignment as `u128`.
assert_eq_size!(BinaryView, [u8; 16]);
assert_eq_size!(Inlined, [u8; 16]);
assert_eq_size!(Ref, [u8; 16]);
assert_eq_align!(BinaryView, u128);
112
113impl Hash for BinaryView {
114    fn hash<H: Hasher>(&self, state: &mut H) {
115        unsafe { std::mem::transmute::<&BinaryView, &[u8; 16]>(self) }.hash(state);
116    }
117}
118
119impl BinaryView {
120    pub const MAX_INLINED_SIZE: usize = 12;
121
122    #[inline(never)]
130    pub fn make_view(value: &[u8], block: u32, offset: u32) -> Self {
131        match value.len() {
132            0 => Self {
133                inlined: Inlined::new::<0>(value),
134            },
135            1 => Self {
136                inlined: Inlined::new::<1>(value),
137            },
138            2 => Self {
139                inlined: Inlined::new::<2>(value),
140            },
141            3 => Self {
142                inlined: Inlined::new::<3>(value),
143            },
144            4 => Self {
145                inlined: Inlined::new::<4>(value),
146            },
147            5 => Self {
148                inlined: Inlined::new::<5>(value),
149            },
150            6 => Self {
151                inlined: Inlined::new::<6>(value),
152            },
153            7 => Self {
154                inlined: Inlined::new::<7>(value),
155            },
156            8 => Self {
157                inlined: Inlined::new::<8>(value),
158            },
159            9 => Self {
160                inlined: Inlined::new::<9>(value),
161            },
162            10 => Self {
163                inlined: Inlined::new::<10>(value),
164            },
165            11 => Self {
166                inlined: Inlined::new::<11>(value),
167            },
168            12 => Self {
169                inlined: Inlined::new::<12>(value),
170            },
171            _ => Self {
172                _ref: Ref::new(
173                    u32::try_from(value.len()).vortex_unwrap(),
174                    value[0..4].try_into().vortex_unwrap(),
175                    block,
176                    offset,
177                ),
178            },
179        }
180    }
181
182    #[inline]
184    pub fn empty_view() -> Self {
185        Self::new_inlined(&[])
186    }
187
188    #[inline]
190    pub fn new_inlined(value: &[u8]) -> Self {
191        assert!(
192            value.len() <= Self::MAX_INLINED_SIZE,
193            "expected inlined value to be <= 12 bytes, was {}",
194            value.len()
195        );
196
197        Self::make_view(value, 0, 0)
198    }
199
200    #[inline]
201    pub fn len(&self) -> u32 {
202        unsafe { self.inlined.size }
203    }
204
205    #[inline]
206    pub fn is_empty(&self) -> bool {
207        self.len() > 0
208    }
209
210    #[inline]
211    #[allow(clippy::cast_possible_truncation)]
212    pub fn is_inlined(&self) -> bool {
213        self.len() <= (Self::MAX_INLINED_SIZE as u32)
214    }
215
216    pub fn as_inlined(&self) -> &Inlined {
217        unsafe { &self.inlined }
218    }
219
220    pub fn as_view(&self) -> &Ref {
221        unsafe { &self._ref }
222    }
223
224    pub fn as_u128(&self) -> u128 {
225        unsafe { u128::from_le_bytes(self.le_bytes) }
227    }
228
229    #[inline(always)]
231    pub fn with_buffer_idx(self, buffer_idx: u32) -> Self {
232        if self.is_inlined() {
233            self
234        } else {
235            let view_ref = self.as_view();
237            Self {
238                _ref: Ref::new(
239                    self.len(),
240                    *view_ref.prefix(),
241                    buffer_idx,
242                    view_ref.offset(),
243                ),
244            }
245        }
246    }
247
248    #[inline(always)]
251    pub fn offset_view(self, offset: u32) -> Self {
252        if self.is_inlined() {
253            self
254        } else {
255            let view_ref = self.as_view();
257            Self {
258                _ref: Ref::new(
259                    self.len(),
260                    *view_ref.prefix(),
261                    offset + view_ref.buffer_index(),
262                    view_ref.offset(),
263                ),
264            }
265        }
266    }
267}
268
269impl From<u128> for BinaryView {
270    fn from(value: u128) -> Self {
271        BinaryView {
272            le_bytes: value.to_le_bytes(),
273        }
274    }
275}
276
277impl Debug for BinaryView {
278    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
279        let mut s = f.debug_struct("BinaryView");
280        if self.is_inlined() {
281            s.field("inline", &"i".to_string());
282        } else {
283            s.field("ref", &"r".to_string());
284        }
285        s.finish()
286    }
287}
288
// NOTE(review): the macro body is not visible here; presumably it declares the
// `VarBinViewVTable` type used below together with related glue. Confirm
// against the `vtable!` definition.
vtable!(VarBinView);

/// Wires the VarBinView array and encoding types into the vtable machinery.
impl VTable for VarBinViewVTable {
    type Array = VarBinViewArray;
    type Encoding = VarBinViewEncoding;

    // Capabilities implemented by this vtable type itself.
    type ArrayVTable = Self;
    type CanonicalVTable = Self;
    type OperationsVTable = Self;
    // Validity is taken from the array's `ValidityHelper` implementation.
    type ValidityVTable = ValidityVTableFromValidityHelper;
    type VisitorVTable = Self;
    // Capabilities this encoding does not provide.
    type ComputeVTable = NotSupported;
    type EncodeVTable = NotSupported;
    type PipelineVTable = NotSupported;
    type SerdeVTable = Self;

    /// Returns the identifier of this encoding, `"vortex.varbinview"`.
    fn id(_encoding: &Self::Encoding) -> EncodingId {
        EncodingId::new_ref("vortex.varbinview")
    }

    /// Returns a reference to the encoding; the array argument is not consulted.
    fn encoding(_array: &Self::Array) -> EncodingRef {
        EncodingRef::new_ref(VarBinViewEncoding.as_ref())
    }
}
313
/// An array of variable-length binary or UTF-8 values, each described by a
/// 16-byte [`BinaryView`].
#[derive(Clone, Debug)]
pub struct VarBinViewArray {
    /// Logical type of the array; validation accepts only `Utf8` and `Binary`.
    dtype: DType,
    /// Data buffers that non-inlined views point into by buffer index.
    buffers: Arc<[ByteBuffer]>,
    /// One view per element; the array length is the number of views.
    views: Buffer<BinaryView>,
    /// Validity (nullness) information for the elements.
    validity: Validity,
    /// Statistics holder for the array; default-initialized by the constructors.
    stats_set: ArrayStats,
}
381
/// Marker type naming the VarBinView encoding; it carries no data.
#[derive(Clone, Debug)]
pub struct VarBinViewEncoding;
384
385impl VarBinViewArray {
386    fn validate(
387        views: &Buffer<BinaryView>,
388        buffers: &Arc<[ByteBuffer]>,
389        dtype: &DType,
390        validity: &Validity,
391    ) -> VortexResult<()> {
392        vortex_ensure!(
393            validity.nullability() == dtype.nullability(),
394            "validity {:?} incompatible with nullability {:?}",
395            validity,
396            dtype.nullability()
397        );
398
399        match dtype {
400            DType::Utf8(_) => Self::validate_views(views, buffers, validity, |string| {
401                simdutf8::basic::from_utf8(string).is_ok()
402            })?,
403            DType::Binary(_) => Self::validate_views(views, buffers, validity, |_| true)?,
404            _ => vortex_bail!("invalid DType {dtype} for `VarBinViewArray`"),
405        }
406
407        Ok(())
408    }
409
410    fn validate_views<F>(
411        views: &Buffer<BinaryView>,
412        buffers: &Arc<[ByteBuffer]>,
413        validity: &Validity,
414        validator: F,
415    ) -> VortexResult<()>
416    where
417        F: Fn(&[u8]) -> bool,
418    {
419        for (idx, &view) in views.iter().enumerate() {
420            if validity.is_null(idx) {
421                continue;
422            }
423
424            if view.is_inlined() {
425                let bytes = &unsafe { view.inlined }.data[..view.len() as usize];
427                vortex_ensure!(
428                    validator(bytes),
429                    "view at index {idx}: inlined bytes failed utf-8 validation"
430                );
431            } else {
432                let view = view.as_view();
434                let buf_index = view.buffer_index as usize;
435                let start_offset = view.offset as usize;
436                let end_offset = start_offset.saturating_add(view.size as usize);
437
438                let buf = buffers.get(buf_index).ok_or_else(||
439                    vortex_err!("view at index {idx} references invalid buffer: {buf_index} out of bounds for VarBinViewArray with {} buffers",
440                        buffers.len()))?;
441
442                vortex_ensure!(
443                    start_offset < buf.len(),
444                    "start offset {start_offset} out of bounds for buffer {buf_index} with size {}",
445                    buf.len(),
446                );
447
448                vortex_ensure!(
449                    end_offset <= buf.len(),
450                    "end offset {end_offset} out of bounds for buffer {buf_index} with size {}",
451                    buf.len(),
452                );
453
454                let bytes = &buf[start_offset..end_offset];
456                vortex_ensure!(
457                    view.prefix == bytes[..4],
458                    "VarBinView prefix does not match full string"
459                );
460
461                vortex_ensure!(
463                    validator(bytes),
464                    "view at index {idx}: outlined bytes fails utf-8 validation"
465                );
466            }
467        }
468
469        Ok(())
470    }
471}
472
473impl VarBinViewArray {
474    pub unsafe fn new_unchecked(
482        views: Buffer<BinaryView>,
483        buffers: Arc<[ByteBuffer]>,
484        dtype: DType,
485        validity: Validity,
486    ) -> Self {
487        Self {
488            dtype,
489            buffers,
490            views,
491            validity,
492            stats_set: Default::default(),
493        }
494    }
495
496    pub fn new(
497        views: Buffer<BinaryView>,
498        buffers: Arc<[ByteBuffer]>,
499        dtype: DType,
500        validity: Validity,
501    ) -> Self {
502        Self::try_new(views, buffers, dtype, validity).vortex_expect("VarBinViewArray new")
503    }
504
505    pub fn try_new(
506        views: Buffer<BinaryView>,
507        buffers: Arc<[ByteBuffer]>,
508        dtype: DType,
509        validity: Validity,
510    ) -> VortexResult<Self> {
511        Self::validate(&views, &buffers, &dtype, &validity)?;
512
513        Ok(Self {
514            dtype,
515            buffers,
516            views,
517            validity,
518            stats_set: Default::default(),
519        })
520    }
521
522    pub fn nbuffers(&self) -> usize {
524        self.buffers.len()
525    }
526
527    #[inline]
533    pub fn views(&self) -> &Buffer<BinaryView> {
534        &self.views
535    }
536
537    #[inline]
541    pub fn bytes_at(&self, index: usize) -> ByteBuffer {
542        let views = self.views();
543        let view = &views[index];
544        if !view.is_inlined() {
546            let view_ref = view.as_view();
547            self.buffer(view_ref.buffer_index() as usize)
548                .slice(view_ref.to_range())
549        } else {
550            views
552                .clone()
553                .into_byte_buffer()
554                .slice_ref(view.as_inlined().value())
555        }
556    }
557
558    #[inline]
565    pub fn buffer(&self, idx: usize) -> &ByteBuffer {
566        if idx >= self.nbuffers() {
567            vortex_panic!(
568                "{idx} buffer index out of bounds, there are {} buffers",
569                self.nbuffers()
570            );
571        }
572        &self.buffers[idx]
573    }
574
575    #[inline]
577    pub fn buffers(&self) -> &Arc<[ByteBuffer]> {
578        &self.buffers
579    }
580
581    #[allow(clippy::same_name_method)]
583    pub fn from_iter<T: AsRef<[u8]>, I: IntoIterator<Item = Option<T>>>(
584        iter: I,
585        dtype: DType,
586    ) -> Self {
587        let iter = iter.into_iter();
588        let mut builder = VarBinViewBuilder::with_capacity(dtype, iter.size_hint().0);
589
590        for item in iter {
591            match item {
592                None => builder.append_null(),
593                Some(v) => builder.append_value(v),
594            }
595        }
596
597        builder.finish_into_varbinview()
598    }
599
600    pub fn from_iter_str<T: AsRef<str>, I: IntoIterator<Item = T>>(iter: I) -> Self {
601        let iter = iter.into_iter();
602        let mut builder = VarBinViewBuilder::with_capacity(
603            DType::Utf8(Nullability::NonNullable),
604            iter.size_hint().0,
605        );
606
607        for item in iter {
608            builder.append_value(item.as_ref());
609        }
610
611        builder.finish_into_varbinview()
612    }
613
614    pub fn from_iter_nullable_str<T: AsRef<str>, I: IntoIterator<Item = Option<T>>>(
615        iter: I,
616    ) -> Self {
617        let iter = iter.into_iter();
618        let mut builder = VarBinViewBuilder::with_capacity(
619            DType::Utf8(Nullability::Nullable),
620            iter.size_hint().0,
621        );
622
623        for item in iter {
624            match item {
625                None => builder.append_null(),
626                Some(v) => builder.append_value(v.as_ref()),
627            }
628        }
629
630        builder.finish_into_varbinview()
631    }
632
633    pub fn from_iter_bin<T: AsRef<[u8]>, I: IntoIterator<Item = T>>(iter: I) -> Self {
634        let iter = iter.into_iter();
635        let mut builder = VarBinViewBuilder::with_capacity(
636            DType::Binary(Nullability::NonNullable),
637            iter.size_hint().0,
638        );
639
640        for item in iter {
641            builder.append_value(item.as_ref());
642        }
643
644        builder.finish_into_varbinview()
645    }
646
647    pub fn from_iter_nullable_bin<T: AsRef<[u8]>, I: IntoIterator<Item = Option<T>>>(
648        iter: I,
649    ) -> Self {
650        let iter = iter.into_iter();
651        let mut builder = VarBinViewBuilder::with_capacity(
652            DType::Binary(Nullability::Nullable),
653            iter.size_hint().0,
654        );
655
656        for item in iter {
657            match item {
658                None => builder.append_null(),
659                Some(v) => builder.append_value(v.as_ref()),
660            }
661        }
662
663        builder.finish_into_varbinview()
664    }
665}
666
667impl ArrayVTable<VarBinViewVTable> for VarBinViewVTable {
668    fn len(array: &VarBinViewArray) -> usize {
669        array.views.len()
670    }
671
672    fn dtype(array: &VarBinViewArray) -> &DType {
673        &array.dtype
674    }
675
676    fn stats(array: &VarBinViewArray) -> StatsSetRef<'_> {
677        array.stats_set.to_ref(array.as_ref())
678    }
679}
680
681impl ValidityHelper for VarBinViewArray {
682    fn validity(&self) -> &Validity {
683        &self.validity
684    }
685}
686
687impl CanonicalVTable<VarBinViewVTable> for VarBinViewVTable {
688    fn canonicalize(array: &VarBinViewArray) -> Canonical {
689        Canonical::VarBinView(array.clone())
690    }
691
692    fn append_to_builder(array: &VarBinViewArray, builder: &mut dyn ArrayBuilder) {
693        builder.extend_from_array(array.as_ref())
694    }
695}
696
697impl<'a> FromIterator<Option<&'a [u8]>> for VarBinViewArray {
698    fn from_iter<T: IntoIterator<Item = Option<&'a [u8]>>>(iter: T) -> Self {
699        Self::from_iter_nullable_bin(iter)
700    }
701}
702
703impl FromIterator<Option<Vec<u8>>> for VarBinViewArray {
704    fn from_iter<T: IntoIterator<Item = Option<Vec<u8>>>>(iter: T) -> Self {
705        Self::from_iter_nullable_bin(iter)
706    }
707}
708
709impl FromIterator<Option<String>> for VarBinViewArray {
710    fn from_iter<T: IntoIterator<Item = Option<String>>>(iter: T) -> Self {
711        Self::from_iter_nullable_str(iter)
712    }
713}
714
715impl<'a> FromIterator<Option<&'a str>> for VarBinViewArray {
716    fn from_iter<T: IntoIterator<Item = Option<&'a str>>>(iter: T) -> Self {
717        Self::from_iter_nullable_str(iter)
718    }
719}
720
#[cfg(test)]
mod test {
    use vortex_scalar::Scalar;

    use crate::arrays::varbinview::{BinaryView, VarBinViewArray};
    use crate::{Array, ToCanonical};

    /// An inlined and a referenced string can both be read back.
    #[test]
    pub fn varbin_view() {
        let values = ["hello world", "hello world this is a long string"];
        let binary_arr = VarBinViewArray::from_iter_str(values);

        assert_eq!(binary_arr.len(), 2);
        for (index, expected) in values.into_iter().enumerate() {
            assert_eq!(binary_arr.scalar_at(index), Scalar::from(expected));
        }
    }

    /// Slicing keeps the remaining element readable at its new position.
    #[test]
    pub fn slice_array() {
        let full =
            VarBinViewArray::from_iter_str(["hello world", "hello world this is a long string"]);
        let binary_arr = full.slice(1..2);

        let first = binary_arr.scalar_at(0);
        assert_eq!(first, Scalar::from("hello world this is a long string"));
    }

    /// Converting to the canonical VarBinView form preserves the values.
    #[test]
    pub fn flatten_array() {
        let binary_arr = VarBinViewArray::from_iter_str(["string1", "string2"]);
        let var_bin = binary_arr.to_varbinview();

        let first = var_bin.scalar_at(0);
        let second = var_bin.scalar_at(1);
        assert_eq!(first, Scalar::from("string1"));
        assert_eq!(second, Scalar::from("string2"));
    }

    /// A view is 16 bytes wide and 16-byte aligned.
    #[test]
    pub fn binary_view_size_and_alignment() {
        let size = std::mem::size_of::<BinaryView>();
        let alignment = std::mem::align_of::<BinaryView>();
        assert_eq!(size, 16);
        assert_eq!(alignment, 16);
    }
}