vortex_vector/binaryview/
view.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4//! The 16-byte view struct stored in variable-length binary vectors.
5
6use std::fmt;
7use std::hash::Hash;
8use std::hash::Hasher;
9use std::ops::Range;
10
11use static_assertions::assert_eq_align;
12use static_assertions::assert_eq_size;
13use vortex_buffer::ByteBuffer;
14use vortex_error::VortexExpect;
15use vortex_error::VortexResult;
16use vortex_error::vortex_ensure;
17use vortex_error::vortex_err;
18
19/// A view over a variable-length binary value.
20///
21/// Either an inlined representation (for values <= 12 bytes) or a reference
22/// to an external buffer (for values > 12 bytes).
23#[derive(Clone, Copy)]
24#[repr(C, align(16))]
25pub union BinaryView {
26    /// Numeric representation. This is logically `u128`, but we split it into the high and low
27    /// bits to preserve the alignment.
28    pub(crate) le_bytes: [u8; 16],
29
30    /// Inlined representation: strings <= 12 bytes
31    pub(crate) inlined: Inlined,
32
33    /// Reference type: strings > 12 bytes.
34    pub(crate) _ref: Ref,
35}
36
37assert_eq_align!(BinaryView, u128);
38assert_eq_size!(BinaryView, [u8; 16]);
39assert_eq_size!(Inlined, [u8; 16]);
40assert_eq_size!(Ref, [u8; 16]);
41
42/// Variant of a [`BinaryView`] that holds an inlined value.
43#[derive(Clone, Copy, Debug, PartialEq, Eq)]
44#[repr(C, align(8))]
45pub struct Inlined {
46    /// The size of the full value.
47    pub size: u32,
48    /// The full inlined value.
49    pub data: [u8; BinaryView::MAX_INLINED_SIZE],
50}
51
52impl Inlined {
53    /// Creates a new inlined representation from the provided value of constant size.
54    #[inline]
55    fn new<const N: usize>(value: &[u8]) -> Self {
56        debug_assert_eq!(value.len(), N);
57        let mut inlined = Self {
58            size: N.try_into().vortex_expect("inlined size must fit in u32"),
59            data: [0u8; BinaryView::MAX_INLINED_SIZE],
60        };
61        inlined.data[..N].copy_from_slice(&value[..N]);
62        inlined
63    }
64
65    /// Returns the full inlined value.
66    #[inline]
67    pub fn value(&self) -> &[u8] {
68        &self.data[0..(self.size as usize)]
69    }
70}
71
/// Variant of a [`BinaryView`] that holds a reference to an external buffer.
#[derive(Clone, Copy, Debug)]
#[repr(C, align(8))]
pub struct Ref {
    /// The size of the full value, in bytes.
    pub size: u32,
    /// The prefix bytes of the value (first 4 bytes).
    pub prefix: [u8; 4],
    /// The index of the buffer where the full value is stored.
    pub buffer_index: u32,
    /// The offset within the buffer where the full value starts.
    pub offset: u32,
}

impl Ref {
    /// Returns the byte range within the buffer where the full value is stored.
    ///
    /// The range is `offset..offset + size`. Both fields are widened to `usize` *before*
    /// the addition, so an `offset + size` that exceeds `u32::MAX` neither panics (debug
    /// builds) nor wraps around (release builds). The addition saturates at `usize::MAX`,
    /// so an unrepresentable end shows up as an out-of-bounds range at the use site.
    #[inline]
    pub fn as_range(&self) -> Range<usize> {
        let start = self.offset as usize;
        let end = start.saturating_add(self.size as usize);
        start..end
    }

    /// Returns a copy of this reference that points at `buffer_index` / `offset`.
    ///
    /// `size` and `prefix` are carried over unchanged; `self` is not modified.
    #[inline]
    pub fn with_buffer_and_offset(&self, buffer_index: u32, offset: u32) -> Ref {
        Self {
            size: self.size,
            prefix: self.prefix,
            buffer_index,
            offset,
        }
    }
}
104
105impl PartialEq for BinaryView {
106    fn eq(&self, other: &Self) -> bool {
107        let a = unsafe { std::mem::transmute::<&BinaryView, &u128>(self) };
108        let b = unsafe { std::mem::transmute::<&BinaryView, &u128>(other) };
109        a == b
110    }
111}
112impl Eq for BinaryView {}
113
114impl Hash for BinaryView {
115    fn hash<H: Hasher>(&self, state: &mut H) {
116        unsafe { std::mem::transmute::<&BinaryView, &u128>(self) }.hash(state);
117    }
118}
119
120impl Default for BinaryView {
121    fn default() -> Self {
122        Self::make_view(&[], 0, 0)
123    }
124}
125
126impl BinaryView {
127    /// Maximum size of an inlined binary value.
128    pub const MAX_INLINED_SIZE: usize = 12;
129
130    /// Create a view from a value, block and offset
131    ///
132    /// Depending on the length of the provided value either a new inlined
133    /// or a reference view will be constructed.
134    ///
135    /// Adapted from arrow-rs <https://github.com/apache/arrow-rs/blob/f4fde769ab6e1a9b75f890b7f8b47bc22800830b/arrow-array/src/builder/generic_bytes_view_builder.rs#L524>
136    /// Explicitly enumerating inlined view produces code that avoids calling generic `ptr::copy_non_interleave` that's slower than explicit stores
137    #[inline(never)]
138    pub fn make_view(value: &[u8], block: u32, offset: u32) -> Self {
139        match value.len() {
140            0 => Self {
141                inlined: Inlined::new::<0>(value),
142            },
143            1 => Self {
144                inlined: Inlined::new::<1>(value),
145            },
146            2 => Self {
147                inlined: Inlined::new::<2>(value),
148            },
149            3 => Self {
150                inlined: Inlined::new::<3>(value),
151            },
152            4 => Self {
153                inlined: Inlined::new::<4>(value),
154            },
155            5 => Self {
156                inlined: Inlined::new::<5>(value),
157            },
158            6 => Self {
159                inlined: Inlined::new::<6>(value),
160            },
161            7 => Self {
162                inlined: Inlined::new::<7>(value),
163            },
164            8 => Self {
165                inlined: Inlined::new::<8>(value),
166            },
167            9 => Self {
168                inlined: Inlined::new::<9>(value),
169            },
170            10 => Self {
171                inlined: Inlined::new::<10>(value),
172            },
173            11 => Self {
174                inlined: Inlined::new::<11>(value),
175            },
176            12 => Self {
177                inlined: Inlined::new::<12>(value),
178            },
179            _ => Self {
180                _ref: Ref {
181                    size: u32::try_from(value.len()).vortex_expect("value length must fit in u32"),
182                    prefix: value[0..4]
183                        .try_into()
184                        .vortex_expect("prefix must be exactly 4 bytes"),
185                    buffer_index: block,
186                    offset,
187                },
188            },
189        }
190    }
191
192    /// Create a new empty view
193    #[inline]
194    pub fn empty_view() -> Self {
195        Self { le_bytes: [0; 16] }
196    }
197
198    /// Create a new inlined binary view
199    ///
200    /// # Panics
201    ///
202    /// Panics if the provided string is too long to inline.
203    #[inline]
204    pub fn new_inlined(value: &[u8]) -> Self {
205        assert!(
206            value.len() <= Self::MAX_INLINED_SIZE,
207            "expected inlined value to be <= 12 bytes, was {}",
208            value.len()
209        );
210
211        Self::make_view(value, 0, 0)
212    }
213
214    /// Returns the length of the binary value.
215    #[inline]
216    pub fn len(&self) -> u32 {
217        unsafe { self.inlined.size }
218    }
219
220    /// Returns true if the binary value is empty.
221    #[inline]
222    pub fn is_empty(&self) -> bool {
223        self.len() == 0
224    }
225
226    /// Returns true if the binary value is inlined.
227    #[inline]
228    #[expect(
229        clippy::cast_possible_truncation,
230        reason = "MAX_INLINED_SIZE is a small constant"
231    )]
232    pub fn is_inlined(&self) -> bool {
233        self.len() <= (Self::MAX_INLINED_SIZE as u32)
234    }
235
236    /// Returns the inlined representation of the binary value.
237    pub fn as_inlined(&self) -> &Inlined {
238        debug_assert!(self.is_inlined());
239        unsafe { &self.inlined }
240    }
241
242    /// Returns the reference representation of the binary value.
243    pub fn as_view(&self) -> &Ref {
244        debug_assert!(!self.is_inlined());
245        unsafe { &self._ref }
246    }
247
248    /// Returns a mutable reference to the reference representation of the binary value.
249    pub fn as_view_mut(&mut self) -> &mut Ref {
250        unsafe { &mut self._ref }
251    }
252
253    /// Returns the binary view as u128 representation.
254    pub fn as_u128(&self) -> u128 {
255        // SAFETY: binary view always safe to read as u128 LE bytes
256        unsafe { u128::from_le_bytes(self.le_bytes) }
257    }
258}
259
260impl From<u128> for BinaryView {
261    fn from(value: u128) -> Self {
262        BinaryView {
263            le_bytes: value.to_le_bytes(),
264        }
265    }
266}
267
268impl From<Ref> for BinaryView {
269    fn from(value: Ref) -> Self {
270        BinaryView { _ref: value }
271    }
272}
273
274impl fmt::Debug for BinaryView {
275    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
276        let mut s = f.debug_struct("BinaryView");
277        if self.is_inlined() {
278            s.field("inline", &self.as_inlined());
279        } else {
280            s.field("ref", &self.as_view());
281        }
282        s.finish()
283    }
284}
285
286/// Validate that all views either
287///
288/// 1. Contain valid inline data that conforms to type constraints as defined by the `validator`
289/// 2. Points at a valid range of owned buffer memory, and the bytes stored there conform to
290///    the type constraints as defined by the `validator`.
291pub(super) fn validate_views<ValidateFn, IsValidFn>(
292    views: &[BinaryView],
293    buffers: impl AsRef<[ByteBuffer]>,
294    validity: IsValidFn,
295    validator: ValidateFn,
296) -> VortexResult<()>
297where
298    IsValidFn: Fn(usize) -> bool,
299    ValidateFn: Fn(&[u8]) -> bool,
300{
301    let buffers = buffers.as_ref();
302    for (idx, &view) in views.iter().enumerate() {
303        if !validity(idx) {
304            continue;
305        }
306
307        if view.is_inlined() {
308            // Validate the inline bytestring
309            let bytes = &unsafe { view.inlined }.data[..view.len() as usize];
310            vortex_ensure!(
311                validator(bytes),
312                "view at index {idx}: inlined bytes failed utf-8 validation"
313            );
314        } else {
315            // Validate the view pointer
316            let view = view.as_view();
317            let buf_index = view.buffer_index as usize;
318            let start_offset = view.offset as usize;
319            let end_offset = start_offset.saturating_add(view.size as usize);
320
321            let buf = buffers.get(buf_index).ok_or_else(||
322                vortex_err!("view at index {idx} references invalid buffer: {buf_index} out of bounds for BinaryViewVector with {} buffers",
323                        buffers.len()))?;
324
325            vortex_ensure!(
326                start_offset < buf.len(),
327                "start offset {start_offset} out of bounds for buffer {buf_index} with size {}",
328                buf.len(),
329            );
330
331            vortex_ensure!(
332                end_offset <= buf.len(),
333                "end offset {end_offset} out of bounds for buffer {buf_index} with size {}",
334                buf.len(),
335            );
336
337            // Make sure the prefix data matches the buffer data.
338            let bytes = &buf[start_offset..end_offset];
339            vortex_ensure!(
340                view.prefix == bytes[..4],
341                "VarBinView prefix does not match full string"
342            );
343
344            // Validate the full string
345            vortex_ensure!(
346                validator(bytes),
347                "view at index {idx}: outlined bytes failed utf-8 validation"
348            );
349        }
350    }
351
352    Ok(())
353}