vortex_vector/binaryview/
view.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4//! The 16-byte view struct stored in variable-length binary vectors.
5
6use std::fmt;
7use std::hash::{Hash, Hasher};
8use std::ops::Range;
9
10use static_assertions::{assert_eq_align, assert_eq_size};
11use vortex_buffer::ByteBuffer;
12use vortex_error::{VortexResult, VortexUnwrap, vortex_ensure, vortex_err};
13
14/// A view over a variable-length binary value.
15///
16/// Either an inlined representation (for values <= 12 bytes) or a reference
17/// to an external buffer (for values > 12 bytes).
18#[derive(Clone, Copy)]
19#[repr(C, align(16))]
20pub union BinaryView {
21    /// Numeric representation. This is logically `u128`, but we split it into the high and low
22    /// bits to preserve the alignment.
23    pub(crate) le_bytes: [u8; 16],
24
25    /// Inlined representation: strings <= 12 bytes
26    pub(crate) inlined: Inlined,
27
28    /// Reference type: strings > 12 bytes.
29    pub(crate) _ref: Ref,
30}
31
32assert_eq_align!(BinaryView, u128);
33assert_eq_size!(BinaryView, [u8; 16]);
34assert_eq_size!(Inlined, [u8; 16]);
35assert_eq_size!(Ref, [u8; 16]);
36
37/// Variant of a [`BinaryView`] that holds an inlined value.
38#[derive(Clone, Copy, Debug, PartialEq, Eq)]
39#[repr(C, align(8))]
40pub struct Inlined {
41    /// The size of the full value.
42    pub size: u32,
43    /// The full inlined value.
44    pub data: [u8; BinaryView::MAX_INLINED_SIZE],
45}
46
47impl Inlined {
48    /// Creates a new inlined representation from the provided value of constant size.
49    fn new<const N: usize>(value: &[u8]) -> Self {
50        debug_assert_eq!(value.len(), N);
51        let mut inlined = Self {
52            size: N.try_into().vortex_unwrap(),
53            data: [0u8; BinaryView::MAX_INLINED_SIZE],
54        };
55        inlined.data[..N].copy_from_slice(&value[..N]);
56        inlined
57    }
58
59    /// Returns the full inlined value.
60    #[inline]
61    pub fn value(&self) -> &[u8] {
62        &self.data[0..(self.size as usize)]
63    }
64}
65
/// Variant of a [`BinaryView`] that holds a reference to an external buffer.
#[derive(Clone, Copy, Debug)]
#[repr(C, align(8))]
pub struct Ref {
    /// The size of the full value, in bytes.
    pub size: u32,
    /// The prefix bytes of the value (first 4 bytes).
    pub prefix: [u8; 4],
    /// The index of the buffer where the full value is stored.
    pub buffer_index: u32,
    /// The offset within the buffer where the full value starts.
    pub offset: u32,
}

impl Ref {
    /// Returns the byte range within the referenced buffer that holds the full value.
    ///
    /// The end bound is computed in `usize`, not `u32`: `offset + size` may exceed `u32::MAX`,
    /// and a `u32` addition would then panic in debug builds and wrap around in release
    /// builds. The addition saturates, so the returned range never wraps; a range that does
    /// not fit the buffer is left for the caller's slicing / bounds checks to reject.
    #[inline]
    pub fn as_range(&self) -> Range<usize> {
        let start = self.offset as usize;
        start..start.saturating_add(self.size as usize)
    }

    /// Returns a copy of this reference that points at `buffer_index` / `offset` instead,
    /// keeping `size` and `prefix` unchanged.
    #[inline]
    pub fn with_buffer_and_offset(&self, buffer_index: u32, offset: u32) -> Ref {
        Self {
            buffer_index,
            offset,
            ..*self
        }
    }
}
98
99impl PartialEq for BinaryView {
100    fn eq(&self, other: &Self) -> bool {
101        let a = unsafe { std::mem::transmute::<&BinaryView, &u128>(self) };
102        let b = unsafe { std::mem::transmute::<&BinaryView, &u128>(other) };
103        a == b
104    }
105}
106impl Eq for BinaryView {}
107
108impl Hash for BinaryView {
109    fn hash<H: Hasher>(&self, state: &mut H) {
110        unsafe { std::mem::transmute::<&BinaryView, &u128>(self) }.hash(state);
111    }
112}
113
114impl Default for BinaryView {
115    fn default() -> Self {
116        Self::make_view(&[], 0, 0)
117    }
118}
119
120impl BinaryView {
121    /// Maximum size of an inlined binary value.
122    pub const MAX_INLINED_SIZE: usize = 12;
123
124    /// Create a view from a value, block and offset
125    ///
126    /// Depending on the length of the provided value either a new inlined
127    /// or a reference view will be constructed.
128    ///
129    /// Adapted from arrow-rs <https://github.com/apache/arrow-rs/blob/f4fde769ab6e1a9b75f890b7f8b47bc22800830b/arrow-array/src/builder/generic_bytes_view_builder.rs#L524>
130    /// Explicitly enumerating inlined view produces code that avoids calling generic `ptr::copy_non_interleave` that's slower than explicit stores
131    #[inline(never)]
132    pub fn make_view(value: &[u8], block: u32, offset: u32) -> Self {
133        match value.len() {
134            0 => Self {
135                inlined: Inlined::new::<0>(value),
136            },
137            1 => Self {
138                inlined: Inlined::new::<1>(value),
139            },
140            2 => Self {
141                inlined: Inlined::new::<2>(value),
142            },
143            3 => Self {
144                inlined: Inlined::new::<3>(value),
145            },
146            4 => Self {
147                inlined: Inlined::new::<4>(value),
148            },
149            5 => Self {
150                inlined: Inlined::new::<5>(value),
151            },
152            6 => Self {
153                inlined: Inlined::new::<6>(value),
154            },
155            7 => Self {
156                inlined: Inlined::new::<7>(value),
157            },
158            8 => Self {
159                inlined: Inlined::new::<8>(value),
160            },
161            9 => Self {
162                inlined: Inlined::new::<9>(value),
163            },
164            10 => Self {
165                inlined: Inlined::new::<10>(value),
166            },
167            11 => Self {
168                inlined: Inlined::new::<11>(value),
169            },
170            12 => Self {
171                inlined: Inlined::new::<12>(value),
172            },
173            _ => Self {
174                _ref: Ref {
175                    size: u32::try_from(value.len()).vortex_unwrap(),
176                    prefix: value[0..4].try_into().vortex_unwrap(),
177                    buffer_index: block,
178                    offset,
179                },
180            },
181        }
182    }
183
184    /// Create a new empty view
185    #[inline]
186    pub fn empty_view() -> Self {
187        Self { le_bytes: [0; 16] }
188    }
189
190    /// Create a new inlined binary view
191    ///
192    /// # Panics
193    ///
194    /// Panics if the provided string is too long to inline.
195    #[inline]
196    pub fn new_inlined(value: &[u8]) -> Self {
197        assert!(
198            value.len() <= Self::MAX_INLINED_SIZE,
199            "expected inlined value to be <= 12 bytes, was {}",
200            value.len()
201        );
202
203        Self::make_view(value, 0, 0)
204    }
205
206    /// Returns the length of the binary value.
207    #[inline]
208    pub fn len(&self) -> u32 {
209        unsafe { self.inlined.size }
210    }
211
212    /// Returns true if the binary value is empty.
213    #[inline]
214    pub fn is_empty(&self) -> bool {
215        self.len() == 0
216    }
217
218    /// Returns true if the binary value is inlined.
219    #[inline]
220    #[allow(clippy::cast_possible_truncation)]
221    pub fn is_inlined(&self) -> bool {
222        self.len() <= (Self::MAX_INLINED_SIZE as u32)
223    }
224
225    /// Returns the inlined representation of the binary value.
226    pub fn as_inlined(&self) -> &Inlined {
227        debug_assert!(self.is_inlined());
228        unsafe { &self.inlined }
229    }
230
231    /// Returns the reference representation of the binary value.
232    pub fn as_view(&self) -> &Ref {
233        debug_assert!(!self.is_inlined());
234        unsafe { &self._ref }
235    }
236
237    /// Returns a mutable reference to the reference representation of the binary value.
238    pub fn as_view_mut(&mut self) -> &mut Ref {
239        unsafe { &mut self._ref }
240    }
241
242    /// Returns the binary view as u128 representation.
243    pub fn as_u128(&self) -> u128 {
244        // SAFETY: binary view always safe to read as u128 LE bytes
245        unsafe { u128::from_le_bytes(self.le_bytes) }
246    }
247}
248
249impl From<u128> for BinaryView {
250    fn from(value: u128) -> Self {
251        BinaryView {
252            le_bytes: value.to_le_bytes(),
253        }
254    }
255}
256
257impl From<Ref> for BinaryView {
258    fn from(value: Ref) -> Self {
259        BinaryView { _ref: value }
260    }
261}
262
263impl fmt::Debug for BinaryView {
264    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
265        let mut s = f.debug_struct("BinaryView");
266        if self.is_inlined() {
267            s.field("inline", &self.as_inlined());
268        } else {
269            s.field("ref", &self.as_view());
270        }
271        s.finish()
272    }
273}
274
275/// Validate that all views either
276///
277/// 1. Contain valid inline data that conforms to type constraints as defined by the `validator`
278/// 2. Points at a valid range of owned buffer memory, and the bytes stored there conform to
279///    the type constraints as defined by the `validator`.
280pub(super) fn validate_views<ValidateFn, IsValidFn>(
281    views: &[BinaryView],
282    buffers: impl AsRef<[ByteBuffer]>,
283    validity: IsValidFn,
284    validator: ValidateFn,
285) -> VortexResult<()>
286where
287    IsValidFn: Fn(usize) -> bool,
288    ValidateFn: Fn(&[u8]) -> bool,
289{
290    let buffers = buffers.as_ref();
291    for (idx, &view) in views.iter().enumerate() {
292        if !validity(idx) {
293            continue;
294        }
295
296        if view.is_inlined() {
297            // Validate the inline bytestring
298            let bytes = &unsafe { view.inlined }.data[..view.len() as usize];
299            vortex_ensure!(
300                validator(bytes),
301                "view at index {idx}: inlined bytes failed utf-8 validation"
302            );
303        } else {
304            // Validate the view pointer
305            let view = view.as_view();
306            let buf_index = view.buffer_index as usize;
307            let start_offset = view.offset as usize;
308            let end_offset = start_offset.saturating_add(view.size as usize);
309
310            let buf = buffers.get(buf_index).ok_or_else(||
311                vortex_err!("view at index {idx} references invalid buffer: {buf_index} out of bounds for BinaryViewVector with {} buffers",
312                        buffers.len()))?;
313
314            vortex_ensure!(
315                start_offset < buf.len(),
316                "start offset {start_offset} out of bounds for buffer {buf_index} with size {}",
317                buf.len(),
318            );
319
320            vortex_ensure!(
321                end_offset <= buf.len(),
322                "end offset {end_offset} out of bounds for buffer {buf_index} with size {}",
323                buf.len(),
324            );
325
326            // Make sure the prefix data matches the buffer data.
327            let bytes = &buf[start_offset..end_offset];
328            vortex_ensure!(
329                view.prefix == bytes[..4],
330                "VarBinView prefix does not match full string"
331            );
332
333            // Validate the full string
334            vortex_ensure!(
335                validator(bytes),
336                "view at index {idx}: outlined bytes failed utf-8 validation"
337            );
338        }
339    }
340
341    Ok(())
342}