Skip to main content

vortex_array/arrays/varbinview/
view.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4//! The 16-byte view struct stored in variable-length binary vectors.
5
6use std::fmt;
7use std::hash::Hash;
8use std::hash::Hasher;
9use std::ops::Range;
10
11use static_assertions::assert_eq_align;
12use static_assertions::assert_eq_size;
13use vortex_error::VortexExpect;
14
15/// A view over a variable-length binary value.
16///
17/// Either an inlined representation (for values <= 12 bytes) or a reference
18/// to an external buffer (for values > 12 bytes).
19#[derive(Clone, Copy)]
20#[repr(C, align(16))]
21pub union BinaryView {
22    /// Numeric representation. This is logically `u128`, but we split it into the high and low
23    /// bits to preserve the alignment.
24    pub(crate) le_bytes: [u8; 16],
25
26    /// Inlined representation: strings <= 12 bytes
27    pub(crate) inlined: Inlined,
28
29    /// Reference type: strings > 12 bytes.
30    pub(crate) _ref: Ref,
31}
32
33assert_eq_align!(BinaryView, u128);
34assert_eq_size!(BinaryView, [u8; 16]);
35assert_eq_size!(Inlined, [u8; 16]);
36assert_eq_size!(Ref, [u8; 16]);
37
38/// Variant of a [`BinaryView`] that holds an inlined value.
39#[derive(Clone, Copy, Debug, PartialEq, Eq)]
40#[repr(C, align(8))]
41pub struct Inlined {
42    /// The size of the full value.
43    pub size: u32,
44    /// The full inlined value.
45    pub data: [u8; BinaryView::MAX_INLINED_SIZE],
46}
47
48impl Inlined {
49    /// Creates a new inlined representation from the provided value of constant size.
50    fn new<const N: usize>(value: &[u8]) -> Self {
51        debug_assert_eq!(value.len(), N);
52        let mut inlined = Self {
53            size: N.try_into().vortex_expect("inlined size must fit in u32"),
54            data: [0u8; BinaryView::MAX_INLINED_SIZE],
55        };
56        inlined.data[..N].copy_from_slice(&value[..N]);
57        inlined
58    }
59
60    /// Returns the full inlined value.
61    #[inline]
62    pub fn value(&self) -> &[u8] {
63        &self.data[0..(self.size as usize)]
64    }
65}
66
/// Variant of a [`BinaryView`] that holds a reference to an external buffer.
#[derive(Clone, Copy, Debug)]
#[repr(C, align(8))]
pub struct Ref {
    /// The size of the full value, in bytes.
    pub size: u32,
    /// The prefix bytes of the value (first 4 bytes).
    pub prefix: [u8; 4],
    /// The index of the buffer where the full value is stored.
    pub buffer_index: u32,
    /// The offset within the buffer where the full value starts.
    pub offset: u32,
}

impl Ref {
    /// Returns the byte range within the buffer where the full value is stored, i.e.
    /// `offset..offset + size`.
    ///
    /// The end of the range is computed in `usize` rather than `u32`, so a value whose end lies
    /// beyond `u32::MAX` neither panics (debug builds) nor wraps around (release builds).
    #[inline]
    pub fn as_range(&self) -> Range<usize> {
        let start = self.offset as usize;
        // Widen before adding: `offset + size` may not fit in a `u32`.
        start..start + self.size as usize
    }

    /// Returns a copy of this reference that points at `buffer_index` / `offset` instead, keeping
    /// the original `size` and `prefix`.
    #[inline]
    pub fn with_buffer_and_offset(&self, buffer_index: u32, offset: u32) -> Ref {
        Self {
            size: self.size,
            prefix: self.prefix,
            buffer_index,
            offset,
        }
    }
}
99
100impl BinaryView {
101    /// Maximum size of an inlined binary value.
102    pub const MAX_INLINED_SIZE: usize = 12;
103
104    /// Create a view from a value, block and offset
105    ///
106    /// Depending on the length of the provided value either a new inlined
107    /// or a reference view will be constructed.
108    ///
109    /// Adapted from arrow-rs <https://github.com/apache/arrow-rs/blob/f4fde769ab6e1a9b75f890b7f8b47bc22800830b/arrow-array/src/builder/generic_bytes_view_builder.rs#L524>
110    /// Explicitly enumerating inlined view produces code that avoids calling generic `ptr::copy_non_interleave` that's slower than explicit stores
111    #[inline(never)]
112    pub fn make_view(value: &[u8], block: u32, offset: u32) -> Self {
113        match value.len() {
114            0 => Self {
115                inlined: Inlined::new::<0>(value),
116            },
117            1 => Self {
118                inlined: Inlined::new::<1>(value),
119            },
120            2 => Self {
121                inlined: Inlined::new::<2>(value),
122            },
123            3 => Self {
124                inlined: Inlined::new::<3>(value),
125            },
126            4 => Self {
127                inlined: Inlined::new::<4>(value),
128            },
129            5 => Self {
130                inlined: Inlined::new::<5>(value),
131            },
132            6 => Self {
133                inlined: Inlined::new::<6>(value),
134            },
135            7 => Self {
136                inlined: Inlined::new::<7>(value),
137            },
138            8 => Self {
139                inlined: Inlined::new::<8>(value),
140            },
141            9 => Self {
142                inlined: Inlined::new::<9>(value),
143            },
144            10 => Self {
145                inlined: Inlined::new::<10>(value),
146            },
147            11 => Self {
148                inlined: Inlined::new::<11>(value),
149            },
150            12 => Self {
151                inlined: Inlined::new::<12>(value),
152            },
153            _ => Self::new_ref(
154                u32::try_from(value.len()).vortex_expect("value length must fit in u32"),
155                value[0..4]
156                    .try_into()
157                    .ok()
158                    .vortex_expect("prefix must be exactly 4 bytes"),
159                block,
160                offset,
161            ),
162        }
163    }
164
165    /// Create a new empty view
166    #[inline]
167    pub fn empty_view() -> Self {
168        Self { le_bytes: [0; 16] }
169    }
170
171    /// Create a reference view directly from its components, without inspecting the value.
172    ///
173    /// `size` must be greater than [`MAX_INLINED_SIZE`], and `prefix` must hold the first four
174    /// bytes of the value. This is the fast path for bulk view construction where the caller has
175    /// already established that the value is too long to inline; it assembles the 16-byte view as a
176    /// single `u128` so the compiler can emit one wide store per view.
177    ///
178    /// [`MAX_INLINED_SIZE`]: Self::MAX_INLINED_SIZE
179    #[inline]
180    pub fn new_ref(size: u32, prefix: [u8; 4], buffer_index: u32, offset: u32) -> Self {
181        debug_assert!(size as usize > Self::MAX_INLINED_SIZE);
182        // Matches the little-endian field order of `Ref` (size, prefix, buffer_index, offset),
183        // consistent with `le_bytes` and the `From<u128>`/`as_u128` representation.
184        Self::from(
185            u128::from(size)
186                | (u128::from(u32::from_le_bytes(prefix)) << 32)
187                | (u128::from(buffer_index) << 64)
188                | (u128::from(offset) << 96),
189        )
190    }
191
192    /// Create a new inlined binary view
193    ///
194    /// # Panics
195    ///
196    /// Panics if the provided string is too long to inline.
197    #[inline]
198    pub fn new_inlined(value: &[u8]) -> Self {
199        assert!(
200            value.len() <= Self::MAX_INLINED_SIZE,
201            "expected inlined value to be <= 12 bytes, was {}",
202            value.len()
203        );
204
205        Self::make_view(value, 0, 0)
206    }
207
208    /// Returns the length of the binary value.
209    #[inline]
210    pub fn len(&self) -> u32 {
211        unsafe { self.inlined.size }
212    }
213
214    /// Returns true if the binary value is empty.
215    #[inline]
216    pub fn is_empty(&self) -> bool {
217        self.len() == 0
218    }
219
220    /// Returns true if the binary value is inlined.
221    #[inline]
222    #[expect(
223        clippy::cast_possible_truncation,
224        reason = "MAX_INLINED_SIZE is a small constant"
225    )]
226    pub fn is_inlined(&self) -> bool {
227        self.len() <= (Self::MAX_INLINED_SIZE as u32)
228    }
229
230    /// Returns the inlined representation of the binary value.
231    pub fn as_inlined(&self) -> &Inlined {
232        debug_assert!(self.is_inlined());
233        unsafe { &self.inlined }
234    }
235
236    /// Returns the reference representation of the binary value.
237    pub fn as_view(&self) -> &Ref {
238        debug_assert!(!self.is_inlined());
239        unsafe { &self._ref }
240    }
241
242    /// Returns a mutable reference to the reference representation of the binary value.
243    pub fn as_view_mut(&mut self) -> &mut Ref {
244        unsafe { &mut self._ref }
245    }
246
247    /// Returns the binary view as u128 representation.
248    pub fn as_u128(&self) -> u128 {
249        // SAFETY: binary view always safe to read as u128 LE bytes
250        unsafe { u128::from_le_bytes(self.le_bytes) }
251    }
252}
253
254impl From<u128> for BinaryView {
255    fn from(value: u128) -> Self {
256        BinaryView {
257            le_bytes: value.to_le_bytes(),
258        }
259    }
260}
261
262impl From<Ref> for BinaryView {
263    fn from(value: Ref) -> Self {
264        BinaryView { _ref: value }
265    }
266}
267
268impl PartialEq for BinaryView {
269    fn eq(&self, other: &Self) -> bool {
270        let a = unsafe { std::mem::transmute::<&BinaryView, &u128>(self) };
271        let b = unsafe { std::mem::transmute::<&BinaryView, &u128>(other) };
272        a == b
273    }
274}
275impl Eq for BinaryView {}
276
277impl Hash for BinaryView {
278    fn hash<H: Hasher>(&self, state: &mut H) {
279        unsafe { std::mem::transmute::<&BinaryView, &u128>(self) }.hash(state);
280    }
281}
282
283impl Default for BinaryView {
284    fn default() -> Self {
285        Self::make_view(&[], 0, 0)
286    }
287}
288
289impl fmt::Debug for BinaryView {
290    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
291        let mut s = f.debug_struct("BinaryView");
292        if self.is_inlined() {
293            s.field("inline", &self.as_inlined());
294        } else {
295            s.field("ref", &self.as_view());
296        }
297        s.finish()
298    }
299}
300
#[cfg(test)]
mod tests {
    use super::*;

    #[rstest::rstest]
    // Lengths: the first size that cannot be inlined, then typical and large values.
    #[case(13, 7, 42)]
    #[case(20, 7, 42)]
    #[case(255, 7, 42)]
    #[case(4096, 7, 42)]
    // Buffer index/offset at zero and at `u32::MAX`, to check that assembling the `u128` never
    // spills one field into its neighbours.
    #[case(13, 0, 0)]
    #[case(13, u32::MAX, u32::MAX)]
    fn new_ref_matches_make_view(#[case] len: u32, #[case] buffer_index: u32, #[case] offset: u32) {
        // Deterministic payload of `len` bytes.
        let value: Vec<u8> = (0..len)
            .map(|i| u8::try_from(i % 251).vortex_expect("i % 251 fits in u8"))
            .collect();
        let prefix = [value[0], value[1], value[2], value[3]];

        // `new_ref` builds the view from a single `u128`; for any value longer than the inline
        // limit it must be byte-identical to what the value-inspecting `make_view` produces.
        let expected = BinaryView::make_view(&value, buffer_index, offset);
        let actual = BinaryView::new_ref(len, prefix, buffer_index, offset);
        assert_eq!(expected.as_u128(), actual.as_u128(), "mismatch at len {len}");

        // Every component must also read back unchanged through the `Ref` layout.
        assert!(!actual.is_inlined());
        let reference = actual.as_view();
        assert_eq!(reference.size, len);
        assert_eq!(reference.prefix, prefix);
        assert_eq!(reference.buffer_index, buffer_index);
        assert_eq!(reference.offset, offset);
    }
}
332}