polars_arrow/array/binview/
view.rs

1use std::cmp::Ordering;
2use std::fmt::{self, Display, Formatter};
3
4use bytemuck::{Pod, Zeroable};
5use polars_error::*;
6use polars_utils::min_max::MinMax;
7use polars_utils::nulls::IsNull;
8use polars_utils::total_ord::{TotalEq, TotalOrd};
9
10use crate::datatypes::PrimitiveType;
11use crate::types::{Bytes16Alignment4, NativeType};
12
13// We use this instead of u128 because we want alignment of <= 8 bytes.
14/// A reference to a set of bytes.
15///
16/// If `length <= 12`, these bytes are inlined over the `prefix`, `buffer_idx` and `offset` fields.
17/// If `length > 12`, these fields specify a slice of a buffer.
18#[derive(Copy, Clone, Default)]
19#[repr(C)]
20pub struct View {
21    /// The length of the string/bytes.
22    pub length: u32,
23    /// First 4 bytes of string/bytes data.
24    pub prefix: u32,
25    /// The buffer index.
26    pub buffer_idx: u32,
27    /// The offset into the buffer.
28    pub offset: u32,
29}
30
31impl fmt::Debug for View {
32    fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result {
33        if self.length <= Self::MAX_INLINE_SIZE {
34            fmt.debug_struct("View")
35                .field("length", &self.length)
36                .field("content", &unsafe {
37                    std::slice::from_raw_parts(
38                        (self as *const _ as *const u8).add(4),
39                        self.length as usize,
40                    )
41                })
42                .finish()
43        } else {
44            fmt.debug_struct("View")
45                .field("length", &self.length)
46                .field("prefix", &self.prefix.to_be_bytes())
47                .field("buffer_idx", &self.buffer_idx)
48                .field("offset", &self.offset)
49                .finish()
50        }
51    }
52}
53
54impl View {
55    pub const MAX_INLINE_SIZE: u32 = 12;
56
57    #[inline(always)]
58    pub fn is_inline(&self) -> bool {
59        self.length <= Self::MAX_INLINE_SIZE
60    }
61
62    #[inline(always)]
63    pub fn as_u128(self) -> u128 {
64        unsafe { std::mem::transmute(self) }
65    }
66
67    /// Create a new inline view without verifying the length
68    ///
69    /// # Safety
70    ///
71    /// It needs to hold that `bytes.len() <= View::MAX_INLINE_SIZE`.
72    #[inline]
73    pub unsafe fn new_inline_unchecked(bytes: &[u8]) -> Self {
74        debug_assert!(bytes.len() <= u32::MAX as usize);
75        debug_assert!(bytes.len() as u32 <= Self::MAX_INLINE_SIZE);
76
77        let mut view = Self {
78            length: bytes.len() as u32,
79            ..Default::default()
80        };
81
82        let view_ptr = &mut view as *mut _ as *mut u8;
83
84        // SAFETY:
85        // - bytes length <= 12,
86        // - size_of::<View> == 16
87        // - View is laid out as [length, prefix, buffer_idx, offset] (using repr(C))
88        // - By grabbing the view_ptr and adding 4, we have provenance over prefix, buffer_idx and
89        // offset. (i.e. the same could not be achieved with &mut self.prefix as *mut _ as *mut u8)
90        unsafe {
91            let inline_data_ptr = view_ptr.add(4);
92            core::ptr::copy_nonoverlapping(bytes.as_ptr(), inline_data_ptr, bytes.len());
93        }
94        view
95    }
96
97    /// Create a new inline view
98    ///
99    /// # Panics
100    ///
101    /// Panics if the `bytes.len() > View::MAX_INLINE_SIZE`.
102    #[inline]
103    pub fn new_inline(bytes: &[u8]) -> Self {
104        assert!(bytes.len() as u32 <= Self::MAX_INLINE_SIZE);
105        unsafe { Self::new_inline_unchecked(bytes) }
106    }
107
108    /// Create a new inline view
109    ///
110    /// # Safety
111    ///
112    /// It needs to hold that `bytes.len() > View::MAX_INLINE_SIZE`.
113    #[inline]
114    pub unsafe fn new_noninline_unchecked(bytes: &[u8], buffer_idx: u32, offset: u32) -> Self {
115        debug_assert!(bytes.len() <= u32::MAX as usize);
116        debug_assert!(bytes.len() as u32 > View::MAX_INLINE_SIZE);
117
118        // SAFETY: The invariant of this function guarantees that this is safe.
119        let prefix = unsafe { u32::from_le_bytes(bytes[0..4].try_into().unwrap_unchecked()) };
120        Self {
121            length: bytes.len() as u32,
122            prefix,
123            buffer_idx,
124            offset,
125        }
126    }
127
128    #[inline]
129    pub fn new_from_bytes(bytes: &[u8], buffer_idx: u32, offset: u32) -> Self {
130        debug_assert!(bytes.len() <= u32::MAX as usize);
131
132        // SAFETY: We verify the invariant with the outer if statement
133        unsafe {
134            if bytes.len() as u32 <= Self::MAX_INLINE_SIZE {
135                Self::new_inline_unchecked(bytes)
136            } else {
137                Self::new_noninline_unchecked(bytes, buffer_idx, offset)
138            }
139        }
140    }
141
142    /// Constructs a byteslice from this view.
143    ///
144    /// # Safety
145    /// Assumes that this view is valid for the given buffers.
146    #[inline]
147    pub unsafe fn get_slice_unchecked<'a, B: AsRef<[u8]>>(&'a self, buffers: &'a [B]) -> &'a [u8] {
148        unsafe {
149            if self.length <= Self::MAX_INLINE_SIZE {
150                self.get_inlined_slice_unchecked()
151            } else {
152                self.get_external_slice_unchecked(buffers)
153            }
154        }
155    }
156
157    /// Construct a byte slice from an inline view, if it is inline.
158    #[inline]
159    pub fn get_inlined_slice(&self) -> Option<&[u8]> {
160        if self.length <= Self::MAX_INLINE_SIZE {
161            unsafe { Some(self.get_inlined_slice_unchecked()) }
162        } else {
163            None
164        }
165    }
166
167    /// Construct a byte slice from an inline view.
168    ///
169    /// # Safety
170    /// Assumes that this view is inlinable.
171    #[inline]
172    pub unsafe fn get_inlined_slice_unchecked(&self) -> &[u8] {
173        debug_assert!(self.length <= View::MAX_INLINE_SIZE);
174        let ptr = self as *const View as *const u8;
175        unsafe { std::slice::from_raw_parts(ptr.add(4), self.length as usize) }
176    }
177
178    /// Construct a byte slice from an external view.
179    ///
180    /// # Safety
181    /// Assumes that this view is in the external buffers.
182    #[inline]
183    pub unsafe fn get_external_slice_unchecked<'a, B: AsRef<[u8]>>(
184        &self,
185        buffers: &'a [B],
186    ) -> &'a [u8] {
187        debug_assert!(self.length > View::MAX_INLINE_SIZE);
188        let data = buffers.get_unchecked(self.buffer_idx as usize);
189        let offset = self.offset as usize;
190        data.as_ref()
191            .get_unchecked(offset..offset + self.length as usize)
192    }
193
194    /// Extend a `Vec<View>` with inline views slices of `src` with `width`.
195    ///
196    /// This tries to use SIMD to optimize the copying and can be massively faster than doing a
197    /// `views.extend(src.chunks_exact(width).map(View::new_inline))`.
198    ///
199    /// # Panics
200    ///
201    /// This function panics if `src.len()` is not divisible by `width`, `width >
202    /// View::MAX_INLINE_SIZE` or `width == 0`.
203    pub fn extend_with_inlinable_strided(views: &mut Vec<Self>, src: &[u8], width: u8) {
204        macro_rules! dispatch {
205            ($n:ident = $match:ident in [$($v:literal),+ $(,)?] => $block:block, otherwise = $otherwise:expr) => {
206                match $match {
207                    $(
208                        $v => {
209                            const $n: usize = $v;
210
211                            $block
212                        }
213                    )+
214                    _ => $otherwise,
215                }
216            }
217        }
218
219        let width = width as usize;
220
221        assert!(width > 0);
222        assert!(width <= View::MAX_INLINE_SIZE as usize);
223
224        assert_eq!(src.len() % width, 0);
225
226        let num_values = src.len() / width;
227
228        views.reserve(num_values);
229
230        #[allow(unused_mut)]
231        let mut src = src;
232
233        dispatch! {
234            N = width in [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12] => {
235                #[cfg(feature = "simd")]
236                {
237                    macro_rules! repeat_with {
238                        ($i:ident = [$($v:literal),+ $(,)?] => $block:block) => {
239                            $({
240                                const $i: usize = $v;
241
242                                $block
243                            })+
244                        }
245                    }
246
247                    use std::simd::*;
248
249                    // SAFETY: This is always allowed, since views.len() is always in the Vec
250                    // buffer.
251                    let mut dst = unsafe { views.as_mut_ptr().add(views.len()).cast::<u8>() };
252
253                    let length_mask = u8x16::from_array([N as u8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
254
255                    const BLOCKS_PER_LOAD: usize = 16 / N;
256                    const BYTES_PER_LOOP: usize = N * BLOCKS_PER_LOAD;
257
258                    let num_loops = (src.len() / BYTES_PER_LOOP).saturating_sub(1);
259
260                    for _ in 0..num_loops {
261                        // SAFETY: The num_loops calculates how many times we can do this.
262                        let loaded = u8x16::from_array(unsafe {
263                            src.get_unchecked(..16).try_into().unwrap()
264                        });
265                        src = unsafe { src.get_unchecked(BYTES_PER_LOOP..) };
266
267                        // This way we can reuse the same load for multiple views.
268                        repeat_with!(
269                            I = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] => {
270                                if I < BLOCKS_PER_LOAD {
271                                    let zero = u8x16::default();
272                                    const SWIZZLE: [usize; 16] = const {
273                                        let mut swizzle = [16usize; 16];
274
275                                        let mut i = 0;
276                                        while i < N {
277                                            let idx = i + I * N;
278                                            if idx < 16 {
279                                                swizzle[4+i] = idx;
280                                            }
281                                            i += 1;
282                                        }
283
284                                        swizzle
285                                    };
286
287                                    let scattered = simd_swizzle!(loaded, zero, SWIZZLE);
288                                    let view_bytes = (scattered | length_mask).to_array();
289
290                                    // SAFETY: dst has the capacity reserved and view_bytes is 16
291                                    // bytes long.
292                                    unsafe {
293                                        core::ptr::copy_nonoverlapping(view_bytes.as_ptr(), dst, 16);
294                                        dst = dst.add(16);
295                                    }
296                                }
297                            }
298                        );
299                    }
300
301                    unsafe {
302                        views.set_len(views.len() + num_loops * BLOCKS_PER_LOAD);
303                    }
304                }
305
306                views.extend(src.chunks_exact(N).map(|slice| unsafe {
307                    View::new_inline_unchecked(slice)
308                }));
309            },
310            otherwise = unreachable!()
311        }
312    }
313}
314
315impl IsNull for View {
316    const HAS_NULLS: bool = false;
317    type Inner = Self;
318
319    fn is_null(&self) -> bool {
320        false
321    }
322
323    fn unwrap_inner(self) -> Self::Inner {
324        self
325    }
326}
327
328impl Display for View {
329    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
330        write!(f, "{:?}", self)
331    }
332}
333
334unsafe impl Zeroable for View {}
335
336unsafe impl Pod for View {}
337
338impl PartialEq for View {
339    fn eq(&self, other: &Self) -> bool {
340        self.as_u128() == other.as_u128()
341    }
342}
343
344// These are 'implemented' because we want to implement NativeType
345// for View, that should probably not be done.
346impl TotalOrd for View {
347    fn tot_cmp(&self, _other: &Self) -> Ordering {
348        unimplemented!()
349    }
350}
351
352impl TotalEq for View {
353    fn tot_eq(&self, other: &Self) -> bool {
354        self.eq(other)
355    }
356}
357
358impl MinMax for View {
359    fn nan_min_lt(&self, _other: &Self) -> bool {
360        unimplemented!()
361    }
362
363    fn nan_max_lt(&self, _other: &Self) -> bool {
364        unimplemented!()
365    }
366}
367
368impl NativeType for View {
369    const PRIMITIVE: PrimitiveType = PrimitiveType::UInt128;
370
371    type Bytes = [u8; 16];
372    type AlignedBytes = Bytes16Alignment4;
373
374    #[inline]
375    fn to_le_bytes(&self) -> Self::Bytes {
376        self.as_u128().to_le_bytes()
377    }
378
379    #[inline]
380    fn to_be_bytes(&self) -> Self::Bytes {
381        self.as_u128().to_be_bytes()
382    }
383
384    #[inline]
385    fn from_le_bytes(bytes: Self::Bytes) -> Self {
386        Self::from(u128::from_le_bytes(bytes))
387    }
388
389    #[inline]
390    fn from_be_bytes(bytes: Self::Bytes) -> Self {
391        Self::from(u128::from_be_bytes(bytes))
392    }
393}
394
395impl From<u128> for View {
396    #[inline]
397    fn from(value: u128) -> Self {
398        unsafe { std::mem::transmute(value) }
399    }
400}
401
402impl From<View> for u128 {
403    #[inline]
404    fn from(value: View) -> Self {
405        value.as_u128()
406    }
407}
408
409pub fn validate_views<B: AsRef<[u8]>, F>(
410    views: &[View],
411    buffers: &[B],
412    validate_bytes: F,
413) -> PolarsResult<()>
414where
415    F: Fn(&[u8]) -> PolarsResult<()>,
416{
417    for view in views {
418        if let Some(inline_slice) = view.get_inlined_slice() {
419            if view.length < View::MAX_INLINE_SIZE && view.as_u128() >> (32 + view.length * 8) != 0
420            {
421                polars_bail!(ComputeError: "view contained non-zero padding in prefix");
422            }
423
424            validate_bytes(inline_slice)?;
425        } else {
426            let data = buffers.get(view.buffer_idx as usize).ok_or_else(|| {
427                polars_err!(OutOfBounds: "view index out of bounds\n\nGot: {} buffers and index: {}", buffers.len(), view.buffer_idx)
428            })?;
429
430            let start = view.offset as usize;
431            let end = start + view.length as usize;
432            let b = data
433                .as_ref()
434                .get(start..end)
435                .ok_or_else(|| polars_err!(OutOfBounds: "buffer slice out of bounds"))?;
436
437            polars_ensure!(b.starts_with(&view.prefix.to_le_bytes()), ComputeError: "prefix does not match string data");
438            validate_bytes(b)?;
439        }
440    }
441
442    Ok(())
443}
444
445pub fn validate_binary_views<B: AsRef<[u8]>>(views: &[View], buffers: &[B]) -> PolarsResult<()> {
446    validate_views(views, buffers, |_| Ok(()))
447}
448
449fn validate_utf8(b: &[u8]) -> PolarsResult<()> {
450    match simdutf8::basic::from_utf8(b) {
451        Ok(_) => Ok(()),
452        Err(_) => Err(polars_err!(ComputeError: "invalid utf8")),
453    }
454}
455
456pub fn validate_utf8_views<B: AsRef<[u8]>>(views: &[View], buffers: &[B]) -> PolarsResult<()> {
457    validate_views(views, buffers, validate_utf8)
458}
459
460/// Checks the views for valid UTF-8. Assumes the first num_trusted_buffers are
461/// valid UTF-8 without checking.
462/// # Safety
463/// The views and buffers must uphold the invariants of BinaryView otherwise we will go OOB.
464pub unsafe fn validate_views_utf8_only<B: AsRef<[u8]>>(
465    views: &[View],
466    buffers: &[B],
467    mut num_trusted_buffers: usize,
468) -> PolarsResult<()> {
469    unsafe {
470        while num_trusted_buffers < buffers.len()
471            && buffers[num_trusted_buffers].as_ref().is_ascii()
472        {
473            num_trusted_buffers += 1;
474        }
475
476        // Fast path if all buffers are ASCII (or there are no buffers).
477        if num_trusted_buffers >= buffers.len() {
478            for view in views {
479                if let Some(inlined_slice) = view.get_inlined_slice() {
480                    validate_utf8(inlined_slice)?;
481                }
482            }
483        } else {
484            for view in views {
485                if view.length <= View::MAX_INLINE_SIZE
486                    || view.buffer_idx as usize >= num_trusted_buffers
487                {
488                    validate_utf8(view.get_slice_unchecked(buffers))?;
489                }
490            }
491        }
492
493        Ok(())
494    }
495}