Skip to main content

polars_arrow/array/binview/
view.rs

1use std::cmp::Ordering;
2use std::fmt::{self, Display, Formatter};
3
4use bytemuck::{Pod, Zeroable};
5use polars_error::*;
6use polars_utils::min_max::MinMax;
7use polars_utils::nulls::IsNull;
8use polars_utils::total_ord::{TotalEq, TotalOrd};
9
10use crate::datatypes::PrimitiveType;
11use crate::types::{Bytes16Alignment4, NativeType};
12
13// We use this instead of u128 because we want alignment of <= 8 bytes.
14/// A reference to a set of bytes.
15///
16/// If `length <= 12`, these bytes are inlined over the `prefix`, `buffer_idx` and `offset` fields.
17/// If `length > 12`, these fields specify a slice of a buffer.
18#[derive(Copy, Clone, Default)]
19#[repr(C)]
20pub struct View {
21    /// The length of the string/bytes.
22    pub length: u32,
23    /// First 4 bytes of string/bytes data.
24    pub prefix: u32,
25    /// The buffer index.
26    pub buffer_idx: u32,
27    /// The offset into the buffer.
28    pub offset: u32,
29}
30
31impl fmt::Debug for View {
32    fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result {
33        if self.length <= Self::MAX_INLINE_SIZE {
34            fmt.debug_struct("View")
35                .field("length", &self.length)
36                .field("content", &unsafe {
37                    std::slice::from_raw_parts(
38                        (self as *const _ as *const u8).add(4),
39                        self.length as usize,
40                    )
41                })
42                .finish()
43        } else {
44            fmt.debug_struct("View")
45                .field("length", &self.length)
46                .field("prefix", &self.prefix.to_be_bytes())
47                .field("buffer_idx", &self.buffer_idx)
48                .field("offset", &self.offset)
49                .finish()
50        }
51    }
52}
53
54impl View {
55    pub const MAX_INLINE_SIZE: u32 = 12;
56
57    #[inline(always)]
58    pub fn is_inline(&self) -> bool {
59        self.length <= Self::MAX_INLINE_SIZE
60    }
61
62    #[inline(always)]
63    pub fn as_u128(self) -> u128 {
64        unsafe { std::mem::transmute(self) }
65    }
66
67    /// Create a new inline view without verifying the length
68    ///
69    /// # Safety
70    ///
71    /// It needs to hold that `bytes.len() <= View::MAX_INLINE_SIZE`.
72    #[inline]
73    pub unsafe fn new_inline_unchecked(bytes: &[u8]) -> Self {
74        debug_assert!(bytes.len() <= u32::MAX as usize);
75        debug_assert!(bytes.len() as u32 <= Self::MAX_INLINE_SIZE);
76
77        let mut view = Self {
78            length: bytes.len() as u32,
79            ..Default::default()
80        };
81
82        let view_ptr = &mut view as *mut _ as *mut u8;
83
84        // SAFETY:
85        // - bytes length <= 12,
86        // - size_of::<View> == 16
87        // - View is laid out as [length, prefix, buffer_idx, offset] (using repr(C))
88        // - By grabbing the view_ptr and adding 4, we have provenance over prefix, buffer_idx and
89        // offset. (i.e. the same could not be achieved with &mut self.prefix as *mut _ as *mut u8)
90        unsafe {
91            let inline_data_ptr = view_ptr.add(4);
92            core::ptr::copy_nonoverlapping(bytes.as_ptr(), inline_data_ptr, bytes.len());
93        }
94        view
95    }
96
97    /// Create a new inline view
98    ///
99    /// # Panics
100    ///
101    /// Panics if the `bytes.len() > View::MAX_INLINE_SIZE`.
102    #[inline]
103    pub fn new_inline(bytes: &[u8]) -> Self {
104        assert!(bytes.len() as u32 <= Self::MAX_INLINE_SIZE);
105        unsafe { Self::new_inline_unchecked(bytes) }
106    }
107
108    /// Create a new inline view
109    ///
110    /// # Safety
111    ///
112    /// It needs to hold that `bytes.len() > View::MAX_INLINE_SIZE`.
113    #[inline]
114    pub unsafe fn new_noninline_unchecked(bytes: &[u8], buffer_idx: u32, offset: u32) -> Self {
115        debug_assert!(bytes.len() <= u32::MAX as usize);
116        debug_assert!(bytes.len() as u32 > View::MAX_INLINE_SIZE);
117
118        // SAFETY: The invariant of this function guarantees that this is safe.
119        let prefix = unsafe { u32::from_le_bytes(bytes[0..4].try_into().unwrap_unchecked()) };
120        Self {
121            length: bytes.len() as u32,
122            prefix,
123            buffer_idx,
124            offset,
125        }
126    }
127
128    #[inline]
129    pub fn new_from_bytes(bytes: &[u8], buffer_idx: u32, offset: u32) -> Self {
130        assert!(bytes.len() <= u32::MAX as usize);
131
132        // SAFETY: We verify the invariant with the outer if statement
133        unsafe {
134            if bytes.len() as u32 <= Self::MAX_INLINE_SIZE {
135                Self::new_inline_unchecked(bytes)
136            } else {
137                Self::new_noninline_unchecked(bytes, buffer_idx, offset)
138            }
139        }
140    }
141
142    #[inline]
143    pub fn new_with_buffers(
144        bytes: &[u8],
145        buffer_idx_offset: u32,
146        buffers: &mut Vec<Vec<u8>>,
147    ) -> Self {
148        unsafe {
149            if bytes.len() <= Self::MAX_INLINE_SIZE as usize {
150                Self::new_inline_unchecked(bytes)
151            } else {
152                assert!(bytes.len() <= u32::MAX as usize);
153                let num_buffers = buffers.len();
154                if let Some(buf) = buffers.last_mut() {
155                    if buf.len() + bytes.len() <= buf.capacity() {
156                        let buf_idx = buffer_idx_offset + num_buffers as u32 - 1;
157                        let offset = buf.len() as u32;
158                        buf.extend_from_slice(bytes);
159                        return Self::new_noninline_unchecked(bytes, buf_idx, offset);
160                    }
161                }
162
163                Self::new_with_buffers_slow(bytes, buffer_idx_offset, buffers)
164            }
165        }
166    }
167
168    /// # Safety
169    ///
170    /// It needs to hold that `bytes.len() > View::MAX_INLINE_SIZE`.
171    #[cold]
172    unsafe fn new_with_buffers_slow(
173        bytes: &[u8],
174        buffer_idx_offset: u32,
175        buffers: &mut Vec<Vec<u8>>,
176    ) -> Self {
177        // Resize last buffer instead of making new buffer.
178        const RESIZE_LIMIT: usize = 8192;
179
180        if buffers.is_empty() {
181            buffers.push(bytes.to_vec());
182            return View::new_noninline_unchecked(bytes, buffer_idx_offset, 0);
183        }
184
185        let num_buffers = buffers.len();
186        let old_cap = buffers.last().unwrap().capacity();
187        let new_cap = (old_cap + old_cap).clamp(bytes.len(), u32::MAX as usize);
188        if new_cap <= RESIZE_LIMIT {
189            let buffer_idx = buffer_idx_offset + num_buffers as u32 - 1;
190            let buf = buffers.last_mut().unwrap();
191            let offset = buf.len() as u32;
192            buf.reserve(new_cap - buf.len());
193            buf.extend_from_slice(bytes);
194            View::new_noninline_unchecked(bytes, buffer_idx, offset)
195        } else {
196            let buffer_idx = buffer_idx_offset + num_buffers as u32;
197            let mut buf = Vec::with_capacity(new_cap);
198            buf.extend_from_slice(bytes);
199            buffers.push(buf);
200            View::new_noninline_unchecked(bytes, buffer_idx, 0)
201        }
202    }
203
204    /// Constructs a byteslice from this view.
205    ///
206    /// # Safety
207    /// Assumes that this view is valid for the given buffers.
208    #[inline]
209    pub unsafe fn get_slice_unchecked<'a, B: AsRef<[u8]>>(&'a self, buffers: &'a [B]) -> &'a [u8] {
210        unsafe {
211            if self.length <= Self::MAX_INLINE_SIZE {
212                self.get_inlined_slice_unchecked()
213            } else {
214                self.get_external_slice_unchecked(buffers)
215            }
216        }
217    }
218
219    /// Construct a byte slice from an inline view, if it is inline.
220    #[inline]
221    pub fn get_inlined_slice(&self) -> Option<&[u8]> {
222        if self.length <= Self::MAX_INLINE_SIZE {
223            unsafe { Some(self.get_inlined_slice_unchecked()) }
224        } else {
225            None
226        }
227    }
228
229    /// Construct a byte slice from an inline view.
230    ///
231    /// # Safety
232    /// Assumes that this view is inlinable.
233    #[inline]
234    pub unsafe fn get_inlined_slice_unchecked(&self) -> &[u8] {
235        debug_assert!(self.length <= View::MAX_INLINE_SIZE);
236        let ptr = self as *const View as *const u8;
237        unsafe { std::slice::from_raw_parts(ptr.add(4), self.length as usize) }
238    }
239
240    /// Construct a byte slice from an external view.
241    ///
242    /// # Safety
243    /// Assumes that this view is in the external buffers.
244    #[inline]
245    pub unsafe fn get_external_slice_unchecked<'a, B: AsRef<[u8]>>(
246        &self,
247        buffers: &'a [B],
248    ) -> &'a [u8] {
249        debug_assert!(self.length > View::MAX_INLINE_SIZE);
250        let data = buffers.get_unchecked(self.buffer_idx as usize);
251        let offset = self.offset as usize;
252        data.as_ref()
253            .get_unchecked(offset..offset + self.length as usize)
254    }
255
256    /// Extend a `Vec<View>` with inline views slices of `src` with `width`.
257    ///
258    /// This tries to use SIMD to optimize the copying and can be massively faster than doing a
259    /// `views.extend(src.chunks_exact(width).map(View::new_inline))`.
260    ///
261    /// # Panics
262    ///
263    /// This function panics if `src.len()` is not divisible by `width`, `width >
264    /// View::MAX_INLINE_SIZE` or `width == 0`.
265    pub fn extend_with_inlinable_strided(views: &mut Vec<Self>, src: &[u8], width: u8) {
266        macro_rules! dispatch {
267            ($n:ident = $match:ident in [$($v:literal),+ $(,)?] => $block:block, otherwise = $otherwise:expr) => {
268                match $match {
269                    $(
270                        $v => {
271                            const $n: usize = $v;
272
273                            $block
274                        }
275                    )+
276                    _ => $otherwise,
277                }
278            }
279        }
280
281        let width = width as usize;
282
283        assert!(width > 0);
284        assert!(width <= View::MAX_INLINE_SIZE as usize);
285
286        assert_eq!(src.len() % width, 0);
287
288        let num_values = src.len() / width;
289
290        views.reserve(num_values);
291
292        #[allow(unused_mut)]
293        let mut src = src;
294
295        dispatch! {
296            N = width in [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12] => {
297                #[cfg(feature = "simd")]
298                {
299                    macro_rules! repeat_with {
300                        ($i:ident = [$($v:literal),+ $(,)?] => $block:block) => {
301                            $({
302                                const $i: usize = $v;
303
304                                $block
305                            })+
306                        }
307                    }
308
309                    use std::simd::*;
310
311                    // SAFETY: This is always allowed, since views.len() is always in the Vec
312                    // buffer.
313                    let mut dst = unsafe { views.as_mut_ptr().add(views.len()).cast::<u8>() };
314
315                    let length_mask = u8x16::from_array([N as u8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
316
317                    const BLOCKS_PER_LOAD: usize = 16 / N;
318                    const BYTES_PER_LOOP: usize = N * BLOCKS_PER_LOAD;
319
320                    let num_loops = (src.len() / BYTES_PER_LOOP).saturating_sub(1);
321
322                    for _ in 0..num_loops {
323                        // SAFETY: The num_loops calculates how many times we can do this.
324                        let loaded = u8x16::from_array(unsafe {
325                            src.get_unchecked(..16).try_into().unwrap()
326                        });
327                        src = unsafe { src.get_unchecked(BYTES_PER_LOOP..) };
328
329                        // This way we can reuse the same load for multiple views.
330                        repeat_with!(
331                            I = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] => {
332                                if I < BLOCKS_PER_LOAD {
333                                    let zero = u8x16::default();
334                                    const SWIZZLE: [usize; 16] = const {
335                                        let mut swizzle = [16usize; 16];
336
337                                        let mut i = 0;
338                                        while i < N {
339                                            let idx = i + I * N;
340                                            if idx < 16 {
341                                                swizzle[4+i] = idx;
342                                            }
343                                            i += 1;
344                                        }
345
346                                        swizzle
347                                    };
348
349                                    let scattered = simd_swizzle!(loaded, zero, SWIZZLE);
350                                    let view_bytes = (scattered | length_mask).to_array();
351
352                                    // SAFETY: dst has the capacity reserved and view_bytes is 16
353                                    // bytes long.
354                                    unsafe {
355                                        core::ptr::copy_nonoverlapping(view_bytes.as_ptr(), dst, 16);
356                                        dst = dst.add(16);
357                                    }
358                                }
359                            }
360                        );
361                    }
362
363                    unsafe {
364                        views.set_len(views.len() + num_loops * BLOCKS_PER_LOAD);
365                    }
366                }
367
368                views.extend(src.chunks_exact(N).map(|slice| unsafe {
369                    View::new_inline_unchecked(slice)
370                }));
371            },
372            otherwise = unreachable!()
373        }
374    }
375}
376
377impl IsNull for View {
378    const HAS_NULLS: bool = false;
379    type Inner = Self;
380
381    fn is_null(&self) -> bool {
382        false
383    }
384
385    fn unwrap_inner(self) -> Self::Inner {
386        self
387    }
388}
389
390impl Display for View {
391    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
392        write!(f, "{self:?}")
393    }
394}
395
396unsafe impl Zeroable for View {}
397
398unsafe impl Pod for View {}
399
400impl PartialEq for View {
401    fn eq(&self, other: &Self) -> bool {
402        self.as_u128() == other.as_u128()
403    }
404}
405
406// These are 'implemented' because we want to implement NativeType
407// for View, that should probably not be done.
408impl TotalOrd for View {
409    fn tot_cmp(&self, _other: &Self) -> Ordering {
410        unimplemented!()
411    }
412}
413
414impl TotalEq for View {
415    fn tot_eq(&self, other: &Self) -> bool {
416        self.eq(other)
417    }
418}
419
420impl MinMax for View {
421    fn nan_min_lt(&self, _other: &Self) -> bool {
422        unimplemented!()
423    }
424
425    fn nan_max_lt(&self, _other: &Self) -> bool {
426        unimplemented!()
427    }
428}
429
430impl NativeType for View {
431    const PRIMITIVE: PrimitiveType = PrimitiveType::UInt128;
432
433    type Bytes = [u8; 16];
434    type AlignedBytes = Bytes16Alignment4;
435
436    #[inline]
437    fn to_le_bytes(&self) -> Self::Bytes {
438        self.as_u128().to_le_bytes()
439    }
440
441    #[inline]
442    fn to_be_bytes(&self) -> Self::Bytes {
443        self.as_u128().to_be_bytes()
444    }
445
446    #[inline]
447    fn from_le_bytes(bytes: Self::Bytes) -> Self {
448        Self::from(u128::from_le_bytes(bytes))
449    }
450
451    #[inline]
452    fn from_be_bytes(bytes: Self::Bytes) -> Self {
453        Self::from(u128::from_be_bytes(bytes))
454    }
455}
456
457impl From<u128> for View {
458    #[inline]
459    fn from(value: u128) -> Self {
460        unsafe { std::mem::transmute(value) }
461    }
462}
463
464impl From<View> for u128 {
465    #[inline]
466    fn from(value: View) -> Self {
467        value.as_u128()
468    }
469}
470
471pub fn validate_views<B: AsRef<[u8]>, F>(
472    views: &[View],
473    buffers: &[B],
474    validate_bytes: F,
475) -> PolarsResult<()>
476where
477    F: Fn(&[u8]) -> PolarsResult<()>,
478{
479    for view in views {
480        if let Some(inline_slice) = view.get_inlined_slice() {
481            if view.length < View::MAX_INLINE_SIZE && view.as_u128() >> (32 + view.length * 8) != 0
482            {
483                polars_bail!(ComputeError: "view contained non-zero padding in prefix");
484            }
485
486            validate_bytes(inline_slice)?;
487        } else {
488            let data = buffers.get(view.buffer_idx as usize).ok_or_else(|| {
489                polars_err!(OutOfBounds: "view index out of bounds\n\nGot: {} buffers and index: {}", buffers.len(), view.buffer_idx)
490            })?;
491
492            let start = view.offset as usize;
493            let end = start + view.length as usize;
494            let b = data
495                .as_ref()
496                .get(start..end)
497                .ok_or_else(|| polars_err!(OutOfBounds: "buffer slice out of bounds"))?;
498
499            polars_ensure!(b.starts_with(&view.prefix.to_le_bytes()), ComputeError: "prefix does not match string data");
500            validate_bytes(b)?;
501        }
502    }
503
504    Ok(())
505}
506
507pub fn validate_binary_views<B: AsRef<[u8]>>(views: &[View], buffers: &[B]) -> PolarsResult<()> {
508    validate_views(views, buffers, |_| Ok(()))
509}
510
511fn validate_utf8(b: &[u8]) -> PolarsResult<()> {
512    match simdutf8::basic::from_utf8(b) {
513        Ok(_) => Ok(()),
514        Err(_) => Err(polars_err!(ComputeError: "invalid utf8")),
515    }
516}
517
518pub fn validate_utf8_views<B: AsRef<[u8]>>(views: &[View], buffers: &[B]) -> PolarsResult<()> {
519    validate_views(views, buffers, validate_utf8)
520}
521
522/// Checks the views for valid UTF-8. Assumes the first num_trusted_buffers are
523/// valid UTF-8 without checking.
524/// # Safety
525/// The views and buffers must uphold the invariants of BinaryView otherwise we will go OOB.
526pub unsafe fn validate_views_utf8_only<B: AsRef<[u8]>>(
527    views: &[View],
528    buffers: &[B],
529    mut num_trusted_buffers: usize,
530) -> PolarsResult<()> {
531    unsafe {
532        while num_trusted_buffers < buffers.len()
533            && buffers[num_trusted_buffers].as_ref().is_ascii()
534        {
535            num_trusted_buffers += 1;
536        }
537
538        // Fast path if all buffers are ASCII (or there are no buffers).
539        if num_trusted_buffers >= buffers.len() {
540            for view in views {
541                if let Some(inlined_slice) = view.get_inlined_slice() {
542                    validate_utf8(inlined_slice)?;
543                }
544            }
545        } else {
546            for view in views {
547                if view.length <= View::MAX_INLINE_SIZE
548                    || view.buffer_idx as usize >= num_trusted_buffers
549                {
550                    validate_utf8(view.get_slice_unchecked(buffers))?;
551                }
552            }
553        }
554
555        Ok(())
556    }
557}