polars_arrow/array/binview/
view.rs1use std::cmp::Ordering;
2use std::fmt::{self, Display, Formatter};
3
4use bytemuck::{Pod, Zeroable};
5use polars_error::*;
6use polars_utils::min_max::MinMax;
7use polars_utils::nulls::IsNull;
8use polars_utils::total_ord::{TotalEq, TotalOrd};
9
10use crate::datatypes::PrimitiveType;
11use crate::types::{Bytes16Alignment4, NativeType};
12
13#[derive(Copy, Clone, Default)]
19#[repr(C)]
20pub struct View {
21 pub length: u32,
23 pub prefix: u32,
25 pub buffer_idx: u32,
27 pub offset: u32,
29}
30
31impl fmt::Debug for View {
32 fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result {
33 if self.length <= Self::MAX_INLINE_SIZE {
34 fmt.debug_struct("View")
35 .field("length", &self.length)
36 .field("content", &unsafe {
37 std::slice::from_raw_parts(
38 (self as *const _ as *const u8).add(4),
39 self.length as usize,
40 )
41 })
42 .finish()
43 } else {
44 fmt.debug_struct("View")
45 .field("length", &self.length)
46 .field("prefix", &self.prefix.to_be_bytes())
47 .field("buffer_idx", &self.buffer_idx)
48 .field("offset", &self.offset)
49 .finish()
50 }
51 }
52}
53
54impl View {
55 pub const MAX_INLINE_SIZE: u32 = 12;
56
57 #[inline(always)]
58 pub fn is_inline(&self) -> bool {
59 self.length <= Self::MAX_INLINE_SIZE
60 }
61
62 #[inline(always)]
63 pub fn as_u128(self) -> u128 {
64 unsafe { std::mem::transmute(self) }
65 }
66
67 #[inline]
73 pub unsafe fn new_inline_unchecked(bytes: &[u8]) -> Self {
74 debug_assert!(bytes.len() <= u32::MAX as usize);
75 debug_assert!(bytes.len() as u32 <= Self::MAX_INLINE_SIZE);
76
77 let mut view = Self {
78 length: bytes.len() as u32,
79 ..Default::default()
80 };
81
82 let view_ptr = &mut view as *mut _ as *mut u8;
83
84 unsafe {
91 let inline_data_ptr = view_ptr.add(4);
92 core::ptr::copy_nonoverlapping(bytes.as_ptr(), inline_data_ptr, bytes.len());
93 }
94 view
95 }
96
97 #[inline]
103 pub fn new_inline(bytes: &[u8]) -> Self {
104 assert!(bytes.len() as u32 <= Self::MAX_INLINE_SIZE);
105 unsafe { Self::new_inline_unchecked(bytes) }
106 }
107
108 #[inline]
114 pub unsafe fn new_noninline_unchecked(bytes: &[u8], buffer_idx: u32, offset: u32) -> Self {
115 debug_assert!(bytes.len() <= u32::MAX as usize);
116 debug_assert!(bytes.len() as u32 > View::MAX_INLINE_SIZE);
117
118 let prefix = unsafe { u32::from_le_bytes(bytes[0..4].try_into().unwrap_unchecked()) };
120 Self {
121 length: bytes.len() as u32,
122 prefix,
123 buffer_idx,
124 offset,
125 }
126 }
127
128 #[inline]
129 pub fn new_from_bytes(bytes: &[u8], buffer_idx: u32, offset: u32) -> Self {
130 debug_assert!(bytes.len() <= u32::MAX as usize);
131
132 unsafe {
134 if bytes.len() as u32 <= Self::MAX_INLINE_SIZE {
135 Self::new_inline_unchecked(bytes)
136 } else {
137 Self::new_noninline_unchecked(bytes, buffer_idx, offset)
138 }
139 }
140 }
141
142 #[inline]
147 pub unsafe fn get_slice_unchecked<'a, B: AsRef<[u8]>>(&'a self, buffers: &'a [B]) -> &'a [u8] {
148 unsafe {
149 if self.length <= Self::MAX_INLINE_SIZE {
150 self.get_inlined_slice_unchecked()
151 } else {
152 self.get_external_slice_unchecked(buffers)
153 }
154 }
155 }
156
157 #[inline]
159 pub fn get_inlined_slice(&self) -> Option<&[u8]> {
160 if self.length <= Self::MAX_INLINE_SIZE {
161 unsafe { Some(self.get_inlined_slice_unchecked()) }
162 } else {
163 None
164 }
165 }
166
167 #[inline]
172 pub unsafe fn get_inlined_slice_unchecked(&self) -> &[u8] {
173 debug_assert!(self.length <= View::MAX_INLINE_SIZE);
174 let ptr = self as *const View as *const u8;
175 unsafe { std::slice::from_raw_parts(ptr.add(4), self.length as usize) }
176 }
177
178 #[inline]
183 pub unsafe fn get_external_slice_unchecked<'a, B: AsRef<[u8]>>(
184 &self,
185 buffers: &'a [B],
186 ) -> &'a [u8] {
187 debug_assert!(self.length > View::MAX_INLINE_SIZE);
188 let data = buffers.get_unchecked(self.buffer_idx as usize);
189 let offset = self.offset as usize;
190 data.as_ref()
191 .get_unchecked(offset..offset + self.length as usize)
192 }
193
194 pub fn extend_with_inlinable_strided(views: &mut Vec<Self>, src: &[u8], width: u8) {
204 macro_rules! dispatch {
205 ($n:ident = $match:ident in [$($v:literal),+ $(,)?] => $block:block, otherwise = $otherwise:expr) => {
206 match $match {
207 $(
208 $v => {
209 const $n: usize = $v;
210
211 $block
212 }
213 )+
214 _ => $otherwise,
215 }
216 }
217 }
218
219 let width = width as usize;
220
221 assert!(width > 0);
222 assert!(width <= View::MAX_INLINE_SIZE as usize);
223
224 assert_eq!(src.len() % width, 0);
225
226 let num_values = src.len() / width;
227
228 views.reserve(num_values);
229
230 #[allow(unused_mut)]
231 let mut src = src;
232
233 dispatch! {
234 N = width in [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12] => {
235 #[cfg(feature = "simd")]
236 {
237 macro_rules! repeat_with {
238 ($i:ident = [$($v:literal),+ $(,)?] => $block:block) => {
239 $({
240 const $i: usize = $v;
241
242 $block
243 })+
244 }
245 }
246
247 use std::simd::*;
248
249 let mut dst = unsafe { views.as_mut_ptr().add(views.len()).cast::<u8>() };
252
253 let length_mask = u8x16::from_array([N as u8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
254
255 const BLOCKS_PER_LOAD: usize = 16 / N;
256 const BYTES_PER_LOOP: usize = N * BLOCKS_PER_LOAD;
257
258 let num_loops = (src.len() / BYTES_PER_LOOP).saturating_sub(1);
259
260 for _ in 0..num_loops {
261 let loaded = u8x16::from_array(unsafe {
263 src.get_unchecked(..16).try_into().unwrap()
264 });
265 src = unsafe { src.get_unchecked(BYTES_PER_LOOP..) };
266
267 repeat_with!(
269 I = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] => {
270 if I < BLOCKS_PER_LOAD {
271 let zero = u8x16::default();
272 const SWIZZLE: [usize; 16] = const {
273 let mut swizzle = [16usize; 16];
274
275 let mut i = 0;
276 while i < N {
277 let idx = i + I * N;
278 if idx < 16 {
279 swizzle[4+i] = idx;
280 }
281 i += 1;
282 }
283
284 swizzle
285 };
286
287 let scattered = simd_swizzle!(loaded, zero, SWIZZLE);
288 let view_bytes = (scattered | length_mask).to_array();
289
290 unsafe {
293 core::ptr::copy_nonoverlapping(view_bytes.as_ptr(), dst, 16);
294 dst = dst.add(16);
295 }
296 }
297 }
298 );
299 }
300
301 unsafe {
302 views.set_len(views.len() + num_loops * BLOCKS_PER_LOAD);
303 }
304 }
305
306 views.extend(src.chunks_exact(N).map(|slice| unsafe {
307 View::new_inline_unchecked(slice)
308 }));
309 },
310 otherwise = unreachable!()
311 }
312 }
313}
314
315impl IsNull for View {
316 const HAS_NULLS: bool = false;
317 type Inner = Self;
318
319 fn is_null(&self) -> bool {
320 false
321 }
322
323 fn unwrap_inner(self) -> Self::Inner {
324 self
325 }
326}
327
328impl Display for View {
329 fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
330 write!(f, "{:?}", self)
331 }
332}
333
334unsafe impl Zeroable for View {}
335
336unsafe impl Pod for View {}
337
338impl PartialEq for View {
339 fn eq(&self, other: &Self) -> bool {
340 self.as_u128() == other.as_u128()
341 }
342}
343
344impl TotalOrd for View {
347 fn tot_cmp(&self, _other: &Self) -> Ordering {
348 unimplemented!()
349 }
350}
351
352impl TotalEq for View {
353 fn tot_eq(&self, other: &Self) -> bool {
354 self.eq(other)
355 }
356}
357
358impl MinMax for View {
359 fn nan_min_lt(&self, _other: &Self) -> bool {
360 unimplemented!()
361 }
362
363 fn nan_max_lt(&self, _other: &Self) -> bool {
364 unimplemented!()
365 }
366}
367
368impl NativeType for View {
369 const PRIMITIVE: PrimitiveType = PrimitiveType::UInt128;
370
371 type Bytes = [u8; 16];
372 type AlignedBytes = Bytes16Alignment4;
373
374 #[inline]
375 fn to_le_bytes(&self) -> Self::Bytes {
376 self.as_u128().to_le_bytes()
377 }
378
379 #[inline]
380 fn to_be_bytes(&self) -> Self::Bytes {
381 self.as_u128().to_be_bytes()
382 }
383
384 #[inline]
385 fn from_le_bytes(bytes: Self::Bytes) -> Self {
386 Self::from(u128::from_le_bytes(bytes))
387 }
388
389 #[inline]
390 fn from_be_bytes(bytes: Self::Bytes) -> Self {
391 Self::from(u128::from_be_bytes(bytes))
392 }
393}
394
395impl From<u128> for View {
396 #[inline]
397 fn from(value: u128) -> Self {
398 unsafe { std::mem::transmute(value) }
399 }
400}
401
402impl From<View> for u128 {
403 #[inline]
404 fn from(value: View) -> Self {
405 value.as_u128()
406 }
407}
408
409pub fn validate_views<B: AsRef<[u8]>, F>(
410 views: &[View],
411 buffers: &[B],
412 validate_bytes: F,
413) -> PolarsResult<()>
414where
415 F: Fn(&[u8]) -> PolarsResult<()>,
416{
417 for view in views {
418 if let Some(inline_slice) = view.get_inlined_slice() {
419 if view.length < View::MAX_INLINE_SIZE && view.as_u128() >> (32 + view.length * 8) != 0
420 {
421 polars_bail!(ComputeError: "view contained non-zero padding in prefix");
422 }
423
424 validate_bytes(inline_slice)?;
425 } else {
426 let data = buffers.get(view.buffer_idx as usize).ok_or_else(|| {
427 polars_err!(OutOfBounds: "view index out of bounds\n\nGot: {} buffers and index: {}", buffers.len(), view.buffer_idx)
428 })?;
429
430 let start = view.offset as usize;
431 let end = start + view.length as usize;
432 let b = data
433 .as_ref()
434 .get(start..end)
435 .ok_or_else(|| polars_err!(OutOfBounds: "buffer slice out of bounds"))?;
436
437 polars_ensure!(b.starts_with(&view.prefix.to_le_bytes()), ComputeError: "prefix does not match string data");
438 validate_bytes(b)?;
439 }
440 }
441
442 Ok(())
443}
444
445pub fn validate_binary_views<B: AsRef<[u8]>>(views: &[View], buffers: &[B]) -> PolarsResult<()> {
446 validate_views(views, buffers, |_| Ok(()))
447}
448
449fn validate_utf8(b: &[u8]) -> PolarsResult<()> {
450 match simdutf8::basic::from_utf8(b) {
451 Ok(_) => Ok(()),
452 Err(_) => Err(polars_err!(ComputeError: "invalid utf8")),
453 }
454}
455
456pub fn validate_utf8_views<B: AsRef<[u8]>>(views: &[View], buffers: &[B]) -> PolarsResult<()> {
457 validate_views(views, buffers, validate_utf8)
458}
459
460pub unsafe fn validate_views_utf8_only<B: AsRef<[u8]>>(
465 views: &[View],
466 buffers: &[B],
467 mut num_trusted_buffers: usize,
468) -> PolarsResult<()> {
469 unsafe {
470 while num_trusted_buffers < buffers.len()
471 && buffers[num_trusted_buffers].as_ref().is_ascii()
472 {
473 num_trusted_buffers += 1;
474 }
475
476 if num_trusted_buffers >= buffers.len() {
478 for view in views {
479 if let Some(inlined_slice) = view.get_inlined_slice() {
480 validate_utf8(inlined_slice)?;
481 }
482 }
483 } else {
484 for view in views {
485 if view.length <= View::MAX_INLINE_SIZE
486 || view.buffer_idx as usize >= num_trusted_buffers
487 {
488 validate_utf8(view.get_slice_unchecked(buffers))?;
489 }
490 }
491 }
492
493 Ok(())
494 }
495}