// polars_arrow/array/binview/view.rs
use std::cmp::Ordering;
use std::fmt::{self, Display, Formatter};

use bytemuck::{Pod, Zeroable};
use polars_error::*;
use polars_utils::min_max::MinMax;
use polars_utils::nulls::IsNull;
use polars_utils::total_ord::{TotalEq, TotalOrd};

use crate::datatypes::PrimitiveType;
use crate::types::{Bytes16Alignment4, NativeType};
/// A 16-byte view into variable-length binary/string data, as used by
/// binary-view arrays.
///
/// Values of at most [`View::MAX_INLINE_SIZE`] bytes are stored inline,
/// directly after `length`; longer values store a 4-byte prefix plus the
/// location of the full data in an external buffer.
#[derive(Copy, Clone, Default)]
#[repr(C)]
pub struct View {
    /// The length of the value in bytes.
    pub length: u32,
    /// The first 4 bytes of the value, little-endian. For inline views this
    /// field is the start of the inline data itself (the inline bytes are
    /// written starting at byte offset 4 of the view).
    pub prefix: u32,
    /// Index of the buffer holding the data (non-inline views only).
    pub buffer_idx: u32,
    /// Byte offset of the data inside that buffer (non-inline views only).
    pub offset: u32,
}
30
31impl fmt::Debug for View {
32 fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result {
33 if self.length <= Self::MAX_INLINE_SIZE {
34 fmt.debug_struct("View")
35 .field("length", &self.length)
36 .field("content", &unsafe {
37 std::slice::from_raw_parts(
38 (self as *const _ as *const u8).add(4),
39 self.length as usize,
40 )
41 })
42 .finish()
43 } else {
44 fmt.debug_struct("View")
45 .field("length", &self.length)
46 .field("prefix", &self.prefix.to_be_bytes())
47 .field("buffer_idx", &self.buffer_idx)
48 .field("offset", &self.offset)
49 .finish()
50 }
51 }
52}
53
impl View {
    /// The maximum number of bytes a value may have and still be stored
    /// entirely inline in the view (i.e. without an external buffer).
    pub const MAX_INLINE_SIZE: u32 = 12;

    /// Returns `true` if the value is stored inline inside the view itself.
    #[inline(always)]
    pub fn is_inline(&self) -> bool {
        self.length <= Self::MAX_INLINE_SIZE
    }

    /// Reinterprets the view as its raw 128-bit representation.
    #[inline(always)]
    pub fn as_u128(self) -> u128 {
        // SAFETY: `View` is `#[repr(C)]` with four `u32` fields, so it is
        // exactly 16 bytes with no padding.
        unsafe { std::mem::transmute(self) }
    }

    /// Creates a new inline view without checking the length.
    ///
    /// # Safety
    /// `bytes.len()` must not exceed [`View::MAX_INLINE_SIZE`].
    #[inline]
    pub unsafe fn new_inline_unchecked(bytes: &[u8]) -> Self {
        debug_assert!(bytes.len() <= u32::MAX as usize);
        debug_assert!(bytes.len() as u32 <= Self::MAX_INLINE_SIZE);

        let mut view = Self {
            length: bytes.len() as u32,
            ..Default::default()
        };

        let view_ptr = &mut view as *mut _ as *mut u8;

        // SAFETY: the inline data starts directly after the 4-byte `length`
        // field, and the caller guarantees `bytes` fits in the remaining
        // 12 bytes of the 16-byte view.
        unsafe {
            let inline_data_ptr = view_ptr.add(4);
            core::ptr::copy_nonoverlapping(bytes.as_ptr(), inline_data_ptr, bytes.len());
        }
        view
    }

    /// Creates a new inline view.
    ///
    /// # Panics
    /// Panics if `bytes.len()` exceeds [`View::MAX_INLINE_SIZE`].
    #[inline]
    pub fn new_inline(bytes: &[u8]) -> Self {
        assert!(bytes.len() as u32 <= Self::MAX_INLINE_SIZE);
        // SAFETY: length checked by the assert above.
        unsafe { Self::new_inline_unchecked(bytes) }
    }

    /// Creates a new non-inline view for `bytes`, recorded as living at
    /// `offset` within the buffer at `buffer_idx`, without checking lengths.
    ///
    /// # Safety
    /// `bytes.len()` must be larger than [`View::MAX_INLINE_SIZE`] and at
    /// most `u32::MAX`. (The former also guarantees the 4-byte prefix read
    /// below is in bounds.)
    #[inline]
    pub unsafe fn new_noninline_unchecked(bytes: &[u8], buffer_idx: u32, offset: u32) -> Self {
        debug_assert!(bytes.len() <= u32::MAX as usize);
        debug_assert!(bytes.len() as u32 > View::MAX_INLINE_SIZE);

        // SAFETY: `bytes.len() > MAX_INLINE_SIZE >= 4`, so the 4-byte
        // conversion cannot fail.
        let prefix = unsafe { u32::from_le_bytes(bytes[0..4].try_into().unwrap_unchecked()) };
        Self {
            length: bytes.len() as u32,
            prefix,
            buffer_idx,
            offset,
        }
    }

    /// Creates a view for `bytes`, inlining the data when it is short enough
    /// and otherwise recording `buffer_idx`/`offset` as its location.
    #[inline]
    pub fn new_from_bytes(bytes: &[u8], buffer_idx: u32, offset: u32) -> Self {
        assert!(bytes.len() <= u32::MAX as usize);

        // SAFETY: the length precondition of each constructor is established
        // by the branch it is called from.
        unsafe {
            if bytes.len() as u32 <= Self::MAX_INLINE_SIZE {
                Self::new_inline_unchecked(bytes)
            } else {
                Self::new_noninline_unchecked(bytes, buffer_idx, offset)
            }
        }
    }

    /// Creates a view for `bytes`, appending the data to `buffers` when it
    /// cannot be inlined.
    ///
    /// `buffer_idx_offset` is added when computing the view's `buffer_idx`,
    /// for when `buffers` will be appended after pre-existing buffers.
    #[inline]
    pub fn new_with_buffers(
        bytes: &[u8],
        buffer_idx_offset: u32,
        buffers: &mut Vec<Vec<u8>>,
    ) -> Self {
        // SAFETY: length preconditions of the unchecked constructors are
        // established by the branches below.
        unsafe {
            if bytes.len() <= Self::MAX_INLINE_SIZE as usize {
                Self::new_inline_unchecked(bytes)
            } else {
                assert!(bytes.len() <= u32::MAX as usize);
                let num_buffers = buffers.len();
                // Fast path: append to the last buffer when it can hold the
                // bytes without reallocating.
                if let Some(buf) = buffers.last_mut() {
                    if buf.len() + bytes.len() <= buf.capacity() {
                        let buf_idx = buffer_idx_offset + num_buffers as u32 - 1;
                        // Capture the offset before extending the buffer.
                        let offset = buf.len() as u32;
                        buf.extend_from_slice(bytes);
                        return Self::new_noninline_unchecked(bytes, buf_idx, offset);
                    }
                }

                Self::new_with_buffers_slow(bytes, buffer_idx_offset, buffers)
            }
        }
    }

    /// Slow path of [`View::new_with_buffers`]: grows the last buffer in
    /// place or starts a new one.
    ///
    /// # Safety
    /// `bytes.len()` must be larger than [`View::MAX_INLINE_SIZE`] and at
    /// most `u32::MAX` (both ensured by the caller).
    #[cold]
    unsafe fn new_with_buffers_slow(
        bytes: &[u8],
        buffer_idx_offset: u32,
        buffers: &mut Vec<Vec<u8>>,
    ) -> Self {
        // Up to this capacity we keep doubling the last buffer in place;
        // beyond it we start a fresh buffer instead.
        const RESIZE_LIMIT: usize = 8192;

        if buffers.is_empty() {
            buffers.push(bytes.to_vec());
            return View::new_noninline_unchecked(bytes, buffer_idx_offset, 0);
        }

        let num_buffers = buffers.len();
        let old_cap = buffers.last().unwrap().capacity();
        // Double the capacity, but at least `bytes.len()` and at most the
        // range addressable by a `u32` offset.
        let new_cap = (old_cap + old_cap).clamp(bytes.len(), u32::MAX as usize);
        if new_cap <= RESIZE_LIMIT {
            // Still small: grow the last buffer and append there.
            let buffer_idx = buffer_idx_offset + num_buffers as u32 - 1;
            let buf = buffers.last_mut().unwrap();
            let offset = buf.len() as u32;
            buf.reserve(new_cap - buf.len());
            buf.extend_from_slice(bytes);
            View::new_noninline_unchecked(bytes, buffer_idx, offset)
        } else {
            // Large: open a new buffer so existing ones are left untouched.
            let buffer_idx = buffer_idx_offset + num_buffers as u32;
            let mut buf = Vec::with_capacity(new_cap);
            buf.extend_from_slice(bytes);
            buffers.push(buf);
            View::new_noninline_unchecked(bytes, buffer_idx, 0)
        }
    }

    /// Returns the bytes of this view, whether stored inline or in one of
    /// `buffers`.
    ///
    /// # Safety
    /// The view must be valid for `buffers`: when not inline, `buffer_idx`
    /// and `offset..offset + length` must be in bounds.
    #[inline]
    pub unsafe fn get_slice_unchecked<'a, B: AsRef<[u8]>>(&'a self, buffers: &'a [B]) -> &'a [u8] {
        // SAFETY: upheld by the caller; each helper's precondition matches
        // its branch.
        unsafe {
            if self.length <= Self::MAX_INLINE_SIZE {
                self.get_inlined_slice_unchecked()
            } else {
                self.get_external_slice_unchecked(buffers)
            }
        }
    }

    /// Returns the inline bytes of this view, or `None` if the data is not
    /// stored inline.
    #[inline]
    pub fn get_inlined_slice(&self) -> Option<&[u8]> {
        if self.length <= Self::MAX_INLINE_SIZE {
            // SAFETY: we just verified the view is inline.
            unsafe { Some(self.get_inlined_slice_unchecked()) }
        } else {
            None
        }
    }

    /// Returns the inline bytes of this view.
    ///
    /// # Safety
    /// The view's `length` must be at most [`View::MAX_INLINE_SIZE`].
    #[inline]
    pub unsafe fn get_inlined_slice_unchecked(&self) -> &[u8] {
        debug_assert!(self.length <= View::MAX_INLINE_SIZE);
        let ptr = self as *const View as *const u8;
        // SAFETY: the inline data lives directly after the 4-byte `length`
        // field, and `length <= 12` keeps the slice within the 16-byte view.
        unsafe { std::slice::from_raw_parts(ptr.add(4), self.length as usize) }
    }

    /// Returns the bytes this non-inline view refers to inside `buffers`.
    ///
    /// # Safety
    /// The view must not be inline, `buffer_idx` must be in bounds for
    /// `buffers`, and `offset..offset + length` must be in bounds for that
    /// buffer.
    #[inline]
    pub unsafe fn get_external_slice_unchecked<'a, B: AsRef<[u8]>>(
        &self,
        buffers: &'a [B],
    ) -> &'a [u8] {
        debug_assert!(self.length > View::MAX_INLINE_SIZE);
        // SAFETY: all bounds are guaranteed by the caller.
        let data = buffers.get_unchecked(self.buffer_idx as usize);
        let offset = self.offset as usize;
        data.as_ref()
            .get_unchecked(offset..offset + self.length as usize)
    }

    /// Extends `views` with inline views built from consecutive
    /// `width`-sized chunks of `src`.
    ///
    /// # Panics
    /// Panics if `width` is zero, exceeds [`View::MAX_INLINE_SIZE`], or does
    /// not evenly divide `src.len()`.
    pub fn extend_with_inlinable_strided(views: &mut Vec<Self>, src: &[u8], width: u8) {
        // Monomorphizes the body per permitted width so `N` is a
        // compile-time constant inside the block.
        macro_rules! dispatch {
            ($n:ident = $match:ident in [$($v:literal),+ $(,)?] => $block:block, otherwise = $otherwise:expr) => {
                match $match {
                    $(
                        $v => {
                            const $n: usize = $v;

                            $block
                        }
                    )+
                    _ => $otherwise,
                }
            }
        }

        let width = width as usize;

        assert!(width > 0);
        assert!(width <= View::MAX_INLINE_SIZE as usize);

        assert_eq!(src.len() % width, 0);

        let num_values = src.len() / width;

        views.reserve(num_values);

        #[allow(unused_mut)]
        let mut src = src;

        dispatch! {
            N = width in [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12] => {
                #[cfg(feature = "simd")]
                {
                    // Expands `$block` once per listed constant `$i`.
                    macro_rules! repeat_with {
                        ($i:ident = [$($v:literal),+ $(,)?] => $block:block) => {
                            $({
                                const $i: usize = $v;

                                $block
                            })+
                        }
                    }

                    use std::simd::*;

                    // Raw write cursor into the reserved (but still
                    // uninitialized) tail of `views`.
                    let mut dst = unsafe { views.as_mut_ptr().add(views.len()).cast::<u8>() };

                    // OR-ing this in sets byte 0 (the low byte of `length`)
                    // of each produced view to `N`.
                    let length_mask = u8x16::from_array([N as u8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);

                    const BLOCKS_PER_LOAD: usize = 16 / N;
                    const BYTES_PER_LOOP: usize = N * BLOCKS_PER_LOAD;

                    // Each iteration loads 16 bytes but only consumes
                    // BYTES_PER_LOOP (<= 16); subtracting one loop keeps the
                    // 16-byte load from reading past the end of `src`. The
                    // remainder is handled by the scalar tail below.
                    let num_loops = (src.len() / BYTES_PER_LOOP).saturating_sub(1);

                    for _ in 0..num_loops {
                        // SAFETY: `num_loops` guarantees at least 16 readable
                        // bytes remain at this point.
                        let loaded = u8x16::from_array(unsafe {
                            src.get_unchecked(..16).try_into().unwrap()
                        });
                        src = unsafe { src.get_unchecked(BYTES_PER_LOOP..) };

                        repeat_with!(
                            I = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] => {
                                if I < BLOCKS_PER_LOAD {
                                    let zero = u8x16::default();
                                    // Swizzle that moves source block `I`
                                    // (N bytes) to byte offset 4 — the inline
                                    // data position — and zeroes everything
                                    // else (index 16 selects from `zero`).
                                    const SWIZZLE: [usize; 16] = const {
                                        let mut swizzle = [16usize; 16];

                                        let mut i = 0;
                                        while i < N {
                                            let idx = i + I * N;
                                            if idx < 16 {
                                                swizzle[4+i] = idx;
                                            }
                                            i += 1;
                                        }

                                        swizzle
                                    };

                                    let scattered = simd_swizzle!(loaded, zero, SWIZZLE);
                                    let view_bytes = (scattered | length_mask).to_array();

                                    // SAFETY: the `views.reserve(num_values)`
                                    // above guarantees capacity for every
                                    // 16-byte view written through `dst`.
                                    unsafe {
                                        core::ptr::copy_nonoverlapping(view_bytes.as_ptr(), dst, 16);
                                        dst = dst.add(16);
                                    }
                                }
                            }
                        );
                    }

                    // SAFETY: exactly this many views were initialized above.
                    unsafe {
                        views.set_len(views.len() + num_loops * BLOCKS_PER_LOAD);
                    }
                }

                // Scalar tail (and the entire input when the `simd` feature
                // is disabled).
                views.extend(src.chunks_exact(N).map(|slice| unsafe {
                    View::new_inline_unchecked(slice)
                }));
            },
            otherwise = unreachable!()
        }
    }
}
376
impl IsNull for View {
    // A plain `View` value cannot represent a null by itself; validity is
    // tracked externally — presumably by the array's validity bitmap
    // (NOTE(review): inferred from `HAS_NULLS = false`; confirm with users
    // of `IsNull`).
    const HAS_NULLS: bool = false;
    type Inner = Self;

    /// Always `false`: views carry no null state.
    fn is_null(&self) -> bool {
        false
    }

    /// The inner value is the view itself; nothing to unwrap.
    fn unwrap_inner(self) -> Self::Inner {
        self
    }
}
389
390impl Display for View {
391 fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
392 write!(f, "{self:?}")
393 }
394}
395
// SAFETY: `View` is `#[repr(C)]` with four `u32` fields, so the all-zero bit
// pattern is a valid value (an empty inline view).
unsafe impl Zeroable for View {}

// SAFETY: `View` is `#[repr(C)]`, `Copy`, 16 bytes of plain `u32` fields with
// no padding and no pointers, so any bit pattern is a valid `View`.
unsafe impl Pod for View {}
399
impl PartialEq for View {
    /// Bitwise equality over the raw 128-bit representation.
    ///
    /// Note this is *not* value equality of the referenced bytes: two views
    /// denoting equal data but stored differently (e.g. in different
    /// buffers, or inline vs. external) compare as unequal.
    fn eq(&self, other: &Self) -> bool {
        self.as_u128() == other.as_u128()
    }
}
405
impl TotalOrd for View {
    /// Always panics: no total order is defined for raw views.
    fn tot_cmp(&self, _other: &Self) -> Ordering {
        unimplemented!()
    }
}
413
414impl TotalEq for View {
415 fn tot_eq(&self, other: &Self) -> bool {
416 self.eq(other)
417 }
418}
419
impl MinMax for View {
    /// Always panics: min/max comparison is not defined for raw views.
    fn nan_min_lt(&self, _other: &Self) -> bool {
        unimplemented!()
    }

    /// Always panics: min/max comparison is not defined for raw views.
    fn nan_max_lt(&self, _other: &Self) -> bool {
        unimplemented!()
    }
}
429
/// Lets `View` be stored in primitive buffers by treating it as a 128-bit
/// value.
impl NativeType for View {
    const PRIMITIVE: PrimitiveType = PrimitiveType::UInt128;

    type Bytes = [u8; 16];
    // Only 4-byte alignment is required: every field of `View` is a `u32`.
    type AlignedBytes = Bytes16Alignment4;

    #[inline]
    fn to_le_bytes(&self) -> Self::Bytes {
        self.as_u128().to_le_bytes()
    }

    #[inline]
    fn to_be_bytes(&self) -> Self::Bytes {
        self.as_u128().to_be_bytes()
    }

    #[inline]
    fn from_le_bytes(bytes: Self::Bytes) -> Self {
        Self::from(u128::from_le_bytes(bytes))
    }

    #[inline]
    fn from_be_bytes(bytes: Self::Bytes) -> Self {
        Self::from(u128::from_be_bytes(bytes))
    }
}
456
impl From<u128> for View {
    #[inline]
    fn from(value: u128) -> Self {
        // SAFETY: `View` is `#[repr(C)]`, exactly 16 bytes without padding,
        // and (being `Pod`) every bit pattern is a structurally valid view.
        unsafe { std::mem::transmute(value) }
    }
}
463
impl From<View> for u128 {
    #[inline]
    fn from(value: View) -> Self {
        // Delegates to the raw 128-bit representation.
        value.as_u128()
    }
}
470
471pub fn validate_views<B: AsRef<[u8]>, F>(
472 views: &[View],
473 buffers: &[B],
474 validate_bytes: F,
475) -> PolarsResult<()>
476where
477 F: Fn(&[u8]) -> PolarsResult<()>,
478{
479 for view in views {
480 if let Some(inline_slice) = view.get_inlined_slice() {
481 if view.length < View::MAX_INLINE_SIZE && view.as_u128() >> (32 + view.length * 8) != 0
482 {
483 polars_bail!(ComputeError: "view contained non-zero padding in prefix");
484 }
485
486 validate_bytes(inline_slice)?;
487 } else {
488 let data = buffers.get(view.buffer_idx as usize).ok_or_else(|| {
489 polars_err!(OutOfBounds: "view index out of bounds\n\nGot: {} buffers and index: {}", buffers.len(), view.buffer_idx)
490 })?;
491
492 let start = view.offset as usize;
493 let end = start + view.length as usize;
494 let b = data
495 .as_ref()
496 .get(start..end)
497 .ok_or_else(|| polars_err!(OutOfBounds: "buffer slice out of bounds"))?;
498
499 polars_ensure!(b.starts_with(&view.prefix.to_le_bytes()), ComputeError: "prefix does not match string data");
500 validate_bytes(b)?;
501 }
502 }
503
504 Ok(())
505}
506
507pub fn validate_binary_views<B: AsRef<[u8]>>(views: &[View], buffers: &[B]) -> PolarsResult<()> {
508 validate_views(views, buffers, |_| Ok(()))
509}
510
511fn validate_utf8(b: &[u8]) -> PolarsResult<()> {
512 match simdutf8::basic::from_utf8(b) {
513 Ok(_) => Ok(()),
514 Err(_) => Err(polars_err!(ComputeError: "invalid utf8")),
515 }
516}
517
/// Validates view structure and checks that every referenced value is valid
/// UTF-8.
pub fn validate_utf8_views<B: AsRef<[u8]>>(views: &[View], buffers: &[B]) -> PolarsResult<()> {
    validate_views(views, buffers, validate_utf8)
}
521
/// Validates that all view values are valid UTF-8, skipping data in buffers
/// already known to be valid.
///
/// # Safety
/// Every view must be in bounds for `buffers` (this function slices them
/// unchecked). The caller must also guarantee that any view into one of the
/// first `num_trusted_buffers` buffers refers to valid UTF-8 data.
pub unsafe fn validate_views_utf8_only<B: AsRef<[u8]>>(
    views: &[View],
    buffers: &[B],
    mut num_trusted_buffers: usize,
) -> PolarsResult<()> {
    unsafe {
        // Extend the trusted region: an all-ASCII buffer is valid UTF-8 and
        // stays so under any slicing, so views into it need no checking.
        while num_trusted_buffers < buffers.len()
            && buffers[num_trusted_buffers].as_ref().is_ascii()
        {
            num_trusted_buffers += 1;
        }

        if num_trusted_buffers >= buffers.len() {
            // All buffers trusted: only inline views still need validation,
            // since their bytes live in the view itself, not in a buffer.
            for view in views {
                if let Some(inlined_slice) = view.get_inlined_slice() {
                    validate_utf8(inlined_slice)?;
                }
            }
        } else {
            // Validate inline views plus any view into an untrusted buffer.
            for view in views {
                if view.length <= View::MAX_INLINE_SIZE
                    || view.buffer_idx as usize >= num_trusted_buffers
                {
                    validate_utf8(view.get_slice_unchecked(buffers))?;
                }
            }
        }

        Ok(())
    }
}