1#![allow(unsafe_op_in_unsafe_fn)]
2mod builder;
5pub use builder::*;
6mod ffi;
7pub(super) mod fmt;
8mod iterator;
9mod mutable;
10#[cfg(feature = "proptest")]
11pub mod proptest;
12mod view;
13
14use std::any::Any;
15use std::fmt::Debug;
16use std::marker::PhantomData;
17use std::sync::Arc;
18use std::sync::atomic::{AtomicU64, Ordering};
19
20use polars_error::*;
21
22use crate::array::Array;
23use crate::bitmap::Bitmap;
24use crate::buffer::Buffer;
25use crate::datatypes::ArrowDataType;
26
/// Private module hosting the trait that seals [`ViewType`].
mod private {
    /// Marker supertrait of `ViewType`.
    ///
    /// The enclosing module is not public, so code outside this file's module
    /// cannot name this trait and therefore cannot add new `ViewType`
    /// implementations. Every sealed type must be `Send + Sync`.
    pub trait Sealed: Send + Sync {}

    // The only two value types a view array can hold.
    impl Sealed for [u8] {}
    impl Sealed for str {}
}
33pub use iterator::BinaryViewValueIter;
34pub use mutable::MutableBinaryViewArray;
35use polars_utils::aliases::{InitHashMaps, PlHashMap};
36use private::Sealed;
37
38use crate::array::binview::view::{validate_binary_views, validate_views_utf8_only};
39use crate::array::iterator::NonNullValuesIter;
40use crate::bitmap::utils::{BitmapIter, ZipValidity};
41pub type BinaryViewArray = BinaryViewArrayGeneric<[u8]>;
42pub type Utf8ViewArray = BinaryViewArrayGeneric<str>;
43pub use view::{View, validate_utf8_views};
44
45use super::Splitable;
46
47pub type MutablePlString = MutableBinaryViewArray<str>;
48pub type MutablePlBinary = MutableBinaryViewArray<[u8]>;
49
50static BIN_VIEW_TYPE: ArrowDataType = ArrowDataType::BinaryView;
51static UTF8_VIEW_TYPE: ArrowDataType = ArrowDataType::Utf8View;
52
53const DEFAULT_BLOCK_SIZE: usize = 8 * 1024;
55const MAX_EXP_BLOCK_SIZE: usize = 16 * 1024 * 1024;
56
57pub trait ViewType: Sealed + 'static + PartialEq + AsRef<Self> {
58 const IS_UTF8: bool;
59 const DATA_TYPE: ArrowDataType;
60 type Owned: Debug + Clone + Sync + Send + AsRef<Self>;
61
62 unsafe fn from_bytes_unchecked(slice: &[u8]) -> &Self;
65 fn from_bytes(slice: &[u8]) -> Option<&Self>;
66
67 fn to_bytes(&self) -> &[u8];
68
69 #[allow(clippy::wrong_self_convention)]
70 fn into_owned(&self) -> Self::Owned;
71
72 fn dtype() -> &'static ArrowDataType;
73}
74
75impl ViewType for str {
76 const IS_UTF8: bool = true;
77 const DATA_TYPE: ArrowDataType = ArrowDataType::Utf8View;
78 type Owned = String;
79
80 #[inline(always)]
81 unsafe fn from_bytes_unchecked(slice: &[u8]) -> &Self {
82 std::str::from_utf8_unchecked(slice)
83 }
84 #[inline(always)]
85 fn from_bytes(slice: &[u8]) -> Option<&Self> {
86 std::str::from_utf8(slice).ok()
87 }
88
89 #[inline(always)]
90 fn to_bytes(&self) -> &[u8] {
91 self.as_bytes()
92 }
93
94 fn into_owned(&self) -> Self::Owned {
95 self.to_string()
96 }
97 fn dtype() -> &'static ArrowDataType {
98 &UTF8_VIEW_TYPE
99 }
100}
101
102impl ViewType for [u8] {
103 const IS_UTF8: bool = false;
104 const DATA_TYPE: ArrowDataType = ArrowDataType::BinaryView;
105 type Owned = Vec<u8>;
106
107 #[inline(always)]
108 unsafe fn from_bytes_unchecked(slice: &[u8]) -> &Self {
109 slice
110 }
111 #[inline(always)]
112 fn from_bytes(slice: &[u8]) -> Option<&Self> {
113 Some(slice)
114 }
115
116 #[inline(always)]
117 fn to_bytes(&self) -> &[u8] {
118 self
119 }
120
121 fn into_owned(&self) -> Self::Owned {
122 self.to_vec()
123 }
124
125 fn dtype() -> &'static ArrowDataType {
126 &BIN_VIEW_TYPE
127 }
128}
129
130pub struct BinaryViewArrayGeneric<T: ViewType + ?Sized> {
131 dtype: ArrowDataType,
132 views: Buffer<View>,
133 buffers: Arc<[Buffer<u8>]>,
134 validity: Option<Bitmap>,
135 phantom: PhantomData<T>,
136 total_bytes_len: AtomicU64,
138 total_buffer_len: usize,
140}
141
142impl<T: ViewType + ?Sized> PartialEq for BinaryViewArrayGeneric<T> {
143 fn eq(&self, other: &Self) -> bool {
144 self.len() == other.len() && self.into_iter().zip(other).all(|(l, r)| l == r)
145 }
146}
147
148impl<T: ViewType + ?Sized> Clone for BinaryViewArrayGeneric<T> {
149 fn clone(&self) -> Self {
150 Self {
151 dtype: self.dtype.clone(),
152 views: self.views.clone(),
153 buffers: self.buffers.clone(),
154 validity: self.validity.clone(),
155 phantom: Default::default(),
156 total_bytes_len: AtomicU64::new(self.total_bytes_len.load(Ordering::Relaxed)),
157 total_buffer_len: self.total_buffer_len,
158 }
159 }
160}
161
162unsafe impl<T: ViewType + ?Sized> Send for BinaryViewArrayGeneric<T> {}
163unsafe impl<T: ViewType + ?Sized> Sync for BinaryViewArrayGeneric<T> {}
164
/// Sentinel stored in `total_bytes_len` meaning "the sum of all view lengths
/// has not been computed yet".
const UNKNOWN_LEN: u64 = u64::MAX;
166
167impl<T: ViewType + ?Sized> BinaryViewArrayGeneric<T> {
168 pub unsafe fn new_unchecked(
173 dtype: ArrowDataType,
174 views: Buffer<View>,
175 buffers: Arc<[Buffer<u8>]>,
176 validity: Option<Bitmap>,
177 total_bytes_len: usize,
178 total_buffer_len: usize,
179 ) -> Self {
180 #[cfg(debug_assertions)]
182 {
183 if let Some(validity) = validity.as_ref() {
184 assert_eq!(validity.len(), views.len());
185 }
186
187 for (i, view) in views.iter().enumerate() {
196 let is_valid = validity.as_ref().is_none_or(|v| v.get_bit(i));
197
198 if !is_valid {
199 continue;
200 }
201
202 if view.length > View::MAX_INLINE_SIZE {
204 assert!((view.buffer_idx as usize) < (buffers.len()));
205 assert!(
206 view.offset as usize + view.length as usize
207 <= buffers[view.buffer_idx as usize].len()
208 );
209 }
210 }
211
212 }
217
218 Self {
219 dtype,
220 views,
221 buffers,
222 validity,
223 phantom: Default::default(),
224 total_bytes_len: AtomicU64::new(total_bytes_len as u64),
225 total_buffer_len,
226 }
227 }
228
229 pub unsafe fn new_unchecked_unknown_md(
234 dtype: ArrowDataType,
235 views: Buffer<View>,
236 buffers: Arc<[Buffer<u8>]>,
237 validity: Option<Bitmap>,
238 total_buffer_len: Option<usize>,
239 ) -> Self {
240 let total_bytes_len = UNKNOWN_LEN as usize;
241 let total_buffer_len =
242 total_buffer_len.unwrap_or_else(|| buffers.iter().map(|b| b.len()).sum());
243 Self::new_unchecked(
244 dtype,
245 views,
246 buffers,
247 validity,
248 total_bytes_len,
249 total_buffer_len,
250 )
251 }
252
253 pub fn data_buffers(&self) -> &Arc<[Buffer<u8>]> {
254 &self.buffers
255 }
256
257 pub fn variadic_buffer_lengths(&self) -> Vec<i64> {
258 self.buffers.iter().map(|buf| buf.len() as i64).collect()
259 }
260
261 pub fn views(&self) -> &Buffer<View> {
262 &self.views
263 }
264
265 pub fn into_views(self) -> Vec<View> {
266 self.views.make_mut()
267 }
268
269 pub fn into_inner(
270 self,
271 ) -> (
272 Buffer<View>,
273 Arc<[Buffer<u8>]>,
274 Option<Bitmap>,
275 usize,
276 usize,
277 ) {
278 let views = self.views;
279 let buffers = self.buffers;
280 let validity = self.validity;
281
282 (
283 views,
284 buffers,
285 validity,
286 self.total_bytes_len.load(Ordering::Relaxed) as usize,
287 self.total_buffer_len,
288 )
289 }
290
291 pub unsafe fn apply_views<F: FnMut(View, &T) -> View>(&self, mut update_view: F) -> Self {
296 let arr = self.clone();
297 let (views, buffers, validity, total_bytes_len, total_buffer_len) = arr.into_inner();
298
299 let mut views = views.make_mut();
300 for v in views.iter_mut() {
301 let str_slice = T::from_bytes_unchecked(v.get_slice_unchecked(&buffers));
302 *v = update_view(*v, str_slice);
303 }
304 Self::new_unchecked(
305 self.dtype.clone(),
306 views.into(),
307 buffers,
308 validity,
309 total_bytes_len,
310 total_buffer_len,
311 )
312 }
313
314 pub fn try_new(
315 dtype: ArrowDataType,
316 views: Buffer<View>,
317 buffers: Arc<[Buffer<u8>]>,
318 validity: Option<Bitmap>,
319 ) -> PolarsResult<Self> {
320 if T::IS_UTF8 {
321 validate_utf8_views(views.as_ref(), buffers.as_ref())?;
322 } else {
323 validate_binary_views(views.as_ref(), buffers.as_ref())?;
324 }
325
326 if let Some(validity) = &validity {
327 polars_ensure!(validity.len()== views.len(), ComputeError: "validity mask length must match the number of values" )
328 }
329
330 unsafe {
331 Ok(Self::new_unchecked_unknown_md(
332 dtype, views, buffers, validity, None,
333 ))
334 }
335 }
336
337 #[inline]
339 pub fn new_empty(dtype: ArrowDataType) -> Self {
340 unsafe { Self::new_unchecked(dtype, Buffer::new(), Arc::from([]), None, 0, 0) }
341 }
342
343 #[inline]
345 pub fn new_null(dtype: ArrowDataType, length: usize) -> Self {
346 let validity = Some(Bitmap::new_zeroed(length));
347 unsafe { Self::new_unchecked(dtype, Buffer::zeroed(length), Arc::from([]), validity, 0, 0) }
348 }
349
350 #[inline]
354 pub fn value(&self, i: usize) -> &T {
355 assert!(i < self.len());
356 unsafe { self.value_unchecked(i) }
357 }
358
359 #[inline]
364 pub unsafe fn value_unchecked(&self, i: usize) -> &T {
365 let v = self.views.get_unchecked(i);
366 T::from_bytes_unchecked(v.get_slice_unchecked(&self.buffers))
367 }
368
369 #[inline]
373 pub fn get(&self, i: usize) -> Option<&T> {
374 assert!(i < self.len());
375 unsafe { self.get_unchecked(i) }
376 }
377
378 #[inline]
383 pub unsafe fn get_unchecked(&self, i: usize) -> Option<&T> {
384 if self
385 .validity
386 .as_ref()
387 .is_none_or(|v| v.get_bit_unchecked(i))
388 {
389 let v = self.views.get_unchecked(i);
390 Some(T::from_bytes_unchecked(
391 v.get_slice_unchecked(&self.buffers),
392 ))
393 } else {
394 None
395 }
396 }
397
398 pub fn iter(&self) -> ZipValidity<&T, BinaryViewValueIter<T>, BitmapIter> {
400 ZipValidity::new_with_validity(self.values_iter(), self.validity.as_ref())
401 }
402
403 pub fn values_iter(&self) -> BinaryViewValueIter<T> {
405 BinaryViewValueIter::new(self)
406 }
407
408 pub fn len_iter(&self) -> impl Iterator<Item = u32> + '_ {
409 self.views.iter().map(|v| v.length)
410 }
411
412 pub fn non_null_values_iter(&self) -> NonNullValuesIter<'_, BinaryViewArrayGeneric<T>> {
414 NonNullValuesIter::new(self, self.validity())
415 }
416
417 pub fn non_null_views_iter(&self) -> NonNullValuesIter<'_, Buffer<View>> {
419 NonNullValuesIter::new(self.views(), self.validity())
420 }
421
422 impl_sliced!();
423 impl_mut_validity!();
424 impl_into_array!();
425
426 pub fn from_slice<S: AsRef<T>, P: AsRef<[Option<S>]>>(slice: P) -> Self {
427 let mutable = MutableBinaryViewArray::from_iterator(
428 slice.as_ref().iter().map(|opt_v| opt_v.as_ref()),
429 );
430 mutable.into()
431 }
432
433 pub fn from_slice_values<S: AsRef<T>, P: AsRef<[S]>>(slice: P) -> Self {
434 let mutable =
435 MutableBinaryViewArray::from_values_iter(slice.as_ref().iter().map(|v| v.as_ref()));
436 mutable.into()
437 }
438
439 pub fn total_bytes_len(&self) -> usize {
441 let total = self.total_bytes_len.load(Ordering::Relaxed);
442 if total == UNKNOWN_LEN {
443 let total = self.len_iter().map(|v| v as usize).sum::<usize>();
444 self.total_bytes_len.store(total as u64, Ordering::Relaxed);
445 total
446 } else {
447 total as usize
448 }
449 }
450
451 pub fn total_buffer_len(&self) -> usize {
453 self.total_buffer_len
454 }
455
456 fn total_unshared_buffer_len(&self) -> usize {
457 self.buffers
461 .iter()
462 .map(|buf| {
463 if buf.storage_refcount() > 1 {
464 0
465 } else {
466 buf.len()
467 }
468 })
469 .sum()
470 }
471
472 #[inline(always)]
473 pub fn len(&self) -> usize {
474 self.views.len()
475 }
476
477 pub fn gc(self) -> Self {
479 if self.buffers.is_empty() {
480 return self;
481 }
482 let mut mutable = MutableBinaryViewArray::with_capacity(self.len());
483 let buffers = self.buffers.as_ref();
484
485 for view in self.views.as_ref() {
486 unsafe { mutable.push_view_unchecked(*view, buffers) }
487 }
488 mutable.freeze().with_validity(self.validity)
489 }
490
491 pub fn deshare(&self) -> Self {
492 if Arc::strong_count(&self.buffers) == 1
493 && self.buffers.iter().all(|b| b.storage_refcount() == 1)
494 {
495 return self.clone();
496 }
497 self.clone().gc()
498 }
499
500 pub fn is_sliced(&self) -> bool {
501 !std::ptr::eq(self.views.as_ptr(), self.views.storage_ptr())
502 }
503
504 pub fn maybe_gc(self) -> Self {
505 const GC_MINIMUM_SAVINGS: usize = 16 * 1024; if self.total_buffer_len <= GC_MINIMUM_SAVINGS {
508 return self;
509 }
510
511 if Arc::strong_count(&self.buffers) != 1 {
512 return self;
516 }
517
518 let total_bytes_len = self.total_bytes_len();
521 let buffer_req_lower_bound = total_bytes_len.saturating_sub(self.len() * 12);
522
523 let lower_bound_mem_usage_post_gc = self.len() * 16 + buffer_req_lower_bound;
524 let cur_mem_usage = self.len() * 16 + self.total_unshared_buffer_len();
526 let savings_upper_bound = cur_mem_usage.saturating_sub(lower_bound_mem_usage_post_gc);
527
528 if savings_upper_bound >= GC_MINIMUM_SAVINGS
529 && cur_mem_usage >= 4 * lower_bound_mem_usage_post_gc
530 {
531 self.gc()
532 } else {
533 self
534 }
535 }
536
537 pub fn make_mut(self) -> MutableBinaryViewArray<T> {
538 let views = self.views.make_mut();
539 let completed_buffers = self.buffers.to_vec();
540 let validity = self.validity.map(|bitmap| bitmap.make_mut());
541
542 let mut total_bytes_len = self.total_bytes_len.load(Ordering::Relaxed);
544 if total_bytes_len == UNKNOWN_LEN {
545 total_bytes_len = views.iter().map(|view| view.length as u64).sum();
546 }
547 let total_bytes_len = total_bytes_len as usize;
548
549 MutableBinaryViewArray {
550 views,
551 completed_buffers,
552 in_progress_buffer: vec![],
553 validity,
554 phantom: Default::default(),
555 total_bytes_len,
556 total_buffer_len: self.total_buffer_len,
557 stolen_buffers: PlHashMap::new(),
558 }
559 }
560}
561
562impl BinaryViewArray {
563 pub fn validate_utf8(&self) -> PolarsResult<()> {
565 unsafe { validate_views_utf8_only(&self.views, &self.buffers, 0) }
567 }
568
569 pub fn to_utf8view(&self) -> PolarsResult<Utf8ViewArray> {
571 self.validate_utf8()?;
572 unsafe { Ok(self.to_utf8view_unchecked()) }
573 }
574
575 pub unsafe fn to_utf8view_unchecked(&self) -> Utf8ViewArray {
580 Utf8ViewArray::new_unchecked(
581 ArrowDataType::Utf8View,
582 self.views.clone(),
583 self.buffers.clone(),
584 self.validity.clone(),
585 self.total_bytes_len.load(Ordering::Relaxed) as usize,
586 self.total_buffer_len,
587 )
588 }
589}
590
591impl Utf8ViewArray {
592 pub fn to_binview(&self) -> BinaryViewArray {
593 unsafe {
595 BinaryViewArray::new_unchecked(
596 ArrowDataType::BinaryView,
597 self.views.clone(),
598 self.buffers.clone(),
599 self.validity.clone(),
600 self.total_bytes_len.load(Ordering::Relaxed) as usize,
601 self.total_buffer_len,
602 )
603 }
604 }
605}
606
607impl<T: ViewType + ?Sized> Array for BinaryViewArrayGeneric<T> {
608 fn as_any(&self) -> &dyn Any {
609 self
610 }
611
612 fn as_any_mut(&mut self) -> &mut dyn Any {
613 self
614 }
615
616 #[inline(always)]
617 fn len(&self) -> usize {
618 BinaryViewArrayGeneric::len(self)
619 }
620
621 fn dtype(&self) -> &ArrowDataType {
622 T::dtype()
623 }
624
625 fn validity(&self) -> Option<&Bitmap> {
626 self.validity.as_ref()
627 }
628
629 fn split_at_boxed(&self, offset: usize) -> (Box<dyn Array>, Box<dyn Array>) {
630 let (lhs, rhs) = Splitable::split_at(self, offset);
631 (Box::new(lhs), Box::new(rhs))
632 }
633
634 unsafe fn split_at_boxed_unchecked(&self, offset: usize) -> (Box<dyn Array>, Box<dyn Array>) {
635 let (lhs, rhs) = unsafe { Splitable::split_at_unchecked(self, offset) };
636 (Box::new(lhs), Box::new(rhs))
637 }
638
639 fn slice(&mut self, offset: usize, length: usize) {
640 assert!(
641 offset + length <= self.len(),
642 "the offset of the new Buffer cannot exceed the existing length"
643 );
644 unsafe { self.slice_unchecked(offset, length) }
645 }
646
647 unsafe fn slice_unchecked(&mut self, offset: usize, length: usize) {
648 debug_assert!(offset + length <= self.len());
649 self.validity = self
650 .validity
651 .take()
652 .map(|bitmap| bitmap.sliced_unchecked(offset, length))
653 .filter(|bitmap| bitmap.unset_bits() > 0);
654 self.views.slice_unchecked(offset, length);
655 self.total_bytes_len.store(UNKNOWN_LEN, Ordering::Relaxed)
656 }
657
658 fn with_validity(&self, validity: Option<Bitmap>) -> Box<dyn Array> {
659 debug_assert!(
660 validity.as_ref().is_none_or(|v| v.len() == self.len()),
661 "{} != {}",
662 validity.as_ref().unwrap().len(),
663 self.len()
664 );
665
666 let mut new = self.clone();
667 new.validity = validity;
668 Box::new(new)
669 }
670
671 fn to_boxed(&self) -> Box<dyn Array> {
672 Box::new(self.clone())
673 }
674}
675
676impl<T: ViewType + ?Sized> Splitable for BinaryViewArrayGeneric<T> {
677 fn check_bound(&self, offset: usize) -> bool {
678 offset <= self.len()
679 }
680
681 unsafe fn _split_at_unchecked(&self, offset: usize) -> (Self, Self) {
682 let (lhs_views, rhs_views) = unsafe { self.views.split_at_unchecked(offset) };
683 let (lhs_validity, rhs_validity) = unsafe { self.validity.split_at_unchecked(offset) };
684
685 unsafe {
686 (
687 Self::new_unchecked(
688 self.dtype.clone(),
689 lhs_views,
690 self.buffers.clone(),
691 lhs_validity,
692 if offset == 0 { 0 } else { UNKNOWN_LEN as _ },
693 self.total_buffer_len(),
694 ),
695 Self::new_unchecked(
696 self.dtype.clone(),
697 rhs_views,
698 self.buffers.clone(),
699 rhs_validity,
700 if offset == self.len() {
701 0
702 } else {
703 UNKNOWN_LEN as _
704 },
705 self.total_buffer_len(),
706 ),
707 )
708 }
709 }
710}