1#![allow(unsafe_op_in_unsafe_fn)]
2mod builder;
5pub use builder::*;
6mod ffi;
7pub(super) mod fmt;
8mod iterator;
9mod mutable;
10#[cfg(feature = "proptest")]
11pub mod proptest;
12mod view;
13
14use std::any::Any;
15use std::fmt::Debug;
16use std::marker::PhantomData;
17
18use polars_buffer::Buffer;
19use polars_error::*;
20use polars_utils::relaxed_cell::RelaxedCell;
21
22use crate::array::Array;
23use crate::bitmap::Bitmap;
24use crate::datatypes::ArrowDataType;
25
/// Private home of the `Sealed` marker trait. Because the module is not
/// public, code outside this module tree cannot name `Sealed` and therefore
/// cannot add new implementors of traits that require it.
mod private {
    /// Marker supertrait; implemented here for the two supported value types.
    pub trait Sealed: Send + Sync {}

    impl Sealed for [u8] {}
    impl Sealed for str {}
}
32pub use iterator::BinaryViewValueIter;
33pub use mutable::MutableBinaryViewArray;
34use polars_utils::aliases::{InitHashMaps, PlHashMap};
35use private::Sealed;
36
37use crate::array::binview::view::{validate_binary_views, validate_views_utf8_only};
38use crate::array::iterator::NonNullValuesIter;
39use crate::bitmap::utils::{BitmapIter, ZipValidity};
/// View array whose values are arbitrary byte slices.
pub type BinaryViewArray = BinaryViewArrayGeneric<[u8]>;
/// View array whose values are `str` slices.
pub type Utf8ViewArray = BinaryViewArrayGeneric<str>;
/// Builder alias specialised for byte-slice values.
pub type BinaryViewArrayBuilder = BinaryViewArrayGenericBuilder<[u8]>;
/// Builder alias specialised for `str` values.
pub type Utf8ViewArrayBuilder = BinaryViewArrayGenericBuilder<str>;
44pub use view::{View, validate_utf8_views};
45
46use super::Splitable;
47
/// Mutable view array specialised for `str` values.
pub type MutablePlString = MutableBinaryViewArray<str>;
/// Mutable view array specialised for byte-slice values.
pub type MutablePlBinary = MutableBinaryViewArray<[u8]>;
50
/// Shared `BinaryView` dtype instance returned by `<[u8] as ViewType>::dtype()`.
static BIN_VIEW_TYPE: ArrowDataType = ArrowDataType::BinaryView;
/// Shared `Utf8View` dtype instance returned by `<str as ViewType>::dtype()`.
static UTF8_VIEW_TYPE: ArrowDataType = ArrowDataType::Utf8View;

/// 8 KiB.
/// NOTE(review): not referenced in this part of the file; presumably the
/// starting data-block size used by the mutable/builder code — confirm.
const DEFAULT_BLOCK_SIZE: usize = 8 * 1024;
/// 16 MiB.
/// NOTE(review): not referenced in this part of the file; presumably the cap
/// for growing data blocks — confirm.
const MAX_EXP_BLOCK_SIZE: usize = 16 * 1024 * 1024;
57
/// Value types that can be stored in a [`BinaryViewArrayGeneric`].
///
/// The trait requires the private `Sealed` marker and is implemented in this
/// file for `str` and `[u8]`.
pub trait ViewType: Sealed + 'static + PartialEq + AsRef<Self> {
    /// `true` for `str`, `false` for `[u8]`.
    const IS_UTF8: bool;
    /// Arrow data type matching this value type.
    const DATA_TYPE: ArrowDataType;
    /// Owned counterpart of the value type (`String` for `str`, `Vec<u8>` for `[u8]`).
    type Owned: Debug + Clone + Sync + Send + AsRef<Self>;

    /// Reinterprets `slice` as `Self` without validating it.
    ///
    /// # Safety
    /// `slice` must be a valid encoding of `Self`; for `str` this means valid
    /// UTF-8.
    unsafe fn from_bytes_unchecked(slice: &[u8]) -> &Self;
    /// Checked conversion; returns `None` when `slice` is not a valid `Self`.
    fn from_bytes(slice: &[u8]) -> Option<&Self>;

    /// Borrows the value as raw bytes.
    fn to_bytes(&self) -> &[u8];

    /// Copies the value into its owned form.
    #[allow(clippy::wrong_self_convention)]
    fn into_owned(&self) -> Self::Owned;

    /// Returns a `'static` reference to the Arrow data type of this value type.
    fn dtype() -> &'static ArrowDataType;
}
75
76impl ViewType for str {
77 const IS_UTF8: bool = true;
78 const DATA_TYPE: ArrowDataType = ArrowDataType::Utf8View;
79 type Owned = String;
80
81 #[inline(always)]
82 unsafe fn from_bytes_unchecked(slice: &[u8]) -> &Self {
83 std::str::from_utf8_unchecked(slice)
84 }
85 #[inline(always)]
86 fn from_bytes(slice: &[u8]) -> Option<&Self> {
87 std::str::from_utf8(slice).ok()
88 }
89
90 #[inline(always)]
91 fn to_bytes(&self) -> &[u8] {
92 self.as_bytes()
93 }
94
95 fn into_owned(&self) -> Self::Owned {
96 self.to_string()
97 }
98 fn dtype() -> &'static ArrowDataType {
99 &UTF8_VIEW_TYPE
100 }
101}
102
103impl ViewType for [u8] {
104 const IS_UTF8: bool = false;
105 const DATA_TYPE: ArrowDataType = ArrowDataType::BinaryView;
106 type Owned = Vec<u8>;
107
108 #[inline(always)]
109 unsafe fn from_bytes_unchecked(slice: &[u8]) -> &Self {
110 slice
111 }
112 #[inline(always)]
113 fn from_bytes(slice: &[u8]) -> Option<&Self> {
114 Some(slice)
115 }
116
117 #[inline(always)]
118 fn to_bytes(&self) -> &[u8] {
119 self
120 }
121
122 fn into_owned(&self) -> Self::Owned {
123 self.to_vec()
124 }
125
126 fn dtype() -> &'static ArrowDataType {
127 &BIN_VIEW_TYPE
128 }
129}
130
/// Array of variable-length values (`str` or `[u8]`) described by one
/// [`View`] per element plus a list of shared data buffers.
pub struct BinaryViewArrayGeneric<T: ViewType + ?Sized> {
    /// Data type reported through `Array::dtype`.
    dtype: ArrowDataType,
    /// One view per element; the array length is `views.len()`.
    views: Buffer<View>,
    /// Data buffers addressed by `buffer_idx`/`offset` of views whose length
    /// exceeds `View::MAX_INLINE_SIZE`.
    buffers: Buffer<Buffer<u8>>,
    /// Optional validity mask; when present its length equals `views.len()`.
    validity: Option<Bitmap>,
    /// Ties the array to its value type without storing a `T`.
    phantom: PhantomData<T>,
    /// Cached sum of the lengths of all valid values, or `UNKNOWN_LEN` when it
    /// has not been computed (or was invalidated).
    total_bytes_len: RelaxedCell<u64>,
    /// Total length of the data buffers, as supplied at construction.
    total_buffer_len: usize,
}
142
143impl<T: ViewType + ?Sized> PartialEq for BinaryViewArrayGeneric<T> {
144 fn eq(&self, other: &Self) -> bool {
145 self.len() == other.len() && self.into_iter().zip(other).all(|(l, r)| l == r)
146 }
147}
148
149impl<T: ViewType + ?Sized> Clone for BinaryViewArrayGeneric<T> {
150 fn clone(&self) -> Self {
151 Self {
152 dtype: self.dtype.clone(),
153 views: self.views.clone(),
154 buffers: self.buffers.clone(),
155 validity: self.validity.clone(),
156 phantom: Default::default(),
157 total_bytes_len: self.total_bytes_len.clone(),
158 total_buffer_len: self.total_buffer_len,
159 }
160 }
161}
162
// SAFETY: NOTE(review): `T` only occurs inside `PhantomData`, and `ViewType`
// requires `Sealed: Send + Sync`. The remaining fields (in particular
// `RelaxedCell`) are assumed to be safe to share across threads — confirm
// against their definitions.
unsafe impl<T: ViewType + ?Sized> Send for BinaryViewArrayGeneric<T> {}
unsafe impl<T: ViewType + ?Sized> Sync for BinaryViewArrayGeneric<T> {}
165
/// Sentinel stored in `total_bytes_len` while the total has not been computed.
const UNKNOWN_LEN: u64 = u64::MAX;
167
168impl<T: ViewType + ?Sized> BinaryViewArrayGeneric<T> {
169 pub unsafe fn new_unchecked(
174 dtype: ArrowDataType,
175 views: Buffer<View>,
176 buffers: Buffer<Buffer<u8>>,
177 validity: Option<Bitmap>,
178 total_bytes_len: Option<usize>,
179 total_buffer_len: usize,
180 ) -> Self {
181 #[cfg(debug_assertions)]
183 {
184 if let Some(validity) = validity.as_ref() {
185 assert_eq!(validity.len(), views.len());
186 }
187
188 for (i, view) in views.iter().enumerate() {
197 let is_valid = validity.as_ref().is_none_or(|v| v.get_bit(i));
198
199 if !is_valid {
200 continue;
201 }
202
203 if view.length > View::MAX_INLINE_SIZE {
205 assert!((view.buffer_idx as usize) < (buffers.len()));
206 assert!(
207 view.offset as usize + view.length as usize
208 <= buffers[view.buffer_idx as usize].len()
209 );
210 }
211 }
212
213 }
218
219 Self {
220 dtype,
221 views,
222 buffers,
223 validity,
224 phantom: Default::default(),
225 total_bytes_len: RelaxedCell::from(
226 total_bytes_len.map(|l| l as u64).unwrap_or(UNKNOWN_LEN),
227 ),
228 total_buffer_len,
229 }
230 }
231
232 pub unsafe fn new_unchecked_unknown_md(
237 dtype: ArrowDataType,
238 views: Buffer<View>,
239 buffers: Buffer<Buffer<u8>>,
240 validity: Option<Bitmap>,
241 total_buffer_len: Option<usize>,
242 ) -> Self {
243 let total_bytes_len = None;
244 let total_buffer_len =
245 total_buffer_len.unwrap_or_else(|| buffers.iter().map(|b| b.len()).sum());
246 Self::new_unchecked(
247 dtype,
248 views,
249 buffers,
250 validity,
251 total_bytes_len,
252 total_buffer_len,
253 )
254 }
255
256 pub fn data_buffers(&self) -> &Buffer<Buffer<u8>> {
257 &self.buffers
258 }
259
260 pub fn data_buffers_mut(&mut self) -> &mut Buffer<Buffer<u8>> {
261 &mut self.buffers
262 }
263
264 pub fn variadic_buffer_lengths(&self) -> Vec<i64> {
265 self.buffers.iter().map(|buf| buf.len() as i64).collect()
266 }
267
268 pub fn views(&self) -> &Buffer<View> {
269 &self.views
270 }
271
272 pub fn into_views(self) -> Vec<View> {
273 self.views.to_vec()
274 }
275
276 pub fn into_inner(
277 self,
278 ) -> (
279 Buffer<View>,
280 Buffer<Buffer<u8>>,
281 Option<Bitmap>,
282 Option<usize>,
283 usize,
284 ) {
285 let total_bytes_len = self.try_total_bytes_len();
286 let views = self.views;
287 let buffers = self.buffers;
288 let validity = self.validity;
289
290 (
291 views,
292 buffers,
293 validity,
294 total_bytes_len,
295 self.total_buffer_len,
296 )
297 }
298
299 pub unsafe fn apply_views<F: FnMut(View, &T) -> View>(&self, mut update_view: F) -> Self {
304 let arr = self.clone();
305 let (views, buffers, validity, _total_bytes_len, total_buffer_len) = arr.into_inner();
306
307 let mut total_bytes_len = 0;
308 let mut views = views.to_vec();
309 for v in views.iter_mut() {
310 let str_slice = T::from_bytes_unchecked(v.get_slice_unchecked(&buffers));
311 *v = update_view(*v, str_slice);
312 total_bytes_len += v.length as usize;
313 }
314
315 let len_valid = validity.is_none();
316 Self::new_unchecked(
317 self.dtype.clone(),
318 views.into(),
319 buffers,
320 validity,
321 len_valid.then_some(total_bytes_len),
322 total_buffer_len,
323 )
324 }
325
326 pub unsafe fn with_views_mut<F: FnOnce(&mut [View])>(&mut self, f: F) {
331 self.total_bytes_len.store(UNKNOWN_LEN);
332 if let Some(views) = self.views.get_mut_slice() {
333 f(views)
334 } else {
335 let mut views = self.views.as_slice().to_vec();
336 f(&mut views);
337 self.views = Buffer::from(views);
338 }
339 }
340
341 pub fn try_new(
342 dtype: ArrowDataType,
343 views: Buffer<View>,
344 buffers: Buffer<Buffer<u8>>,
345 validity: Option<Bitmap>,
346 ) -> PolarsResult<Self> {
347 if T::IS_UTF8 {
348 validate_utf8_views(views.as_ref(), buffers.as_ref())?;
349 } else {
350 validate_binary_views(views.as_ref(), buffers.as_ref())?;
351 }
352
353 if let Some(validity) = &validity {
354 polars_ensure!(validity.len()== views.len(), ComputeError: "validity mask length must match the number of values" )
355 }
356
357 unsafe {
358 Ok(Self::new_unchecked_unknown_md(
359 dtype, views, buffers, validity, None,
360 ))
361 }
362 }
363
364 #[inline]
366 pub fn new_empty(dtype: ArrowDataType) -> Self {
367 unsafe { Self::new_unchecked(dtype, Buffer::new(), Buffer::new(), None, Some(0), 0) }
368 }
369
370 #[inline]
372 pub fn new_null(dtype: ArrowDataType, length: usize) -> Self {
373 let validity = Some(Bitmap::new_zeroed(length));
374 unsafe {
375 Self::new_unchecked(
376 dtype,
377 Buffer::zeroed(length),
378 Buffer::new(),
379 validity,
380 Some(0),
381 0,
382 )
383 }
384 }
385
386 #[inline]
390 pub fn value(&self, i: usize) -> &T {
391 assert!(i < self.len());
392 unsafe { self.value_unchecked(i) }
393 }
394
395 #[inline]
400 pub unsafe fn value_unchecked(&self, i: usize) -> &T {
401 let v = self.views.get_unchecked(i);
402 T::from_bytes_unchecked(v.get_slice_unchecked(&self.buffers))
403 }
404
405 #[inline]
409 pub fn get(&self, i: usize) -> Option<&T> {
410 assert!(i < self.len());
411 unsafe { self.get_unchecked(i) }
412 }
413
414 #[inline]
419 pub unsafe fn get_unchecked(&self, i: usize) -> Option<&T> {
420 if self
421 .validity
422 .as_ref()
423 .is_none_or(|v| v.get_bit_unchecked(i))
424 {
425 let v = self.views.get_unchecked(i);
426 Some(T::from_bytes_unchecked(
427 v.get_slice_unchecked(&self.buffers),
428 ))
429 } else {
430 None
431 }
432 }
433
434 pub fn iter(&self) -> ZipValidity<&T, BinaryViewValueIter<'_, T>, BitmapIter<'_>> {
436 ZipValidity::new_with_validity(self.values_iter(), self.validity.as_ref())
437 }
438
439 pub fn values_iter(&self) -> BinaryViewValueIter<'_, T> {
441 BinaryViewValueIter::new(self)
442 }
443
444 pub fn len_iter(&self) -> impl Iterator<Item = u32> + '_ {
445 self.views.iter().map(|v| v.length)
446 }
447
448 pub fn non_null_values_iter(&self) -> NonNullValuesIter<'_, BinaryViewArrayGeneric<T>> {
450 NonNullValuesIter::new(self, self.validity())
451 }
452
453 pub fn non_null_views_iter(&self) -> NonNullValuesIter<'_, Buffer<View>> {
455 NonNullValuesIter::new(self.views(), self.validity())
456 }
457
    // Macro-generated inherent methods.
    // NOTE(review): both macros are defined outside this file; presumably they
    // add the owning slice helpers and the boxed/arced conversions — confirm
    // at the macro definitions.
    impl_sliced!();
    impl_into_array!();
460
461 #[must_use]
465 #[inline]
466 pub fn with_validity(mut self, validity: Option<Bitmap>) -> Self {
467 self.set_validity(validity);
468 self
469 }
470
471 #[inline]
475 pub fn set_validity(&mut self, validity: Option<Bitmap>) {
476 if matches!(&validity, Some(bitmap) if bitmap.len() != self.len()) {
477 panic!("validity must be equal to the array's length")
478 }
479 self.total_bytes_len.store(UNKNOWN_LEN);
480 self.validity = validity;
481 }
482
483 #[inline]
485 pub fn take_validity(&mut self) -> Option<Bitmap> {
486 self.total_bytes_len.store(UNKNOWN_LEN);
487 self.validity.take()
488 }
489
490 pub fn from_slice<S: AsRef<T>, P: AsRef<[Option<S>]>>(slice: P) -> Self {
491 let mutable = MutableBinaryViewArray::from_iterator(
492 slice.as_ref().iter().map(|opt_v| opt_v.as_ref()),
493 );
494 mutable.into()
495 }
496
497 pub fn from_slice_values<S: AsRef<T>, P: AsRef<[S]>>(slice: P) -> Self {
498 let mutable =
499 MutableBinaryViewArray::from_values_iter(slice.as_ref().iter().map(|v| v.as_ref()));
500 mutable.into()
501 }
502
503 pub fn total_bytes_len(&self) -> usize {
505 let total = self.total_bytes_len.load();
506 if total == UNKNOWN_LEN {
507 let total = ZipValidity::new_with_validity(self.len_iter(), self.validity.as_ref())
508 .map(|v| v.unwrap_or(0) as usize)
509 .sum::<usize>();
510 self.total_bytes_len.store(total as u64);
511 total
512 } else {
513 total as usize
514 }
515 }
516
517 pub fn try_total_bytes_len(&self) -> Option<usize> {
519 let b = self.total_bytes_len.load();
520 (b != UNKNOWN_LEN).then_some(b as usize)
521 }
522
523 pub fn total_buffer_len(&self) -> usize {
525 self.total_buffer_len
526 }
527
528 fn total_unshared_buffer_len(&self) -> usize {
529 self.buffers
533 .iter()
534 .map(|buf| {
535 if buf.storage_refcount() > 1 {
536 0
537 } else {
538 buf.len()
539 }
540 })
541 .sum()
542 }
543
544 #[inline(always)]
545 pub fn len(&self) -> usize {
546 self.views.len()
547 }
548
549 pub fn gc(self) -> Self {
551 if self.buffers.is_empty() {
552 return self;
553 }
554 let mut mutable = MutableBinaryViewArray::with_capacity(self.len());
555 let buffers = self.buffers.as_ref();
556
557 for view in self.views.as_ref() {
558 unsafe { mutable.push_view_unchecked(*view, buffers) }
559 }
560 mutable.freeze().with_validity(self.validity)
561 }
562
563 pub fn deshare(&self) -> Self {
564 if self.buffers.storage_refcount() == 1
565 && self.buffers.iter().all(|b| b.storage_refcount() == 1)
566 {
567 return self.clone();
568 }
569 self.clone().gc()
570 }
571
572 pub fn is_sliced(&self) -> bool {
573 !std::ptr::eq(self.views.as_ptr(), self.views.storage_ptr())
574 }
575
576 pub fn maybe_gc(self) -> Self {
577 const GC_MINIMUM_SAVINGS: usize = 16 * 1024; if self.total_buffer_len <= GC_MINIMUM_SAVINGS {
580 return self;
581 }
582
583 if self.buffers.storage_refcount() != 1 {
584 return self;
588 }
589
590 let total_bytes_len = self.total_bytes_len();
593 let buffer_req_lower_bound = total_bytes_len.saturating_sub(self.len() * 12);
594
595 let lower_bound_mem_usage_post_gc = self.len() * 16 + buffer_req_lower_bound;
596 let cur_mem_usage = self.len() * 16 + self.total_unshared_buffer_len();
598 let savings_upper_bound = cur_mem_usage.saturating_sub(lower_bound_mem_usage_post_gc);
599
600 if savings_upper_bound >= GC_MINIMUM_SAVINGS
601 && cur_mem_usage >= 4 * lower_bound_mem_usage_post_gc
602 {
603 self.gc()
604 } else {
605 self
606 }
607 }
608
609 pub fn make_mut(self) -> MutableBinaryViewArray<T> {
610 let views = self.views.to_vec();
611 let completed_buffers = self.buffers.to_vec();
612 let validity = self.validity.map(|bitmap| bitmap.make_mut());
613
614 let mut total_bytes_len = self.total_bytes_len.load();
616 if total_bytes_len == UNKNOWN_LEN {
617 total_bytes_len = views.iter().map(|view| view.length as u64).sum();
618 }
619 let total_bytes_len = total_bytes_len as usize;
620
621 MutableBinaryViewArray {
622 views,
623 completed_buffers,
624 in_progress_buffer: vec![],
625 validity,
626 phantom: Default::default(),
627 total_bytes_len,
628 total_buffer_len: self.total_buffer_len,
629 stolen_buffers: PlHashMap::new(),
630 }
631 }
632}
633
634impl BinaryViewArray {
635 pub fn validate_utf8(&self) -> PolarsResult<()> {
637 unsafe { validate_views_utf8_only(&self.views, &self.buffers, 0) }
639 }
640
641 pub fn to_utf8view(&self) -> PolarsResult<Utf8ViewArray> {
643 self.validate_utf8()?;
644 unsafe { Ok(self.to_utf8view_unchecked()) }
645 }
646
647 pub unsafe fn to_utf8view_unchecked(&self) -> Utf8ViewArray {
652 Utf8ViewArray::new_unchecked(
653 ArrowDataType::Utf8View,
654 self.views.clone(),
655 self.buffers.clone(),
656 self.validity.clone(),
657 self.try_total_bytes_len(),
658 self.total_buffer_len,
659 )
660 }
661}
662
663impl Utf8ViewArray {
664 pub fn to_binview(&self) -> BinaryViewArray {
665 unsafe {
667 BinaryViewArray::new_unchecked(
668 ArrowDataType::BinaryView,
669 self.views.clone(),
670 self.buffers.clone(),
671 self.validity.clone(),
672 self.try_total_bytes_len(),
673 self.total_buffer_len,
674 )
675 }
676 }
677}
678
679impl<T: ViewType + ?Sized> Array for BinaryViewArrayGeneric<T> {
680 fn as_any(&self) -> &dyn Any {
681 self
682 }
683
684 fn as_any_mut(&mut self) -> &mut dyn Any {
685 self
686 }
687
688 #[inline(always)]
689 fn len(&self) -> usize {
690 BinaryViewArrayGeneric::len(self)
691 }
692
693 #[inline(always)]
694 fn dtype(&self) -> &ArrowDataType {
695 &self.dtype
696 }
697
698 #[inline(always)]
699 fn dtype_mut(&mut self) -> &mut ArrowDataType {
700 &mut self.dtype
701 }
702
703 fn validity(&self) -> Option<&Bitmap> {
704 self.validity.as_ref()
705 }
706
707 fn split_at_boxed(&self, offset: usize) -> (Box<dyn Array>, Box<dyn Array>) {
708 let (lhs, rhs) = Splitable::split_at(self, offset);
709 (Box::new(lhs), Box::new(rhs))
710 }
711
712 unsafe fn split_at_boxed_unchecked(&self, offset: usize) -> (Box<dyn Array>, Box<dyn Array>) {
713 let (lhs, rhs) = unsafe { Splitable::split_at_unchecked(self, offset) };
714 (Box::new(lhs), Box::new(rhs))
715 }
716
717 fn slice(&mut self, offset: usize, length: usize) {
718 assert!(
719 offset + length <= self.len(),
720 "the offset of the new Buffer cannot exceed the existing length"
721 );
722 unsafe { self.slice_unchecked(offset, length) }
723 }
724
725 unsafe fn slice_unchecked(&mut self, offset: usize, length: usize) {
726 debug_assert!(offset + length <= self.len());
727 self.validity = self
728 .validity
729 .take()
730 .map(|bitmap| bitmap.sliced_unchecked(offset, length))
731 .filter(|bitmap| bitmap.unset_bits() > 0);
732 self.views.slice_in_place_unchecked(offset..offset + length);
733 self.total_bytes_len.store(UNKNOWN_LEN)
734 }
735
736 fn with_validity(&self, validity: Option<Bitmap>) -> Box<dyn Array> {
737 debug_assert!(
738 validity.as_ref().is_none_or(|v| v.len() == self.len()),
739 "{} != {}",
740 validity.as_ref().unwrap().len(),
741 self.len()
742 );
743
744 let mut new = self.clone();
745 new.validity = validity;
746 Box::new(new)
747 }
748
749 fn to_boxed(&self) -> Box<dyn Array> {
750 Box::new(self.clone())
751 }
752}
753
754impl<T: ViewType + ?Sized> Splitable for BinaryViewArrayGeneric<T> {
755 fn check_bound(&self, offset: usize) -> bool {
756 offset <= self.len()
757 }
758
759 unsafe fn _split_at_unchecked(&self, offset: usize) -> (Self, Self) {
760 let (lhs_views, rhs_views) = unsafe { self.views.split_at_unchecked(offset) };
761 let (lhs_validity, rhs_validity) = unsafe { self.validity.split_at_unchecked(offset) };
762
763 unsafe {
764 (
765 Self::new_unchecked(
766 self.dtype.clone(),
767 lhs_views,
768 self.buffers.clone(),
769 lhs_validity,
770 (offset == 0).then_some(0),
771 self.total_buffer_len(),
772 ),
773 Self::new_unchecked(
774 self.dtype.clone(),
775 rhs_views,
776 self.buffers.clone(),
777 rhs_validity,
778 (offset == self.len()).then_some(0),
779 self.total_buffer_len(),
780 ),
781 )
782 }
783 }
784}