1#![allow(unsafe_op_in_unsafe_fn)]
2mod builder;
5pub use builder::*;
6mod ffi;
7pub(super) mod fmt;
8mod iterator;
9mod mutable;
10#[cfg(feature = "proptest")]
11pub mod proptest;
12mod view;
13
14use std::any::Any;
15use std::fmt::Debug;
16use std::marker::PhantomData;
17use std::sync::Arc;
18
19use polars_error::*;
20use polars_utils::relaxed_cell::RelaxedCell;
21
22use crate::array::Array;
23use crate::bitmap::Bitmap;
24use crate::buffer::Buffer;
25use crate::datatypes::ArrowDataType;
26
// Sealing module: `Sealed` is declared in a private module and, in the code shown here, is only
// imported privately, so downstream crates cannot implement traits that list it as a supertrait.
mod private {
    /// Marker supertrait; every implementor must be `Send + Sync`.
    pub trait Sealed: Send + Sync {}

    // The only implementors: the string and the byte-slice value types.
    impl Sealed for str {}
    impl Sealed for [u8] {}
}
33pub use iterator::BinaryViewValueIter;
34pub use mutable::MutableBinaryViewArray;
35use polars_utils::aliases::{InitHashMaps, PlHashMap};
36use private::Sealed;
37
38use crate::array::binview::view::{validate_binary_views, validate_views_utf8_only};
39use crate::array::iterator::NonNullValuesIter;
40use crate::bitmap::utils::{BitmapIter, ZipValidity};
/// View array whose values are arbitrary byte slices.
pub type BinaryViewArray = BinaryViewArrayGeneric<[u8]>;
/// View array whose values are `str` slices.
pub type Utf8ViewArray = BinaryViewArrayGeneric<str>;
/// Builder counterpart of [`BinaryViewArray`].
pub type BinaryViewArrayBuilder = BinaryViewArrayGenericBuilder<[u8]>;
/// Builder counterpart of [`Utf8ViewArray`].
pub type Utf8ViewArrayBuilder = BinaryViewArrayGenericBuilder<str>;
45pub use view::{View, validate_utf8_views};
46
47use super::Splitable;
48
/// Mutable view array holding `str` values.
pub type MutablePlString = MutableBinaryViewArray<str>;
/// Mutable view array holding `[u8]` values.
pub type MutablePlBinary = MutableBinaryViewArray<[u8]>;

// Statics backing the `&'static ArrowDataType` returned by `ViewType::dtype`.
static BIN_VIEW_TYPE: ArrowDataType = ArrowDataType::BinaryView;
static UTF8_VIEW_TYPE: ArrowDataType = ArrowDataType::Utf8View;

// 8 KiB. NOTE(review): not referenced in this part of the file; presumably the initial data
// buffer size used by the mutable array/builder — confirm at the use site.
const DEFAULT_BLOCK_SIZE: usize = 8 * 1024;
// 16 MiB. NOTE(review): presumably the cap for buffer-size growth — confirm at the use site.
const MAX_EXP_BLOCK_SIZE: usize = 16 * 1024 * 1024;
58
/// Value types a view array can hold. Sealed: only `str` and `[u8]` implement `Sealed`.
pub trait ViewType: Sealed + 'static + PartialEq + AsRef<Self> {
    /// `true` for `str`, `false` for `[u8]`.
    const IS_UTF8: bool;
    /// Arrow data type associated with this value type.
    const DATA_TYPE: ArrowDataType;
    /// Owned counterpart of the borrowed value (`String` for `str`, `Vec<u8>` for `[u8]`).
    type Owned: Debug + Clone + Sync + Send + AsRef<Self>;

    /// Reinterprets `slice` as `Self` without validation.
    ///
    /// # Safety
    /// `slice` must be a valid byte representation of `Self`; for `str` this means valid UTF-8.
    unsafe fn from_bytes_unchecked(slice: &[u8]) -> &Self;
    /// Reinterprets `slice` as `Self`, or returns `None` when the bytes are not valid for `Self`.
    fn from_bytes(slice: &[u8]) -> Option<&Self>;

    /// Returns the raw bytes of the value.
    fn to_bytes(&self) -> &[u8];

    /// Copies the value into its owned form.
    #[allow(clippy::wrong_self_convention)]
    fn into_owned(&self) -> Self::Owned;

    /// Returns a `'static` reference to the Arrow data type for this value type.
    fn dtype() -> &'static ArrowDataType;
}
76
77impl ViewType for str {
78 const IS_UTF8: bool = true;
79 const DATA_TYPE: ArrowDataType = ArrowDataType::Utf8View;
80 type Owned = String;
81
82 #[inline(always)]
83 unsafe fn from_bytes_unchecked(slice: &[u8]) -> &Self {
84 std::str::from_utf8_unchecked(slice)
85 }
86 #[inline(always)]
87 fn from_bytes(slice: &[u8]) -> Option<&Self> {
88 std::str::from_utf8(slice).ok()
89 }
90
91 #[inline(always)]
92 fn to_bytes(&self) -> &[u8] {
93 self.as_bytes()
94 }
95
96 fn into_owned(&self) -> Self::Owned {
97 self.to_string()
98 }
99 fn dtype() -> &'static ArrowDataType {
100 &UTF8_VIEW_TYPE
101 }
102}
103
104impl ViewType for [u8] {
105 const IS_UTF8: bool = false;
106 const DATA_TYPE: ArrowDataType = ArrowDataType::BinaryView;
107 type Owned = Vec<u8>;
108
109 #[inline(always)]
110 unsafe fn from_bytes_unchecked(slice: &[u8]) -> &Self {
111 slice
112 }
113 #[inline(always)]
114 fn from_bytes(slice: &[u8]) -> Option<&Self> {
115 Some(slice)
116 }
117
118 #[inline(always)]
119 fn to_bytes(&self) -> &[u8] {
120 self
121 }
122
123 fn into_owned(&self) -> Self::Owned {
124 self.to_vec()
125 }
126
127 fn dtype() -> &'static ArrowDataType {
128 &BIN_VIEW_TYPE
129 }
130}
131
/// Array of variable-length values stored as fixed-size views plus shared data buffers.
pub struct BinaryViewArrayGeneric<T: ViewType + ?Sized> {
    dtype: ArrowDataType,
    /// One view per element; the array length is `views.len()`.
    views: Buffer<View>,
    /// Data buffers that views longer than `View::MAX_INLINE_SIZE` index into.
    buffers: Arc<[Buffer<u8>]>,
    /// Optional validity mask, one bit per view.
    validity: Option<Bitmap>,
    phantom: PhantomData<T>,
    /// Cached sum of all view lengths, or `UNKNOWN_LEN` when not computed yet.
    total_bytes_len: RelaxedCell<u64>,
    /// Summed length of all `buffers`.
    total_buffer_len: usize,
}
143
144impl<T: ViewType + ?Sized> PartialEq for BinaryViewArrayGeneric<T> {
145 fn eq(&self, other: &Self) -> bool {
146 self.len() == other.len() && self.into_iter().zip(other).all(|(l, r)| l == r)
147 }
148}
149
150impl<T: ViewType + ?Sized> Clone for BinaryViewArrayGeneric<T> {
151 fn clone(&self) -> Self {
152 Self {
153 dtype: self.dtype.clone(),
154 views: self.views.clone(),
155 buffers: self.buffers.clone(),
156 validity: self.validity.clone(),
157 phantom: Default::default(),
158 total_bytes_len: self.total_bytes_len.clone(),
159 total_buffer_len: self.total_buffer_len,
160 }
161 }
162}
163
// NOTE(review): thread-safety is asserted manually here; whether the auto traits would already
// apply depends on field types (`Buffer`, `Bitmap`, `RelaxedCell`) not visible in this file.
unsafe impl<T: ViewType + ?Sized> Send for BinaryViewArrayGeneric<T> {}
unsafe impl<T: ViewType + ?Sized> Sync for BinaryViewArrayGeneric<T> {}

/// Sentinel stored in `total_bytes_len` meaning "the sum of view lengths is not computed yet".
const UNKNOWN_LEN: u64 = u64::MAX;
168
169impl<T: ViewType + ?Sized> BinaryViewArrayGeneric<T> {
170 pub unsafe fn new_unchecked(
175 dtype: ArrowDataType,
176 views: Buffer<View>,
177 buffers: Arc<[Buffer<u8>]>,
178 validity: Option<Bitmap>,
179 total_bytes_len: usize,
180 total_buffer_len: usize,
181 ) -> Self {
182 #[cfg(debug_assertions)]
184 {
185 if let Some(validity) = validity.as_ref() {
186 assert_eq!(validity.len(), views.len());
187 }
188
189 for (i, view) in views.iter().enumerate() {
198 let is_valid = validity.as_ref().is_none_or(|v| v.get_bit(i));
199
200 if !is_valid {
201 continue;
202 }
203
204 if view.length > View::MAX_INLINE_SIZE {
206 assert!((view.buffer_idx as usize) < (buffers.len()));
207 assert!(
208 view.offset as usize + view.length as usize
209 <= buffers[view.buffer_idx as usize].len()
210 );
211 }
212 }
213
214 }
219
220 Self {
221 dtype,
222 views,
223 buffers,
224 validity,
225 phantom: Default::default(),
226 total_bytes_len: RelaxedCell::from(total_bytes_len as u64),
227 total_buffer_len,
228 }
229 }
230
231 pub unsafe fn new_unchecked_unknown_md(
236 dtype: ArrowDataType,
237 views: Buffer<View>,
238 buffers: Arc<[Buffer<u8>]>,
239 validity: Option<Bitmap>,
240 total_buffer_len: Option<usize>,
241 ) -> Self {
242 let total_bytes_len = UNKNOWN_LEN as usize;
243 let total_buffer_len =
244 total_buffer_len.unwrap_or_else(|| buffers.iter().map(|b| b.len()).sum());
245 Self::new_unchecked(
246 dtype,
247 views,
248 buffers,
249 validity,
250 total_bytes_len,
251 total_buffer_len,
252 )
253 }
254
    /// Returns the shared list of data buffers the views index into.
    pub fn data_buffers(&self) -> &Arc<[Buffer<u8>]> {
        &self.buffers
    }
258
259 pub fn variadic_buffer_lengths(&self) -> Vec<i64> {
260 self.buffers.iter().map(|buf| buf.len() as i64).collect()
261 }
262
    /// Returns the buffer of views, one per element.
    pub fn views(&self) -> &Buffer<View> {
        &self.views
    }
266
267 pub fn into_views(self) -> Vec<View> {
268 self.views.make_mut()
269 }
270
271 pub fn into_inner(
272 self,
273 ) -> (
274 Buffer<View>,
275 Arc<[Buffer<u8>]>,
276 Option<Bitmap>,
277 usize,
278 usize,
279 ) {
280 let views = self.views;
281 let buffers = self.buffers;
282 let validity = self.validity;
283
284 (
285 views,
286 buffers,
287 validity,
288 self.total_bytes_len.load() as usize,
289 self.total_buffer_len,
290 )
291 }
292
293 pub unsafe fn apply_views<F: FnMut(View, &T) -> View>(&self, mut update_view: F) -> Self {
298 let arr = self.clone();
299 let (views, buffers, validity, total_bytes_len, total_buffer_len) = arr.into_inner();
300
301 let mut views = views.make_mut();
302 for v in views.iter_mut() {
303 let str_slice = T::from_bytes_unchecked(v.get_slice_unchecked(&buffers));
304 *v = update_view(*v, str_slice);
305 }
306 Self::new_unchecked(
307 self.dtype.clone(),
308 views.into(),
309 buffers,
310 validity,
311 total_bytes_len,
312 total_buffer_len,
313 )
314 }
315
316 pub fn try_new(
317 dtype: ArrowDataType,
318 views: Buffer<View>,
319 buffers: Arc<[Buffer<u8>]>,
320 validity: Option<Bitmap>,
321 ) -> PolarsResult<Self> {
322 if T::IS_UTF8 {
323 validate_utf8_views(views.as_ref(), buffers.as_ref())?;
324 } else {
325 validate_binary_views(views.as_ref(), buffers.as_ref())?;
326 }
327
328 if let Some(validity) = &validity {
329 polars_ensure!(validity.len()== views.len(), ComputeError: "validity mask length must match the number of values" )
330 }
331
332 unsafe {
333 Ok(Self::new_unchecked_unknown_md(
334 dtype, views, buffers, validity, None,
335 ))
336 }
337 }
338
339 #[inline]
341 pub fn new_empty(dtype: ArrowDataType) -> Self {
342 unsafe { Self::new_unchecked(dtype, Buffer::new(), Arc::from([]), None, 0, 0) }
343 }
344
345 #[inline]
347 pub fn new_null(dtype: ArrowDataType, length: usize) -> Self {
348 let validity = Some(Bitmap::new_zeroed(length));
349 unsafe { Self::new_unchecked(dtype, Buffer::zeroed(length), Arc::from([]), validity, 0, 0) }
350 }
351
    /// Returns the value at index `i`, ignoring the validity mask.
    ///
    /// # Panics
    /// Panics iff `i >= self.len()`.
    #[inline]
    pub fn value(&self, i: usize) -> &T {
        assert!(i < self.len());
        // SAFETY: `i` was bounds-checked just above.
        unsafe { self.value_unchecked(i) }
    }
360
361 #[inline]
366 pub unsafe fn value_unchecked(&self, i: usize) -> &T {
367 let v = self.views.get_unchecked(i);
368 T::from_bytes_unchecked(v.get_slice_unchecked(&self.buffers))
369 }
370
    /// Returns the element at index `i`: `None` when the slot is null, `Some(value)` otherwise.
    ///
    /// # Panics
    /// Panics iff `i >= self.len()`.
    #[inline]
    pub fn get(&self, i: usize) -> Option<&T> {
        assert!(i < self.len());
        // SAFETY: `i` was bounds-checked just above.
        unsafe { self.get_unchecked(i) }
    }
379
380 #[inline]
385 pub unsafe fn get_unchecked(&self, i: usize) -> Option<&T> {
386 if self
387 .validity
388 .as_ref()
389 .is_none_or(|v| v.get_bit_unchecked(i))
390 {
391 let v = self.views.get_unchecked(i);
392 Some(T::from_bytes_unchecked(
393 v.get_slice_unchecked(&self.buffers),
394 ))
395 } else {
396 None
397 }
398 }
399
400 pub fn iter(&self) -> ZipValidity<&T, BinaryViewValueIter<'_, T>, BitmapIter<'_>> {
402 ZipValidity::new_with_validity(self.values_iter(), self.validity.as_ref())
403 }
404
    /// Returns an iterator over the values, ignoring the validity mask.
    pub fn values_iter(&self) -> BinaryViewValueIter<'_, T> {
        BinaryViewValueIter::new(self)
    }
409
    /// Returns an iterator over the byte length recorded in each view (null slots included).
    pub fn len_iter(&self) -> impl Iterator<Item = u32> + '_ {
        self.views.iter().map(|v| v.length)
    }
413
    /// Returns an iterator over the values of this array, driven by the validity mask.
    // NOTE(review): presumably skips null slots, per the type name — confirm in `NonNullValuesIter`.
    pub fn non_null_values_iter(&self) -> NonNullValuesIter<'_, BinaryViewArrayGeneric<T>> {
        NonNullValuesIter::new(self, self.validity())
    }
418
    /// Returns an iterator over the views of this array, driven by the validity mask.
    // NOTE(review): presumably skips null slots, per the type name — confirm in `NonNullValuesIter`.
    pub fn non_null_views_iter(&self) -> NonNullValuesIter<'_, Buffer<View>> {
        NonNullValuesIter::new(self.views(), self.validity())
    }
423
    // Macro-generated inherent methods. The macros are defined outside this file.
    // NOTE(review): presumably they provide the slicing, validity and boxing helpers
    // (`with_validity`, used by `gc`, is expected to come from here) — confirm at their definition.
    impl_sliced!();
    impl_mut_validity!();
    impl_into_array!();
427
428 pub fn from_slice<S: AsRef<T>, P: AsRef<[Option<S>]>>(slice: P) -> Self {
429 let mutable = MutableBinaryViewArray::from_iterator(
430 slice.as_ref().iter().map(|opt_v| opt_v.as_ref()),
431 );
432 mutable.into()
433 }
434
435 pub fn from_slice_values<S: AsRef<T>, P: AsRef<[S]>>(slice: P) -> Self {
436 let mutable =
437 MutableBinaryViewArray::from_values_iter(slice.as_ref().iter().map(|v| v.as_ref()));
438 mutable.into()
439 }
440
441 pub fn total_bytes_len(&self) -> usize {
443 let total = self.total_bytes_len.load();
444 if total == UNKNOWN_LEN {
445 let total = self.len_iter().map(|v| v as usize).sum::<usize>();
446 self.total_bytes_len.store(total as u64);
447 total
448 } else {
449 total as usize
450 }
451 }
452
    /// Returns the stored total length of the data buffers.
    pub fn total_buffer_len(&self) -> usize {
        self.total_buffer_len
    }
457
458 fn total_unshared_buffer_len(&self) -> usize {
459 self.buffers
463 .iter()
464 .map(|buf| {
465 if buf.storage_refcount() > 1 {
466 0
467 } else {
468 buf.len()
469 }
470 })
471 .sum()
472 }
473
    /// Returns the number of elements, i.e. the number of views.
    #[inline(always)]
    pub fn len(&self) -> usize {
        self.views.len()
    }
478
479 pub fn gc(self) -> Self {
481 if self.buffers.is_empty() {
482 return self;
483 }
484 let mut mutable = MutableBinaryViewArray::with_capacity(self.len());
485 let buffers = self.buffers.as_ref();
486
487 for view in self.views.as_ref() {
488 unsafe { mutable.push_view_unchecked(*view, buffers) }
489 }
490 mutable.freeze().with_validity(self.validity)
491 }
492
493 pub fn deshare(&self) -> Self {
494 if Arc::strong_count(&self.buffers) == 1
495 && self.buffers.iter().all(|b| b.storage_refcount() == 1)
496 {
497 return self.clone();
498 }
499 self.clone().gc()
500 }
501
502 pub fn is_sliced(&self) -> bool {
503 !std::ptr::eq(self.views.as_ptr(), self.views.storage_ptr())
504 }
505
506 pub fn maybe_gc(self) -> Self {
507 const GC_MINIMUM_SAVINGS: usize = 16 * 1024; if self.total_buffer_len <= GC_MINIMUM_SAVINGS {
510 return self;
511 }
512
513 if Arc::strong_count(&self.buffers) != 1 {
514 return self;
518 }
519
520 let total_bytes_len = self.total_bytes_len();
523 let buffer_req_lower_bound = total_bytes_len.saturating_sub(self.len() * 12);
524
525 let lower_bound_mem_usage_post_gc = self.len() * 16 + buffer_req_lower_bound;
526 let cur_mem_usage = self.len() * 16 + self.total_unshared_buffer_len();
528 let savings_upper_bound = cur_mem_usage.saturating_sub(lower_bound_mem_usage_post_gc);
529
530 if savings_upper_bound >= GC_MINIMUM_SAVINGS
531 && cur_mem_usage >= 4 * lower_bound_mem_usage_post_gc
532 {
533 self.gc()
534 } else {
535 self
536 }
537 }
538
539 pub fn make_mut(self) -> MutableBinaryViewArray<T> {
540 let views = self.views.make_mut();
541 let completed_buffers = self.buffers.to_vec();
542 let validity = self.validity.map(|bitmap| bitmap.make_mut());
543
544 let mut total_bytes_len = self.total_bytes_len.load();
546 if total_bytes_len == UNKNOWN_LEN {
547 total_bytes_len = views.iter().map(|view| view.length as u64).sum();
548 }
549 let total_bytes_len = total_bytes_len as usize;
550
551 MutableBinaryViewArray {
552 views,
553 completed_buffers,
554 in_progress_buffer: vec![],
555 validity,
556 phantom: Default::default(),
557 total_bytes_len,
558 total_buffer_len: self.total_buffer_len,
559 stolen_buffers: PlHashMap::new(),
560 }
561 }
562}
563
564impl BinaryViewArray {
565 pub fn validate_utf8(&self) -> PolarsResult<()> {
567 unsafe { validate_views_utf8_only(&self.views, &self.buffers, 0) }
569 }
570
571 pub fn to_utf8view(&self) -> PolarsResult<Utf8ViewArray> {
573 self.validate_utf8()?;
574 unsafe { Ok(self.to_utf8view_unchecked()) }
575 }
576
577 pub unsafe fn to_utf8view_unchecked(&self) -> Utf8ViewArray {
582 Utf8ViewArray::new_unchecked(
583 ArrowDataType::Utf8View,
584 self.views.clone(),
585 self.buffers.clone(),
586 self.validity.clone(),
587 self.total_bytes_len.load() as usize,
588 self.total_buffer_len,
589 )
590 }
591}
592
593impl Utf8ViewArray {
594 pub fn to_binview(&self) -> BinaryViewArray {
595 unsafe {
597 BinaryViewArray::new_unchecked(
598 ArrowDataType::BinaryView,
599 self.views.clone(),
600 self.buffers.clone(),
601 self.validity.clone(),
602 self.total_bytes_len.load() as usize,
603 self.total_buffer_len,
604 )
605 }
606 }
607}
608
609impl<T: ViewType + ?Sized> Array for BinaryViewArrayGeneric<T> {
    /// Returns `self` as `&dyn Any`.
    fn as_any(&self) -> &dyn Any {
        self
    }
613
    /// Returns `self` as `&mut dyn Any`.
    fn as_any_mut(&mut self) -> &mut dyn Any {
        self
    }
617
    /// Delegates to the inherent `len`, i.e. the number of views.
    #[inline(always)]
    fn len(&self) -> usize {
        BinaryViewArrayGeneric::len(self)
    }
622
    /// Returns the static data type associated with `T`; the stored `dtype` field is not
    /// consulted here.
    fn dtype(&self) -> &ArrowDataType {
        T::dtype()
    }
626
    /// Returns the validity mask, if any.
    fn validity(&self) -> Option<&Bitmap> {
        self.validity.as_ref()
    }
630
631 fn split_at_boxed(&self, offset: usize) -> (Box<dyn Array>, Box<dyn Array>) {
632 let (lhs, rhs) = Splitable::split_at(self, offset);
633 (Box::new(lhs), Box::new(rhs))
634 }
635
636 unsafe fn split_at_boxed_unchecked(&self, offset: usize) -> (Box<dyn Array>, Box<dyn Array>) {
637 let (lhs, rhs) = unsafe { Splitable::split_at_unchecked(self, offset) };
638 (Box::new(lhs), Box::new(rhs))
639 }
640
641 fn slice(&mut self, offset: usize, length: usize) {
642 assert!(
643 offset + length <= self.len(),
644 "the offset of the new Buffer cannot exceed the existing length"
645 );
646 unsafe { self.slice_unchecked(offset, length) }
647 }
648
649 unsafe fn slice_unchecked(&mut self, offset: usize, length: usize) {
650 debug_assert!(offset + length <= self.len());
651 self.validity = self
652 .validity
653 .take()
654 .map(|bitmap| bitmap.sliced_unchecked(offset, length))
655 .filter(|bitmap| bitmap.unset_bits() > 0);
656 self.views.slice_unchecked(offset, length);
657 self.total_bytes_len.store(UNKNOWN_LEN)
658 }
659
660 fn with_validity(&self, validity: Option<Bitmap>) -> Box<dyn Array> {
661 debug_assert!(
662 validity.as_ref().is_none_or(|v| v.len() == self.len()),
663 "{} != {}",
664 validity.as_ref().unwrap().len(),
665 self.len()
666 );
667
668 let mut new = self.clone();
669 new.validity = validity;
670 Box::new(new)
671 }
672
    /// Returns a boxed clone of this array.
    fn to_boxed(&self) -> Box<dyn Array> {
        Box::new(self.clone())
    }
676}
677
678impl<T: ViewType + ?Sized> Splitable for BinaryViewArrayGeneric<T> {
679 fn check_bound(&self, offset: usize) -> bool {
680 offset <= self.len()
681 }
682
683 unsafe fn _split_at_unchecked(&self, offset: usize) -> (Self, Self) {
684 let (lhs_views, rhs_views) = unsafe { self.views.split_at_unchecked(offset) };
685 let (lhs_validity, rhs_validity) = unsafe { self.validity.split_at_unchecked(offset) };
686
687 unsafe {
688 (
689 Self::new_unchecked(
690 self.dtype.clone(),
691 lhs_views,
692 self.buffers.clone(),
693 lhs_validity,
694 if offset == 0 { 0 } else { UNKNOWN_LEN as _ },
695 self.total_buffer_len(),
696 ),
697 Self::new_unchecked(
698 self.dtype.clone(),
699 rhs_views,
700 self.buffers.clone(),
701 rhs_validity,
702 if offset == self.len() {
703 0
704 } else {
705 UNKNOWN_LEN as _
706 },
707 self.total_buffer_len(),
708 ),
709 )
710 }
711 }
712}