polars_arrow/array/binary/
mod.rs

1use either::Either;
2
3use super::specification::try_check_offsets_bounds;
4use super::{Array, GenericBinaryArray, Splitable};
5use crate::array::iterator::NonNullValuesIter;
6use crate::bitmap::Bitmap;
7use crate::bitmap::utils::{BitmapIter, ZipValidity};
8use crate::buffer::Buffer;
9use crate::datatypes::ArrowDataType;
10use crate::offset::{Offset, Offsets, OffsetsBuffer};
11use crate::trusted_len::TrustedLen;
12
13mod builder;
14pub use builder::*;
15mod ffi;
16pub(super) mod fmt;
17mod iterator;
18pub use iterator::*;
19mod from;
20mod mutable_values;
21pub use mutable_values::*;
22mod mutable;
23pub use mutable::*;
24use polars_error::{PolarsResult, polars_bail};
25#[cfg(feature = "proptest")]
26pub mod proptest;
27
/// A [`BinaryArray`] is Arrow's semantic equivalent of an immutable `Vec<Option<Vec<u8>>>`.
/// It implements [`Array`].
///
/// The size of this struct is `O(1)`, as all data is stored behind an [`std::sync::Arc`].
/// # Example
/// ```
/// use polars_arrow::array::BinaryArray;
/// use polars_arrow::bitmap::Bitmap;
/// use polars_arrow::buffer::Buffer;
///
/// let array = BinaryArray::<i32>::from([Some([1, 2].as_ref()), None, Some([3].as_ref())]);
/// assert_eq!(array.value(0), &[1, 2]);
/// assert_eq!(array.iter().collect::<Vec<_>>(), vec![Some([1, 2].as_ref()), None, Some([3].as_ref())]);
/// assert_eq!(array.values_iter().collect::<Vec<_>>(), vec![[1, 2].as_ref(), &[], &[3]]);
/// // the underlying representation:
/// assert_eq!(array.values(), &Buffer::from(vec![1, 2, 3]));
/// assert_eq!(array.offsets().buffer(), &Buffer::from(vec![0, 2, 2, 3]));
/// assert_eq!(array.validity(), Some(&Bitmap::from([true, false, true])));
/// ```
///
/// # Generic parameter
/// The generic parameter [`Offset`] can only be `i32` or `i64` and trades off maximum array length
/// against memory usage:
/// * the sum of lengths of all elements cannot exceed `Offset::MAX`
/// * the total size of the underlying data is `array.len() * size_of::<Offset>() + sum of lengths of all elements`
///
/// # Safety
/// The following invariants hold:
/// * Two consecutive `offsets` cast (`as`) to `usize` are valid slices of `values`.
/// * `len` is equal to `validity.len()`, when defined.
#[derive(Clone)]
pub struct BinaryArray<O: Offset> {
    // Logical type; `try_new` only accepts a type whose physical type matches `default_dtype()`.
    dtype: ArrowDataType,
    // Element boundaries into `values`; the array length is `offsets.len_proxy()`.
    offsets: OffsetsBuffer<O>,
    // Backing bytes that every element is sliced out of.
    values: Buffer<u8>,
    // Optional validity bitmap; when present, `try_new` requires its length to equal the array length.
    validity: Option<Bitmap>,
}
65
66impl<O: Offset> BinaryArray<O> {
67    /// Returns a [`BinaryArray`] created from its internal representation.
68    ///
69    /// # Errors
70    /// This function returns an error iff:
71    /// * The last offset is not equal to the values' length.
72    /// * the validity's length is not equal to `offsets.len()`.
73    /// * The `dtype`'s [`crate::datatypes::PhysicalType`] is not equal to either `Binary` or `LargeBinary`.
74    /// # Implementation
75    /// This function is `O(1)`
76    pub fn try_new(
77        dtype: ArrowDataType,
78        offsets: OffsetsBuffer<O>,
79        values: Buffer<u8>,
80        validity: Option<Bitmap>,
81    ) -> PolarsResult<Self> {
82        try_check_offsets_bounds(&offsets, values.len())?;
83
84        if validity
85            .as_ref()
86            .is_some_and(|validity| validity.len() != offsets.len_proxy())
87        {
88            polars_bail!(ComputeError: "validity mask length must match the number of values")
89        }
90
91        if dtype.to_physical_type() != Self::default_dtype().to_physical_type() {
92            polars_bail!(ComputeError: "BinaryArray can only be initialized with DataType::Binary or DataType::LargeBinary")
93        }
94
95        Ok(Self {
96            dtype,
97            offsets,
98            values,
99            validity,
100        })
101    }
102
103    /// Creates a new [`BinaryArray`] without checking invariants.
104    ///
105    /// # Safety
106    ///
107    /// The invariants must be valid (see try_new).
108    pub unsafe fn new_unchecked(
109        dtype: ArrowDataType,
110        offsets: OffsetsBuffer<O>,
111        values: Buffer<u8>,
112        validity: Option<Bitmap>,
113    ) -> Self {
114        Self {
115            dtype,
116            offsets,
117            values,
118            validity,
119        }
120    }
121
122    /// Creates a new [`BinaryArray`] from slices of `&[u8]`.
123    pub fn from_slice<T: AsRef<[u8]>, P: AsRef<[T]>>(slice: P) -> Self {
124        Self::from_trusted_len_values_iter(slice.as_ref().iter())
125    }
126
127    /// Creates a new [`BinaryArray`] from a slice of optional `&[u8]`.
128    // Note: this can't be `impl From` because Rust does not allow double `AsRef` on it.
129    pub fn from<T: AsRef<[u8]>, P: AsRef<[Option<T>]>>(slice: P) -> Self {
130        MutableBinaryArray::<O>::from(slice).into()
131    }
132
133    /// Returns an iterator of `Option<&[u8]>` over every element of this array.
134    pub fn iter(&self) -> ZipValidity<&[u8], BinaryValueIter<O>, BitmapIter> {
135        ZipValidity::new_with_validity(self.values_iter(), self.validity.as_ref())
136    }
137
138    /// Returns an iterator of `&[u8]` over every element of this array, ignoring the validity
139    pub fn values_iter(&self) -> BinaryValueIter<O> {
140        BinaryValueIter::new(self)
141    }
142
143    /// Returns an iterator of the non-null values.
144    #[inline]
145    pub fn non_null_values_iter(&self) -> NonNullValuesIter<'_, BinaryArray<O>> {
146        NonNullValuesIter::new(self, self.validity())
147    }
148
149    /// Returns the length of this array
150    #[inline]
151    pub fn len(&self) -> usize {
152        self.offsets.len_proxy()
153    }
154
155    /// Returns the element at index `i`
156    /// # Panics
157    /// iff `i >= self.len()`
158    #[inline]
159    pub fn value(&self, i: usize) -> &[u8] {
160        assert!(i < self.len());
161        unsafe { self.value_unchecked(i) }
162    }
163
164    /// Returns the element at index `i`
165    ///
166    /// # Safety
167    /// Assumes that the `i < self.len`.
168    #[inline]
169    pub unsafe fn value_unchecked(&self, i: usize) -> &[u8] {
170        // soundness: the invariant of the function
171        let (start, end) = self.offsets.start_end_unchecked(i);
172
173        // soundness: the invariant of the struct
174        self.values.get_unchecked(start..end)
175    }
176
177    /// Returns the element at index `i` or `None` if it is null
178    /// # Panics
179    /// iff `i >= self.len()`
180    #[inline]
181    pub fn get(&self, i: usize) -> Option<&[u8]> {
182        if !self.is_null(i) {
183            // soundness: Array::is_null panics if i >= self.len
184            unsafe { Some(self.value_unchecked(i)) }
185        } else {
186            None
187        }
188    }
189
190    /// Returns the [`ArrowDataType`] of this array.
191    #[inline]
192    pub fn dtype(&self) -> &ArrowDataType {
193        &self.dtype
194    }
195
196    /// Returns the values of this [`BinaryArray`].
197    #[inline]
198    pub fn values(&self) -> &Buffer<u8> {
199        &self.values
200    }
201
202    /// Returns the offsets of this [`BinaryArray`].
203    #[inline]
204    pub fn offsets(&self) -> &OffsetsBuffer<O> {
205        &self.offsets
206    }
207
208    /// The optional validity.
209    #[inline]
210    pub fn validity(&self) -> Option<&Bitmap> {
211        self.validity.as_ref()
212    }
213
214    /// Slices this [`BinaryArray`].
215    /// # Implementation
216    /// This function is `O(1)`.
217    /// # Panics
218    /// iff `offset + length > self.len()`.
219    pub fn slice(&mut self, offset: usize, length: usize) {
220        assert!(
221            offset + length <= self.len(),
222            "the offset of the new Buffer cannot exceed the existing length"
223        );
224        unsafe { self.slice_unchecked(offset, length) }
225    }
226
227    /// Slices this [`BinaryArray`].
228    /// # Implementation
229    /// This function is `O(1)`.
230    ///
231    /// # Safety
232    /// The caller must ensure that `offset + length <= self.len()`.
233    pub unsafe fn slice_unchecked(&mut self, offset: usize, length: usize) {
234        self.validity = self
235            .validity
236            .take()
237            .map(|bitmap| bitmap.sliced_unchecked(offset, length))
238            .filter(|bitmap| bitmap.unset_bits() > 0);
239        self.offsets.slice_unchecked(offset, length + 1);
240    }
241
    // NOTE(review): these macros are defined outside this file. Presumably they generate the
    // owned slicing helpers, the validity mutators (an inherent `with_validity` is called on an
    // owned array in the `Array` impl of this file) and the boxing conversions — confirm against
    // the macro definitions.
    impl_sliced!();
    impl_mut_validity!();
    impl_into_array!();
245
246    /// Returns its internal representation
247    #[must_use]
248    pub fn into_inner(self) -> (ArrowDataType, OffsetsBuffer<O>, Buffer<u8>, Option<Bitmap>) {
249        let Self {
250            dtype,
251            offsets,
252            values,
253            validity,
254        } = self;
255        (dtype, offsets, values, validity)
256    }
257
258    /// Try to convert this `BinaryArray` to a `MutableBinaryArray`
259    #[must_use]
260    pub fn into_mut(self) -> Either<Self, MutableBinaryArray<O>> {
261        use Either::*;
262        if let Some(bitmap) = self.validity {
263            match bitmap.into_mut() {
264                // SAFETY: invariants are preserved
265                Left(bitmap) => Left(BinaryArray::new(
266                    self.dtype,
267                    self.offsets,
268                    self.values,
269                    Some(bitmap),
270                )),
271                Right(mutable_bitmap) => match (self.values.into_mut(), self.offsets.into_mut()) {
272                    (Left(values), Left(offsets)) => Left(BinaryArray::new(
273                        self.dtype,
274                        offsets,
275                        values,
276                        Some(mutable_bitmap.into()),
277                    )),
278                    (Left(values), Right(offsets)) => Left(BinaryArray::new(
279                        self.dtype,
280                        offsets.into(),
281                        values,
282                        Some(mutable_bitmap.into()),
283                    )),
284                    (Right(values), Left(offsets)) => Left(BinaryArray::new(
285                        self.dtype,
286                        offsets,
287                        values.into(),
288                        Some(mutable_bitmap.into()),
289                    )),
290                    (Right(values), Right(offsets)) => Right(
291                        MutableBinaryArray::try_new(
292                            self.dtype,
293                            offsets,
294                            values,
295                            Some(mutable_bitmap),
296                        )
297                        .unwrap(),
298                    ),
299                },
300            }
301        } else {
302            match (self.values.into_mut(), self.offsets.into_mut()) {
303                (Left(values), Left(offsets)) => {
304                    Left(BinaryArray::new(self.dtype, offsets, values, None))
305                },
306                (Left(values), Right(offsets)) => {
307                    Left(BinaryArray::new(self.dtype, offsets.into(), values, None))
308                },
309                (Right(values), Left(offsets)) => {
310                    Left(BinaryArray::new(self.dtype, offsets, values.into(), None))
311                },
312                (Right(values), Right(offsets)) => {
313                    Right(MutableBinaryArray::try_new(self.dtype, offsets, values, None).unwrap())
314                },
315            }
316        }
317    }
318
319    /// Creates an empty [`BinaryArray`], i.e. whose `.len` is zero.
320    pub fn new_empty(dtype: ArrowDataType) -> Self {
321        Self::new(dtype, OffsetsBuffer::new(), Buffer::new(), None)
322    }
323
324    /// Creates an null [`BinaryArray`], i.e. whose `.null_count() == .len()`.
325    #[inline]
326    pub fn new_null(dtype: ArrowDataType, length: usize) -> Self {
327        unsafe {
328            Self::new_unchecked(
329                dtype,
330                Offsets::new_zeroed(length).into(),
331                Buffer::new(),
332                Some(Bitmap::new_zeroed(length)),
333            )
334        }
335    }
336
337    /// Returns the default [`ArrowDataType`], `DataType::Binary` or `DataType::LargeBinary`
338    pub fn default_dtype() -> ArrowDataType {
339        if O::IS_LARGE {
340            ArrowDataType::LargeBinary
341        } else {
342            ArrowDataType::Binary
343        }
344    }
345
346    /// Alias for unwrapping [`Self::try_new`]
347    pub fn new(
348        dtype: ArrowDataType,
349        offsets: OffsetsBuffer<O>,
350        values: Buffer<u8>,
351        validity: Option<Bitmap>,
352    ) -> Self {
353        Self::try_new(dtype, offsets, values, validity).unwrap()
354    }
355
356    /// Returns a [`BinaryArray`] from an iterator of trusted length.
357    ///
358    /// The [`BinaryArray`] is guaranteed to not have a validity
359    #[inline]
360    pub fn from_trusted_len_values_iter<T: AsRef<[u8]>, I: TrustedLen<Item = T>>(
361        iterator: I,
362    ) -> Self {
363        MutableBinaryArray::<O>::from_trusted_len_values_iter(iterator).into()
364    }
365
366    /// Returns a new [`BinaryArray`] from a [`Iterator`] of `&[u8]`.
367    ///
368    /// The [`BinaryArray`] is guaranteed to not have a validity
369    pub fn from_iter_values<T: AsRef<[u8]>, I: Iterator<Item = T>>(iterator: I) -> Self {
370        MutableBinaryArray::<O>::from_iter_values(iterator).into()
371    }
372
373    /// Creates a [`BinaryArray`] from an iterator of trusted length.
374    ///
375    /// # Safety
376    /// The iterator must be [`TrustedLen`](https://doc.rust-lang.org/std/iter/trait.TrustedLen.html).
377    /// I.e. that `size_hint().1` correctly reports its length.
378    #[inline]
379    pub unsafe fn from_trusted_len_iter_unchecked<I, P>(iterator: I) -> Self
380    where
381        P: AsRef<[u8]>,
382        I: Iterator<Item = Option<P>>,
383    {
384        MutableBinaryArray::<O>::from_trusted_len_iter_unchecked(iterator).into()
385    }
386
387    /// Creates a [`BinaryArray`] from a [`TrustedLen`]
388    #[inline]
389    pub fn from_trusted_len_iter<I, P>(iterator: I) -> Self
390    where
391        P: AsRef<[u8]>,
392        I: TrustedLen<Item = Option<P>>,
393    {
394        // soundness: I is `TrustedLen`
395        unsafe { Self::from_trusted_len_iter_unchecked(iterator) }
396    }
397
398    /// Creates a [`BinaryArray`] from an falible iterator of trusted length.
399    ///
400    /// # Safety
401    /// The iterator must be [`TrustedLen`](https://doc.rust-lang.org/std/iter/trait.TrustedLen.html).
402    /// I.e. that `size_hint().1` correctly reports its length.
403    #[inline]
404    pub unsafe fn try_from_trusted_len_iter_unchecked<E, I, P>(iterator: I) -> Result<Self, E>
405    where
406        P: AsRef<[u8]>,
407        I: IntoIterator<Item = Result<Option<P>, E>>,
408    {
409        MutableBinaryArray::<O>::try_from_trusted_len_iter_unchecked(iterator).map(|x| x.into())
410    }
411
412    /// Creates a [`BinaryArray`] from an fallible iterator of trusted length.
413    #[inline]
414    pub fn try_from_trusted_len_iter<E, I, P>(iter: I) -> Result<Self, E>
415    where
416        P: AsRef<[u8]>,
417        I: TrustedLen<Item = Result<Option<P>, E>>,
418    {
419        // soundness: I: TrustedLen
420        unsafe { Self::try_from_trusted_len_iter_unchecked(iter) }
421    }
422}
423
424impl<O: Offset> Array for BinaryArray<O> {
425    impl_common_array!();
426
427    fn validity(&self) -> Option<&Bitmap> {
428        self.validity.as_ref()
429    }
430
431    #[inline]
432    fn with_validity(&self, validity: Option<Bitmap>) -> Box<dyn Array> {
433        Box::new(self.clone().with_validity(validity))
434    }
435}
436
437unsafe impl<O: Offset> GenericBinaryArray<O> for BinaryArray<O> {
438    #[inline]
439    fn values(&self) -> &[u8] {
440        self.values()
441    }
442
443    #[inline]
444    fn offsets(&self) -> &[O] {
445        self.offsets().buffer()
446    }
447}
448
449impl<O: Offset> Splitable for BinaryArray<O> {
450    #[inline(always)]
451    fn check_bound(&self, offset: usize) -> bool {
452        offset <= self.len()
453    }
454
455    unsafe fn _split_at_unchecked(&self, offset: usize) -> (Self, Self) {
456        let (lhs_offsets, rhs_offsets) = unsafe { self.offsets.split_at_unchecked(offset) };
457        let (lhs_validity, rhs_validity) = unsafe { self.validity.split_at_unchecked(offset) };
458
459        (
460            Self {
461                dtype: self.dtype.clone(),
462                offsets: lhs_offsets,
463                values: self.values.clone(),
464                validity: lhs_validity,
465            },
466            Self {
467                dtype: self.dtype.clone(),
468                offsets: rhs_offsets,
469                values: self.values.clone(),
470                validity: rhs_validity,
471            },
472        )
473    }
474}