polars_arrow/array/binary/
mod.rs

1use either::Either;
2
3use super::specification::try_check_offsets_bounds;
4use super::{Array, GenericBinaryArray, Splitable};
5use crate::array::iterator::NonNullValuesIter;
6use crate::bitmap::Bitmap;
7use crate::bitmap::utils::{BitmapIter, ZipValidity};
8use crate::buffer::Buffer;
9use crate::datatypes::ArrowDataType;
10use crate::offset::{Offset, Offsets, OffsetsBuffer};
11use crate::trusted_len::TrustedLen;
12
13mod builder;
14pub use builder::*;
15mod ffi;
16pub(super) mod fmt;
17mod iterator;
18pub use iterator::*;
19mod from;
20mod mutable_values;
21pub use mutable_values::*;
22mod mutable;
23pub use mutable::*;
24use polars_error::{PolarsResult, polars_bail};
25
/// A [`BinaryArray`] is Arrow's semantic equivalent of an immutable `Vec<Option<Vec<u8>>>`.
/// It implements [`Array`].
///
/// The size of this struct is `O(1)`, as all data is stored behind an [`std::sync::Arc`].
/// # Example
/// ```
/// use polars_arrow::array::BinaryArray;
/// use polars_arrow::bitmap::Bitmap;
/// use polars_arrow::buffer::Buffer;
///
/// let array = BinaryArray::<i32>::from([Some([1, 2].as_ref()), None, Some([3].as_ref())]);
/// assert_eq!(array.value(0), &[1, 2]);
/// assert_eq!(array.iter().collect::<Vec<_>>(), vec![Some([1, 2].as_ref()), None, Some([3].as_ref())]);
/// assert_eq!(array.values_iter().collect::<Vec<_>>(), vec![[1, 2].as_ref(), &[], &[3]]);
/// // the underlying representation:
/// assert_eq!(array.values(), &Buffer::from(vec![1, 2, 3]));
/// assert_eq!(array.offsets().buffer(), &Buffer::from(vec![0, 2, 2, 3]));
/// assert_eq!(array.validity(), Some(&Bitmap::from([true, false, true])));
/// ```
///
/// # Generic parameter
/// The generic parameter [`Offset`] can only be `i32` or `i64` and trades off maximum array
/// length against memory usage:
/// * the sum of lengths of all elements cannot exceed `Offset::MAX`
/// * the total size of the underlying data is
///   `(array.len() + 1) * size_of::<Offset>() + sum of lengths of all elements`
///   (there is one more offset than there are elements, as in the example above)
///
/// # Safety
/// The following invariants hold:
/// * Two consecutive `offsets` cast (`as`) to `usize` are valid slices of `values`.
/// * `len` is equal to `validity.len()`, when defined.
#[derive(Clone)]
pub struct BinaryArray<O: Offset> {
    // Logical data type; `try_new` requires its physical type to match `default_dtype()`.
    dtype: ArrowDataType,
    // Element boundaries: element `i` is delimited by two consecutive offsets.
    offsets: OffsetsBuffer<O>,
    // Concatenated bytes of all elements.
    values: Buffer<u8>,
    // Optional validity bitmap (a `false` bit marks a null slot, see the example above).
    validity: Option<Bitmap>,
}
63
64impl<O: Offset> BinaryArray<O> {
65    /// Returns a [`BinaryArray`] created from its internal representation.
66    ///
67    /// # Errors
68    /// This function returns an error iff:
69    /// * The last offset is not equal to the values' length.
70    /// * the validity's length is not equal to `offsets.len()`.
71    /// * The `dtype`'s [`crate::datatypes::PhysicalType`] is not equal to either `Binary` or `LargeBinary`.
72    /// # Implementation
73    /// This function is `O(1)`
74    pub fn try_new(
75        dtype: ArrowDataType,
76        offsets: OffsetsBuffer<O>,
77        values: Buffer<u8>,
78        validity: Option<Bitmap>,
79    ) -> PolarsResult<Self> {
80        try_check_offsets_bounds(&offsets, values.len())?;
81
82        if validity
83            .as_ref()
84            .is_some_and(|validity| validity.len() != offsets.len_proxy())
85        {
86            polars_bail!(ComputeError: "validity mask length must match the number of values")
87        }
88
89        if dtype.to_physical_type() != Self::default_dtype().to_physical_type() {
90            polars_bail!(ComputeError: "BinaryArray can only be initialized with DataType::Binary or DataType::LargeBinary")
91        }
92
93        Ok(Self {
94            dtype,
95            offsets,
96            values,
97            validity,
98        })
99    }
100
    /// Creates a new [`BinaryArray`] from its raw parts without checking any invariant.
    ///
    /// # Safety
    ///
    /// The caller must guarantee that every check performed by [`Self::try_new`] would
    /// have succeeded for these arguments; nothing is validated here.
    pub unsafe fn new_unchecked(
        dtype: ArrowDataType,
        offsets: OffsetsBuffer<O>,
        values: Buffer<u8>,
        validity: Option<Bitmap>,
    ) -> Self {
        // The parts are stored exactly as given.
        Self {
            dtype,
            offsets,
            values,
            validity,
        }
    }
119
120    /// Creates a new [`BinaryArray`] from slices of `&[u8]`.
121    pub fn from_slice<T: AsRef<[u8]>, P: AsRef<[T]>>(slice: P) -> Self {
122        Self::from_trusted_len_values_iter(slice.as_ref().iter())
123    }
124
125    /// Creates a new [`BinaryArray`] from a slice of optional `&[u8]`.
126    // Note: this can't be `impl From` because Rust does not allow double `AsRef` on it.
127    pub fn from<T: AsRef<[u8]>, P: AsRef<[Option<T>]>>(slice: P) -> Self {
128        MutableBinaryArray::<O>::from(slice).into()
129    }
130
131    /// Returns an iterator of `Option<&[u8]>` over every element of this array.
132    pub fn iter(&self) -> ZipValidity<&[u8], BinaryValueIter<O>, BitmapIter> {
133        ZipValidity::new_with_validity(self.values_iter(), self.validity.as_ref())
134    }
135
    /// Returns an iterator of `&[u8]` over every element of this array, ignoring the validity.
    ///
    /// Null slots are yielded as well, with whatever bytes their offsets delimit.
    pub fn values_iter(&self) -> BinaryValueIter<O> {
        BinaryValueIter::new(self)
    }
140
141    /// Returns an iterator of the non-null values.
142    #[inline]
143    pub fn non_null_values_iter(&self) -> NonNullValuesIter<'_, BinaryArray<O>> {
144        NonNullValuesIter::new(self, self.validity())
145    }
146
    /// Returns the length of this array, i.e. its number of elements.
    ///
    /// The length is derived from the offsets (`offsets.len_proxy()`).
    #[inline]
    pub fn len(&self) -> usize {
        self.offsets.len_proxy()
    }
152
    /// Returns the element at index `i`, regardless of the validity.
    /// # Panics
    /// iff `i >= self.len()`
    #[inline]
    pub fn value(&self, i: usize) -> &[u8] {
        assert!(i < self.len());
        // SAFETY: `i < self.len()` was asserted just above.
        unsafe { self.value_unchecked(i) }
    }
161
162    /// Returns the element at index `i`
163    ///
164    /// # Safety
165    /// Assumes that the `i < self.len`.
166    #[inline]
167    pub unsafe fn value_unchecked(&self, i: usize) -> &[u8] {
168        // soundness: the invariant of the function
169        let (start, end) = self.offsets.start_end_unchecked(i);
170
171        // soundness: the invariant of the struct
172        self.values.get_unchecked(start..end)
173    }
174
175    /// Returns the element at index `i` or `None` if it is null
176    /// # Panics
177    /// iff `i >= self.len()`
178    #[inline]
179    pub fn get(&self, i: usize) -> Option<&[u8]> {
180        if !self.is_null(i) {
181            // soundness: Array::is_null panics if i >= self.len
182            unsafe { Some(self.value_unchecked(i)) }
183        } else {
184            None
185        }
186    }
187
    /// Returns a reference to the [`ArrowDataType`] this array was created with.
    #[inline]
    pub fn dtype(&self) -> &ArrowDataType {
        &self.dtype
    }
193
    /// Returns the values buffer of this [`BinaryArray`], i.e. the bytes of all
    /// elements stored back to back (see [`Self::offsets`] for the element boundaries).
    #[inline]
    pub fn values(&self) -> &Buffer<u8> {
        &self.values
    }
199
    /// Returns the offsets of this [`BinaryArray`]; two consecutive offsets delimit
    /// one element inside [`Self::values`].
    #[inline]
    pub fn offsets(&self) -> &OffsetsBuffer<O> {
        &self.offsets
    }
205
    /// Returns the optional validity bitmap, or `None` when the array stores none.
    #[inline]
    pub fn validity(&self) -> Option<&Bitmap> {
        self.validity.as_ref()
    }
211
212    /// Slices this [`BinaryArray`].
213    /// # Implementation
214    /// This function is `O(1)`.
215    /// # Panics
216    /// iff `offset + length > self.len()`.
217    pub fn slice(&mut self, offset: usize, length: usize) {
218        assert!(
219            offset + length <= self.len(),
220            "the offset of the new Buffer cannot exceed the existing length"
221        );
222        unsafe { self.slice_unchecked(offset, length) }
223    }
224
225    /// Slices this [`BinaryArray`].
226    /// # Implementation
227    /// This function is `O(1)`.
228    ///
229    /// # Safety
230    /// The caller must ensure that `offset + length <= self.len()`.
231    pub unsafe fn slice_unchecked(&mut self, offset: usize, length: usize) {
232        self.validity = self
233            .validity
234            .take()
235            .map(|bitmap| bitmap.sliced_unchecked(offset, length))
236            .filter(|bitmap| bitmap.unset_bits() > 0);
237        self.offsets.slice_unchecked(offset, length + 1);
238    }
239
    // Macro-generated inherent methods; the macro definitions live outside this file.
    // NOTE(review): presumably these provide the `sliced*` variants, the validity
    // setters (e.g. the by-value `with_validity` called from the `Array` impl in this
    // file) and the boxed/arc conversions. Confirm against the macro definitions.
    impl_sliced!();
    impl_mut_validity!();
    impl_into_array!();
243
244    /// Returns its internal representation
245    #[must_use]
246    pub fn into_inner(self) -> (ArrowDataType, OffsetsBuffer<O>, Buffer<u8>, Option<Bitmap>) {
247        let Self {
248            dtype,
249            offsets,
250            values,
251            validity,
252        } = self;
253        (dtype, offsets, values, validity)
254    }
255
256    /// Try to convert this `BinaryArray` to a `MutableBinaryArray`
257    #[must_use]
258    pub fn into_mut(self) -> Either<Self, MutableBinaryArray<O>> {
259        use Either::*;
260        if let Some(bitmap) = self.validity {
261            match bitmap.into_mut() {
262                // SAFETY: invariants are preserved
263                Left(bitmap) => Left(BinaryArray::new(
264                    self.dtype,
265                    self.offsets,
266                    self.values,
267                    Some(bitmap),
268                )),
269                Right(mutable_bitmap) => match (self.values.into_mut(), self.offsets.into_mut()) {
270                    (Left(values), Left(offsets)) => Left(BinaryArray::new(
271                        self.dtype,
272                        offsets,
273                        values,
274                        Some(mutable_bitmap.into()),
275                    )),
276                    (Left(values), Right(offsets)) => Left(BinaryArray::new(
277                        self.dtype,
278                        offsets.into(),
279                        values,
280                        Some(mutable_bitmap.into()),
281                    )),
282                    (Right(values), Left(offsets)) => Left(BinaryArray::new(
283                        self.dtype,
284                        offsets,
285                        values.into(),
286                        Some(mutable_bitmap.into()),
287                    )),
288                    (Right(values), Right(offsets)) => Right(
289                        MutableBinaryArray::try_new(
290                            self.dtype,
291                            offsets,
292                            values,
293                            Some(mutable_bitmap),
294                        )
295                        .unwrap(),
296                    ),
297                },
298            }
299        } else {
300            match (self.values.into_mut(), self.offsets.into_mut()) {
301                (Left(values), Left(offsets)) => {
302                    Left(BinaryArray::new(self.dtype, offsets, values, None))
303                },
304                (Left(values), Right(offsets)) => {
305                    Left(BinaryArray::new(self.dtype, offsets.into(), values, None))
306                },
307                (Right(values), Left(offsets)) => {
308                    Left(BinaryArray::new(self.dtype, offsets, values.into(), None))
309                },
310                (Right(values), Right(offsets)) => {
311                    Right(MutableBinaryArray::try_new(self.dtype, offsets, values, None).unwrap())
312                },
313            }
314        }
315    }
316
317    /// Creates an empty [`BinaryArray`], i.e. whose `.len` is zero.
318    pub fn new_empty(dtype: ArrowDataType) -> Self {
319        Self::new(dtype, OffsetsBuffer::new(), Buffer::new(), None)
320    }
321
322    /// Creates an null [`BinaryArray`], i.e. whose `.null_count() == .len()`.
323    #[inline]
324    pub fn new_null(dtype: ArrowDataType, length: usize) -> Self {
325        unsafe {
326            Self::new_unchecked(
327                dtype,
328                Offsets::new_zeroed(length).into(),
329                Buffer::new(),
330                Some(Bitmap::new_zeroed(length)),
331            )
332        }
333    }
334
335    /// Returns the default [`ArrowDataType`], `DataType::Binary` or `DataType::LargeBinary`
336    pub fn default_dtype() -> ArrowDataType {
337        if O::IS_LARGE {
338            ArrowDataType::LargeBinary
339        } else {
340            ArrowDataType::Binary
341        }
342    }
343
344    /// Alias for unwrapping [`Self::try_new`]
345    pub fn new(
346        dtype: ArrowDataType,
347        offsets: OffsetsBuffer<O>,
348        values: Buffer<u8>,
349        validity: Option<Bitmap>,
350    ) -> Self {
351        Self::try_new(dtype, offsets, values, validity).unwrap()
352    }
353
354    /// Returns a [`BinaryArray`] from an iterator of trusted length.
355    ///
356    /// The [`BinaryArray`] is guaranteed to not have a validity
357    #[inline]
358    pub fn from_trusted_len_values_iter<T: AsRef<[u8]>, I: TrustedLen<Item = T>>(
359        iterator: I,
360    ) -> Self {
361        MutableBinaryArray::<O>::from_trusted_len_values_iter(iterator).into()
362    }
363
364    /// Returns a new [`BinaryArray`] from a [`Iterator`] of `&[u8]`.
365    ///
366    /// The [`BinaryArray`] is guaranteed to not have a validity
367    pub fn from_iter_values<T: AsRef<[u8]>, I: Iterator<Item = T>>(iterator: I) -> Self {
368        MutableBinaryArray::<O>::from_iter_values(iterator).into()
369    }
370
371    /// Creates a [`BinaryArray`] from an iterator of trusted length.
372    ///
373    /// # Safety
374    /// The iterator must be [`TrustedLen`](https://doc.rust-lang.org/std/iter/trait.TrustedLen.html).
375    /// I.e. that `size_hint().1` correctly reports its length.
376    #[inline]
377    pub unsafe fn from_trusted_len_iter_unchecked<I, P>(iterator: I) -> Self
378    where
379        P: AsRef<[u8]>,
380        I: Iterator<Item = Option<P>>,
381    {
382        MutableBinaryArray::<O>::from_trusted_len_iter_unchecked(iterator).into()
383    }
384
385    /// Creates a [`BinaryArray`] from a [`TrustedLen`]
386    #[inline]
387    pub fn from_trusted_len_iter<I, P>(iterator: I) -> Self
388    where
389        P: AsRef<[u8]>,
390        I: TrustedLen<Item = Option<P>>,
391    {
392        // soundness: I is `TrustedLen`
393        unsafe { Self::from_trusted_len_iter_unchecked(iterator) }
394    }
395
396    /// Creates a [`BinaryArray`] from an falible iterator of trusted length.
397    ///
398    /// # Safety
399    /// The iterator must be [`TrustedLen`](https://doc.rust-lang.org/std/iter/trait.TrustedLen.html).
400    /// I.e. that `size_hint().1` correctly reports its length.
401    #[inline]
402    pub unsafe fn try_from_trusted_len_iter_unchecked<E, I, P>(iterator: I) -> Result<Self, E>
403    where
404        P: AsRef<[u8]>,
405        I: IntoIterator<Item = Result<Option<P>, E>>,
406    {
407        MutableBinaryArray::<O>::try_from_trusted_len_iter_unchecked(iterator).map(|x| x.into())
408    }
409
410    /// Creates a [`BinaryArray`] from an fallible iterator of trusted length.
411    #[inline]
412    pub fn try_from_trusted_len_iter<E, I, P>(iter: I) -> Result<Self, E>
413    where
414        P: AsRef<[u8]>,
415        I: TrustedLen<Item = Result<Option<P>, E>>,
416    {
417        // soundness: I: TrustedLen
418        unsafe { Self::try_from_trusted_len_iter_unchecked(iter) }
419    }
420}
421
422impl<O: Offset> Array for BinaryArray<O> {
423    impl_common_array!();
424
425    fn validity(&self) -> Option<&Bitmap> {
426        self.validity.as_ref()
427    }
428
429    #[inline]
430    fn with_validity(&self, validity: Option<Bitmap>) -> Box<dyn Array> {
431        Box::new(self.clone().with_validity(validity))
432    }
433}
434
435unsafe impl<O: Offset> GenericBinaryArray<O> for BinaryArray<O> {
436    #[inline]
437    fn values(&self) -> &[u8] {
438        self.values()
439    }
440
441    #[inline]
442    fn offsets(&self) -> &[O] {
443        self.offsets().buffer()
444    }
445}
446
447impl<O: Offset> Splitable for BinaryArray<O> {
448    #[inline(always)]
449    fn check_bound(&self, offset: usize) -> bool {
450        offset <= self.len()
451    }
452
453    unsafe fn _split_at_unchecked(&self, offset: usize) -> (Self, Self) {
454        let (lhs_offsets, rhs_offsets) = unsafe { self.offsets.split_at_unchecked(offset) };
455        let (lhs_validity, rhs_validity) = unsafe { self.validity.split_at_unchecked(offset) };
456
457        (
458            Self {
459                dtype: self.dtype.clone(),
460                offsets: lhs_offsets,
461                values: self.values.clone(),
462                validity: lhs_validity,
463            },
464            Self {
465                dtype: self.dtype.clone(),
466                offsets: rhs_offsets,
467                values: self.values.clone(),
468                validity: rhs_validity,
469            },
470        )
471    }
472}