polars_arrow/array/binary/
mod.rs1use either::Either;
2
3use super::specification::try_check_offsets_bounds;
4use super::{Array, GenericBinaryArray, Splitable};
5use crate::array::iterator::NonNullValuesIter;
6use crate::bitmap::Bitmap;
7use crate::bitmap::utils::{BitmapIter, ZipValidity};
8use crate::buffer::Buffer;
9use crate::datatypes::ArrowDataType;
10use crate::offset::{Offset, Offsets, OffsetsBuffer};
11use crate::trusted_len::TrustedLen;
12
13mod builder;
14pub use builder::*;
15mod ffi;
16pub(super) mod fmt;
17mod iterator;
18pub use iterator::*;
19mod from;
20mod mutable_values;
21pub use mutable_values::*;
22mod mutable;
23pub use mutable::*;
24use polars_error::{PolarsResult, polars_bail};
25#[cfg(feature = "proptest")]
26pub mod proptest;
27
/// A variable-length binary array.
///
/// Element `i` is the byte range of `values` delimited by the `i`-th start/end pair of
/// `offsets`; `validity`, when present, holds one bit per element.
#[derive(Clone)]
pub struct BinaryArray<O: Offset> {
    /// Logical data type; `try_new` requires its physical type to match `default_dtype()`.
    dtype: ArrowDataType,
    /// Element boundaries into `values`; `len()` is derived from this buffer.
    offsets: OffsetsBuffer<O>,
    /// Bytes of all elements, addressed through `offsets`.
    values: Buffer<u8>,
    /// Optional per-element bitmap; `try_new` requires its length to equal the element count.
    validity: Option<Bitmap>,
}
65
66impl<O: Offset> BinaryArray<O> {
67 pub fn try_new(
77 dtype: ArrowDataType,
78 offsets: OffsetsBuffer<O>,
79 values: Buffer<u8>,
80 validity: Option<Bitmap>,
81 ) -> PolarsResult<Self> {
82 try_check_offsets_bounds(&offsets, values.len())?;
83
84 if validity
85 .as_ref()
86 .is_some_and(|validity| validity.len() != offsets.len_proxy())
87 {
88 polars_bail!(ComputeError: "validity mask length must match the number of values")
89 }
90
91 if dtype.to_physical_type() != Self::default_dtype().to_physical_type() {
92 polars_bail!(ComputeError: "BinaryArray can only be initialized with DataType::Binary or DataType::LargeBinary")
93 }
94
95 Ok(Self {
96 dtype,
97 offsets,
98 values,
99 validity,
100 })
101 }
102
103 pub unsafe fn new_unchecked(
109 dtype: ArrowDataType,
110 offsets: OffsetsBuffer<O>,
111 values: Buffer<u8>,
112 validity: Option<Bitmap>,
113 ) -> Self {
114 Self {
115 dtype,
116 offsets,
117 values,
118 validity,
119 }
120 }
121
122 pub fn from_slice<T: AsRef<[u8]>, P: AsRef<[T]>>(slice: P) -> Self {
124 Self::from_trusted_len_values_iter(slice.as_ref().iter())
125 }
126
127 pub fn from<T: AsRef<[u8]>, P: AsRef<[Option<T>]>>(slice: P) -> Self {
130 MutableBinaryArray::<O>::from(slice).into()
131 }
132
133 pub fn iter(&self) -> ZipValidity<&[u8], BinaryValueIter<O>, BitmapIter> {
135 ZipValidity::new_with_validity(self.values_iter(), self.validity.as_ref())
136 }
137
    /// Returns a [`BinaryValueIter`] over this array; the validity bitmap is not passed to it.
    pub fn values_iter(&self) -> BinaryValueIter<O> {
        BinaryValueIter::new(self)
    }
142
143 #[inline]
145 pub fn non_null_values_iter(&self) -> NonNullValuesIter<'_, BinaryArray<O>> {
146 NonNullValuesIter::new(self, self.validity())
147 }
148
    /// Returns the number of elements, as reported by the offsets buffer.
    #[inline]
    pub fn len(&self) -> usize {
        self.offsets.len_proxy()
    }
154
    /// Returns the bytes of element `i`, ignoring validity.
    ///
    /// # Panics
    /// Panics if `i >= self.len()`.
    #[inline]
    pub fn value(&self, i: usize) -> &[u8] {
        assert!(i < self.len());
        // SAFETY: `i < self.len()` was asserted on the line above.
        unsafe { self.value_unchecked(i) }
    }
163
164 #[inline]
169 pub unsafe fn value_unchecked(&self, i: usize) -> &[u8] {
170 let (start, end) = self.offsets.start_end_unchecked(i);
172
173 self.values.get_unchecked(start..end)
175 }
176
177 #[inline]
181 pub fn get(&self, i: usize) -> Option<&[u8]> {
182 if !self.is_null(i) {
183 unsafe { Some(self.value_unchecked(i)) }
185 } else {
186 None
187 }
188 }
189
    /// Returns the [`ArrowDataType`] stored in this array.
    #[inline]
    pub fn dtype(&self) -> &ArrowDataType {
        &self.dtype
    }
195
    /// Returns the underlying values buffer (the bytes addressed through the offsets).
    #[inline]
    pub fn values(&self) -> &Buffer<u8> {
        &self.values
    }
201
    /// Returns the offsets buffer that delimits each element inside the values buffer.
    #[inline]
    pub fn offsets(&self) -> &OffsetsBuffer<O> {
        &self.offsets
    }
207
    /// Returns the validity bitmap, or `None` when the array carries none.
    #[inline]
    pub fn validity(&self) -> Option<&Bitmap> {
        self.validity.as_ref()
    }
213
214 pub fn slice(&mut self, offset: usize, length: usize) {
220 assert!(
221 offset + length <= self.len(),
222 "the offset of the new Buffer cannot exceed the existing length"
223 );
224 unsafe { self.slice_unchecked(offset, length) }
225 }
226
227 pub unsafe fn slice_unchecked(&mut self, offset: usize, length: usize) {
234 self.validity = self
235 .validity
236 .take()
237 .map(|bitmap| bitmap.sliced_unchecked(offset, length))
238 .filter(|bitmap| bitmap.unset_bits() > 0);
239 self.offsets.slice_unchecked(offset, length + 1);
240 }
241
    // Macro-generated methods; the macro definitions live outside this file.
    // NOTE(review): going by the names, these presumably add owned slicing helpers, validity
    // setters (an inherent `with_validity` is called by the `Array` impl in this file), and
    // conversions into boxed/shared arrays — confirm against the macro definitions.
    impl_sliced!();
    impl_mut_validity!();
    impl_into_array!();
245
246 #[must_use]
248 pub fn into_inner(self) -> (ArrowDataType, OffsetsBuffer<O>, Buffer<u8>, Option<Bitmap>) {
249 let Self {
250 dtype,
251 offsets,
252 values,
253 validity,
254 } = self;
255 (dtype, offsets, values, validity)
256 }
257
258 #[must_use]
260 pub fn into_mut(self) -> Either<Self, MutableBinaryArray<O>> {
261 use Either::*;
262 if let Some(bitmap) = self.validity {
263 match bitmap.into_mut() {
264 Left(bitmap) => Left(BinaryArray::new(
266 self.dtype,
267 self.offsets,
268 self.values,
269 Some(bitmap),
270 )),
271 Right(mutable_bitmap) => match (self.values.into_mut(), self.offsets.into_mut()) {
272 (Left(values), Left(offsets)) => Left(BinaryArray::new(
273 self.dtype,
274 offsets,
275 values,
276 Some(mutable_bitmap.into()),
277 )),
278 (Left(values), Right(offsets)) => Left(BinaryArray::new(
279 self.dtype,
280 offsets.into(),
281 values,
282 Some(mutable_bitmap.into()),
283 )),
284 (Right(values), Left(offsets)) => Left(BinaryArray::new(
285 self.dtype,
286 offsets,
287 values.into(),
288 Some(mutable_bitmap.into()),
289 )),
290 (Right(values), Right(offsets)) => Right(
291 MutableBinaryArray::try_new(
292 self.dtype,
293 offsets,
294 values,
295 Some(mutable_bitmap),
296 )
297 .unwrap(),
298 ),
299 },
300 }
301 } else {
302 match (self.values.into_mut(), self.offsets.into_mut()) {
303 (Left(values), Left(offsets)) => {
304 Left(BinaryArray::new(self.dtype, offsets, values, None))
305 },
306 (Left(values), Right(offsets)) => {
307 Left(BinaryArray::new(self.dtype, offsets.into(), values, None))
308 },
309 (Right(values), Left(offsets)) => {
310 Left(BinaryArray::new(self.dtype, offsets, values.into(), None))
311 },
312 (Right(values), Right(offsets)) => {
313 Right(MutableBinaryArray::try_new(self.dtype, offsets, values, None).unwrap())
314 },
315 }
316 }
317 }
318
319 pub fn new_empty(dtype: ArrowDataType) -> Self {
321 Self::new(dtype, OffsetsBuffer::new(), Buffer::new(), None)
322 }
323
324 #[inline]
326 pub fn new_null(dtype: ArrowDataType, length: usize) -> Self {
327 unsafe {
328 Self::new_unchecked(
329 dtype,
330 Offsets::new_zeroed(length).into(),
331 Buffer::new(),
332 Some(Bitmap::new_zeroed(length)),
333 )
334 }
335 }
336
337 pub fn default_dtype() -> ArrowDataType {
339 if O::IS_LARGE {
340 ArrowDataType::LargeBinary
341 } else {
342 ArrowDataType::Binary
343 }
344 }
345
346 pub fn new(
348 dtype: ArrowDataType,
349 offsets: OffsetsBuffer<O>,
350 values: Buffer<u8>,
351 validity: Option<Bitmap>,
352 ) -> Self {
353 Self::try_new(dtype, offsets, values, validity).unwrap()
354 }
355
356 #[inline]
360 pub fn from_trusted_len_values_iter<T: AsRef<[u8]>, I: TrustedLen<Item = T>>(
361 iterator: I,
362 ) -> Self {
363 MutableBinaryArray::<O>::from_trusted_len_values_iter(iterator).into()
364 }
365
366 pub fn from_iter_values<T: AsRef<[u8]>, I: Iterator<Item = T>>(iterator: I) -> Self {
370 MutableBinaryArray::<O>::from_iter_values(iterator).into()
371 }
372
373 #[inline]
379 pub unsafe fn from_trusted_len_iter_unchecked<I, P>(iterator: I) -> Self
380 where
381 P: AsRef<[u8]>,
382 I: Iterator<Item = Option<P>>,
383 {
384 MutableBinaryArray::<O>::from_trusted_len_iter_unchecked(iterator).into()
385 }
386
    /// Creates a [`BinaryArray`] from a [`TrustedLen`] iterator of optional byte-like items.
    #[inline]
    pub fn from_trusted_len_iter<I, P>(iterator: I) -> Self
    where
        P: AsRef<[u8]>,
        I: TrustedLen<Item = Option<P>>,
    {
        // SAFETY: NOTE(review): presumably justified by the `TrustedLen` bound on `I` — confirm.
        unsafe { Self::from_trusted_len_iter_unchecked(iterator) }
    }
397
398 #[inline]
404 pub unsafe fn try_from_trusted_len_iter_unchecked<E, I, P>(iterator: I) -> Result<Self, E>
405 where
406 P: AsRef<[u8]>,
407 I: IntoIterator<Item = Result<Option<P>, E>>,
408 {
409 MutableBinaryArray::<O>::try_from_trusted_len_iter_unchecked(iterator).map(|x| x.into())
410 }
411
    /// Creates a [`BinaryArray`] from a fallible [`TrustedLen`] iterator of optional
    /// byte-like items.
    ///
    /// # Errors
    /// Returns the error produced by `try_from_trusted_len_iter_unchecked`.
    #[inline]
    pub fn try_from_trusted_len_iter<E, I, P>(iter: I) -> Result<Self, E>
    where
        P: AsRef<[u8]>,
        I: TrustedLen<Item = Result<Option<P>, E>>,
    {
        // SAFETY: NOTE(review): presumably justified by the `TrustedLen` bound on `I` — confirm.
        unsafe { Self::try_from_trusted_len_iter_unchecked(iter) }
    }
422}
423
424impl<O: Offset> Array for BinaryArray<O> {
425 impl_common_array!();
426
427 fn validity(&self) -> Option<&Bitmap> {
428 self.validity.as_ref()
429 }
430
431 #[inline]
432 fn with_validity(&self, validity: Option<Bitmap>) -> Box<dyn Array> {
433 Box::new(self.clone().with_validity(validity))
434 }
435}
436
437unsafe impl<O: Offset> GenericBinaryArray<O> for BinaryArray<O> {
438 #[inline]
439 fn values(&self) -> &[u8] {
440 self.values()
441 }
442
443 #[inline]
444 fn offsets(&self) -> &[O] {
445 self.offsets().buffer()
446 }
447}
448
449impl<O: Offset> Splitable for BinaryArray<O> {
450 #[inline(always)]
451 fn check_bound(&self, offset: usize) -> bool {
452 offset <= self.len()
453 }
454
455 unsafe fn _split_at_unchecked(&self, offset: usize) -> (Self, Self) {
456 let (lhs_offsets, rhs_offsets) = unsafe { self.offsets.split_at_unchecked(offset) };
457 let (lhs_validity, rhs_validity) = unsafe { self.validity.split_at_unchecked(offset) };
458
459 (
460 Self {
461 dtype: self.dtype.clone(),
462 offsets: lhs_offsets,
463 values: self.values.clone(),
464 validity: lhs_validity,
465 },
466 Self {
467 dtype: self.dtype.clone(),
468 offsets: rhs_offsets,
469 values: self.values.clone(),
470 validity: rhs_validity,
471 },
472 )
473 }
474}