polars_arrow/array/binary/
mod.rs1use either::Either;
2
3use super::specification::try_check_offsets_bounds;
4use super::{Array, GenericBinaryArray, Splitable};
5use crate::array::iterator::NonNullValuesIter;
6use crate::bitmap::Bitmap;
7use crate::bitmap::utils::{BitmapIter, ZipValidity};
8use crate::buffer::Buffer;
9use crate::datatypes::ArrowDataType;
10use crate::offset::{Offset, Offsets, OffsetsBuffer};
11use crate::trusted_len::TrustedLen;
12
13mod builder;
14pub use builder::*;
15mod ffi;
16pub(super) mod fmt;
17mod iterator;
18pub use iterator::*;
19mod from;
20mod mutable_values;
21pub use mutable_values::*;
22mod mutable;
23pub use mutable::*;
24use polars_error::{PolarsResult, polars_bail};
25
26#[derive(Clone)]
57pub struct BinaryArray<O: Offset> {
58 dtype: ArrowDataType,
59 offsets: OffsetsBuffer<O>,
60 values: Buffer<u8>,
61 validity: Option<Bitmap>,
62}
63
64impl<O: Offset> BinaryArray<O> {
65 pub fn try_new(
75 dtype: ArrowDataType,
76 offsets: OffsetsBuffer<O>,
77 values: Buffer<u8>,
78 validity: Option<Bitmap>,
79 ) -> PolarsResult<Self> {
80 try_check_offsets_bounds(&offsets, values.len())?;
81
82 if validity
83 .as_ref()
84 .is_some_and(|validity| validity.len() != offsets.len_proxy())
85 {
86 polars_bail!(ComputeError: "validity mask length must match the number of values")
87 }
88
89 if dtype.to_physical_type() != Self::default_dtype().to_physical_type() {
90 polars_bail!(ComputeError: "BinaryArray can only be initialized with DataType::Binary or DataType::LargeBinary")
91 }
92
93 Ok(Self {
94 dtype,
95 offsets,
96 values,
97 validity,
98 })
99 }
100
101 pub unsafe fn new_unchecked(
107 dtype: ArrowDataType,
108 offsets: OffsetsBuffer<O>,
109 values: Buffer<u8>,
110 validity: Option<Bitmap>,
111 ) -> Self {
112 Self {
113 dtype,
114 offsets,
115 values,
116 validity,
117 }
118 }
119
120 pub fn from_slice<T: AsRef<[u8]>, P: AsRef<[T]>>(slice: P) -> Self {
122 Self::from_trusted_len_values_iter(slice.as_ref().iter())
123 }
124
125 pub fn from<T: AsRef<[u8]>, P: AsRef<[Option<T>]>>(slice: P) -> Self {
128 MutableBinaryArray::<O>::from(slice).into()
129 }
130
131 pub fn iter(&self) -> ZipValidity<&[u8], BinaryValueIter<O>, BitmapIter> {
133 ZipValidity::new_with_validity(self.values_iter(), self.validity.as_ref())
134 }
135
136 pub fn values_iter(&self) -> BinaryValueIter<O> {
138 BinaryValueIter::new(self)
139 }
140
141 #[inline]
143 pub fn non_null_values_iter(&self) -> NonNullValuesIter<'_, BinaryArray<O>> {
144 NonNullValuesIter::new(self, self.validity())
145 }
146
147 #[inline]
149 pub fn len(&self) -> usize {
150 self.offsets.len_proxy()
151 }
152
153 #[inline]
157 pub fn value(&self, i: usize) -> &[u8] {
158 assert!(i < self.len());
159 unsafe { self.value_unchecked(i) }
160 }
161
162 #[inline]
167 pub unsafe fn value_unchecked(&self, i: usize) -> &[u8] {
168 let (start, end) = self.offsets.start_end_unchecked(i);
170
171 self.values.get_unchecked(start..end)
173 }
174
175 #[inline]
179 pub fn get(&self, i: usize) -> Option<&[u8]> {
180 if !self.is_null(i) {
181 unsafe { Some(self.value_unchecked(i)) }
183 } else {
184 None
185 }
186 }
187
188 #[inline]
190 pub fn dtype(&self) -> &ArrowDataType {
191 &self.dtype
192 }
193
194 #[inline]
196 pub fn values(&self) -> &Buffer<u8> {
197 &self.values
198 }
199
200 #[inline]
202 pub fn offsets(&self) -> &OffsetsBuffer<O> {
203 &self.offsets
204 }
205
206 #[inline]
208 pub fn validity(&self) -> Option<&Bitmap> {
209 self.validity.as_ref()
210 }
211
212 pub fn slice(&mut self, offset: usize, length: usize) {
218 assert!(
219 offset + length <= self.len(),
220 "the offset of the new Buffer cannot exceed the existing length"
221 );
222 unsafe { self.slice_unchecked(offset, length) }
223 }
224
225 pub unsafe fn slice_unchecked(&mut self, offset: usize, length: usize) {
232 self.validity = self
233 .validity
234 .take()
235 .map(|bitmap| bitmap.sliced_unchecked(offset, length))
236 .filter(|bitmap| bitmap.unset_bits() > 0);
237 self.offsets.slice_unchecked(offset, length + 1);
238 }
239
240 impl_sliced!();
241 impl_mut_validity!();
242 impl_into_array!();
243
244 #[must_use]
246 pub fn into_inner(self) -> (ArrowDataType, OffsetsBuffer<O>, Buffer<u8>, Option<Bitmap>) {
247 let Self {
248 dtype,
249 offsets,
250 values,
251 validity,
252 } = self;
253 (dtype, offsets, values, validity)
254 }
255
256 #[must_use]
258 pub fn into_mut(self) -> Either<Self, MutableBinaryArray<O>> {
259 use Either::*;
260 if let Some(bitmap) = self.validity {
261 match bitmap.into_mut() {
262 Left(bitmap) => Left(BinaryArray::new(
264 self.dtype,
265 self.offsets,
266 self.values,
267 Some(bitmap),
268 )),
269 Right(mutable_bitmap) => match (self.values.into_mut(), self.offsets.into_mut()) {
270 (Left(values), Left(offsets)) => Left(BinaryArray::new(
271 self.dtype,
272 offsets,
273 values,
274 Some(mutable_bitmap.into()),
275 )),
276 (Left(values), Right(offsets)) => Left(BinaryArray::new(
277 self.dtype,
278 offsets.into(),
279 values,
280 Some(mutable_bitmap.into()),
281 )),
282 (Right(values), Left(offsets)) => Left(BinaryArray::new(
283 self.dtype,
284 offsets,
285 values.into(),
286 Some(mutable_bitmap.into()),
287 )),
288 (Right(values), Right(offsets)) => Right(
289 MutableBinaryArray::try_new(
290 self.dtype,
291 offsets,
292 values,
293 Some(mutable_bitmap),
294 )
295 .unwrap(),
296 ),
297 },
298 }
299 } else {
300 match (self.values.into_mut(), self.offsets.into_mut()) {
301 (Left(values), Left(offsets)) => {
302 Left(BinaryArray::new(self.dtype, offsets, values, None))
303 },
304 (Left(values), Right(offsets)) => {
305 Left(BinaryArray::new(self.dtype, offsets.into(), values, None))
306 },
307 (Right(values), Left(offsets)) => {
308 Left(BinaryArray::new(self.dtype, offsets, values.into(), None))
309 },
310 (Right(values), Right(offsets)) => {
311 Right(MutableBinaryArray::try_new(self.dtype, offsets, values, None).unwrap())
312 },
313 }
314 }
315 }
316
317 pub fn new_empty(dtype: ArrowDataType) -> Self {
319 Self::new(dtype, OffsetsBuffer::new(), Buffer::new(), None)
320 }
321
322 #[inline]
324 pub fn new_null(dtype: ArrowDataType, length: usize) -> Self {
325 unsafe {
326 Self::new_unchecked(
327 dtype,
328 Offsets::new_zeroed(length).into(),
329 Buffer::new(),
330 Some(Bitmap::new_zeroed(length)),
331 )
332 }
333 }
334
335 pub fn default_dtype() -> ArrowDataType {
337 if O::IS_LARGE {
338 ArrowDataType::LargeBinary
339 } else {
340 ArrowDataType::Binary
341 }
342 }
343
344 pub fn new(
346 dtype: ArrowDataType,
347 offsets: OffsetsBuffer<O>,
348 values: Buffer<u8>,
349 validity: Option<Bitmap>,
350 ) -> Self {
351 Self::try_new(dtype, offsets, values, validity).unwrap()
352 }
353
354 #[inline]
358 pub fn from_trusted_len_values_iter<T: AsRef<[u8]>, I: TrustedLen<Item = T>>(
359 iterator: I,
360 ) -> Self {
361 MutableBinaryArray::<O>::from_trusted_len_values_iter(iterator).into()
362 }
363
364 pub fn from_iter_values<T: AsRef<[u8]>, I: Iterator<Item = T>>(iterator: I) -> Self {
368 MutableBinaryArray::<O>::from_iter_values(iterator).into()
369 }
370
371 #[inline]
377 pub unsafe fn from_trusted_len_iter_unchecked<I, P>(iterator: I) -> Self
378 where
379 P: AsRef<[u8]>,
380 I: Iterator<Item = Option<P>>,
381 {
382 MutableBinaryArray::<O>::from_trusted_len_iter_unchecked(iterator).into()
383 }
384
385 #[inline]
387 pub fn from_trusted_len_iter<I, P>(iterator: I) -> Self
388 where
389 P: AsRef<[u8]>,
390 I: TrustedLen<Item = Option<P>>,
391 {
392 unsafe { Self::from_trusted_len_iter_unchecked(iterator) }
394 }
395
396 #[inline]
402 pub unsafe fn try_from_trusted_len_iter_unchecked<E, I, P>(iterator: I) -> Result<Self, E>
403 where
404 P: AsRef<[u8]>,
405 I: IntoIterator<Item = Result<Option<P>, E>>,
406 {
407 MutableBinaryArray::<O>::try_from_trusted_len_iter_unchecked(iterator).map(|x| x.into())
408 }
409
410 #[inline]
412 pub fn try_from_trusted_len_iter<E, I, P>(iter: I) -> Result<Self, E>
413 where
414 P: AsRef<[u8]>,
415 I: TrustedLen<Item = Result<Option<P>, E>>,
416 {
417 unsafe { Self::try_from_trusted_len_iter_unchecked(iter) }
419 }
420}
421
422impl<O: Offset> Array for BinaryArray<O> {
423 impl_common_array!();
424
425 fn validity(&self) -> Option<&Bitmap> {
426 self.validity.as_ref()
427 }
428
429 #[inline]
430 fn with_validity(&self, validity: Option<Bitmap>) -> Box<dyn Array> {
431 Box::new(self.clone().with_validity(validity))
432 }
433}
434
435unsafe impl<O: Offset> GenericBinaryArray<O> for BinaryArray<O> {
436 #[inline]
437 fn values(&self) -> &[u8] {
438 self.values()
439 }
440
441 #[inline]
442 fn offsets(&self) -> &[O] {
443 self.offsets().buffer()
444 }
445}
446
447impl<O: Offset> Splitable for BinaryArray<O> {
448 #[inline(always)]
449 fn check_bound(&self, offset: usize) -> bool {
450 offset <= self.len()
451 }
452
453 unsafe fn _split_at_unchecked(&self, offset: usize) -> (Self, Self) {
454 let (lhs_offsets, rhs_offsets) = unsafe { self.offsets.split_at_unchecked(offset) };
455 let (lhs_validity, rhs_validity) = unsafe { self.validity.split_at_unchecked(offset) };
456
457 (
458 Self {
459 dtype: self.dtype.clone(),
460 offsets: lhs_offsets,
461 values: self.values.clone(),
462 validity: lhs_validity,
463 },
464 Self {
465 dtype: self.dtype.clone(),
466 offsets: rhs_offsets,
467 values: self.values.clone(),
468 validity: rhs_validity,
469 },
470 )
471 }
472}