use either::Either;
use polars_buffer::Buffer;
use super::specification::try_check_utf8;
use super::{Array, GenericBinaryArray, Splitable};
use crate::array::BinaryArray;
use crate::array::iterator::NonNullValuesIter;
use crate::bitmap::Bitmap;
use crate::bitmap::utils::{BitmapIter, ZipValidity};
use crate::datatypes::ArrowDataType;
use crate::offset::{Offset, Offsets, OffsetsBuffer};
use crate::trusted_len::TrustedLen;
mod ffi;
pub(super) mod fmt;
mod from;
mod iterator;
mod mutable;
mod mutable_values;
pub use iterator::*;
pub use mutable::*;
pub use mutable_values::MutableUtf8ValuesArray;
use polars_error::*;
/// Adapter that exposes a string-like value through `AsRef<[u8]>`.
pub(super) struct StrAsBytes<P>(P);

impl<T: AsRef<str>> AsRef<[u8]> for StrAsBytes<T> {
    /// Returns the bytes of the wrapped string.
    #[inline(always)]
    fn as_ref(&self) -> &[u8] {
        let text: &str = self.0.as_ref();
        text.as_bytes()
    }
}
/// An array of strings stored as one contiguous byte buffer plus offsets.
///
/// Element `i` is the byte range of `values` delimited by the `i`-th start/end
/// pair of `offsets`; `value_unchecked` reinterprets that range as `&str`
/// without re-validating it, so the bytes must be valid UTF-8 (the checked
/// constructor `try_new` runs `try_check_utf8` on them).
#[derive(Clone)]
pub struct Utf8Array<O: Offset> {
/// Logical data type; the checked constructor requires its physical type to
/// equal that of `default_dtype()`.
dtype: ArrowDataType,
/// Offsets delimiting each element inside `values`; `len()` is `offsets.len_proxy()`.
offsets: OffsetsBuffer<O>,
/// Concatenated bytes of all elements.
values: Buffer<u8>,
/// Optional validity bitmap; when present, the checked constructor requires
/// its length to equal the number of elements.
/// NOTE(review): presumably a set bit marks a non-null element — confirm.
validity: Option<Bitmap>,
}
impl<O: Offset> Utf8Array<O> {
/// Builds a [`Utf8Array`] after validating its components.
///
/// # Errors
/// Returns an error when, checked in this order:
/// * `try_check_utf8` rejects the `offsets` / `values` pair,
/// * `validity` is present and its length differs from `offsets.len_proxy()`,
/// * the physical type of `dtype` differs from that of [`Self::default_dtype`].
pub fn try_new(
    dtype: ArrowDataType,
    offsets: OffsetsBuffer<O>,
    values: Buffer<u8>,
    validity: Option<Bitmap>,
) -> PolarsResult<Self> {
    try_check_utf8(&offsets, &values)?;

    match &validity {
        Some(bitmap) if bitmap.len() != offsets.len_proxy() => {
            polars_bail!(ComputeError: "validity mask length must match the number of values");
        },
        _ => {},
    }

    let expected_physical = Self::default_dtype().to_physical_type();
    if dtype.to_physical_type() != expected_physical {
        polars_bail!(ComputeError: "Utf8Array can only be initialized with DataType::Utf8 or DataType::LargeUtf8");
    }

    Ok(Self {
        dtype,
        offsets,
        values,
        validity,
    })
}
/// Builds an array from a slice of string-like values.
///
/// Delegates to [`Self::from_trusted_len_values_iter`] over the slice's items.
pub fn from_slice<T: AsRef<str>, P: AsRef<[T]>>(slice: P) -> Self {
    let items: &[T] = slice.as_ref();
    Self::from_trusted_len_values_iter(items.iter())
}
/// Builds an array from a slice of optional string-like values.
///
/// The slice is first collected into a [`MutableUtf8Array`], which is then
/// converted into the immutable array.
pub fn from<T: AsRef<str>, P: AsRef<[Option<T>]>>(slice: P) -> Self {
    let builder = MutableUtf8Array::<O>::from(slice);
    builder.into()
}
/// Returns an iterator that pairs the values iterator with the validity bitmap.
pub fn iter(&self) -> ZipValidity<&str, Utf8ValuesIter<'_, O>, BitmapIter<'_>> {
    let values = self.values_iter();
    let validity = self.validity();
    ZipValidity::new_with_validity(values, validity)
}
/// Returns a [`Utf8ValuesIter`] constructed over this array.
/// NOTE(review): presumably yields every slot's `&str` regardless of validity — confirm.
pub fn values_iter(&self) -> Utf8ValuesIter<'_, O> {
Utf8ValuesIter::new(self)
}
/// Returns a [`NonNullValuesIter`] built from this array and its validity bitmap.
#[inline]
pub fn non_null_values_iter(&self) -> NonNullValuesIter<'_, Utf8Array<O>> {
    let validity = self.validity();
    NonNullValuesIter::new(self, validity)
}
/// Returns the number of elements, as reported by `offsets.len_proxy()`.
#[inline]
pub fn len(&self) -> usize {
self.offsets.len_proxy()
}
/// Returns the string stored at index `i`, without consulting the validity bitmap.
///
/// # Panics
/// Panics if `i >= self.len()`.
#[inline]
pub fn value(&self, i: usize) -> &str {
assert!(i < self.len());
// SAFETY: `i < self.len()` was asserted on the line above.
unsafe { self.value_unchecked(i) }
}
/// Returns the string stored at index `i` without any bounds checking.
///
/// # Safety
/// The caller must ensure `i < self.len()`.
#[inline]
pub unsafe fn value_unchecked(&self, i: usize) -> &str {
    // SAFETY: the caller guarantees that `i` is in bounds.
    let (begin, finish) = unsafe { self.offsets.start_end_unchecked(i) };
    // SAFETY: relies on the struct invariant that offsets lie within `values`.
    let bytes = unsafe { self.values.get_unchecked(begin..finish) };
    // SAFETY: relies on the struct invariant that every element is valid UTF-8.
    unsafe { std::str::from_utf8_unchecked(bytes) }
}
/// Returns the string at index `i`, or `None` when `is_null(i)` reports the slot as null.
#[inline]
pub fn get(&self, i: usize) -> Option<&str> {
    if self.is_null(i) {
        return None;
    }
    // SAFETY: NOTE(review) — soundness relies on `is_null` rejecting (panicking on)
    // an out-of-range `i`; its definition is not in this file — confirm.
    Some(unsafe { self.value_unchecked(i) })
}
/// Returns a reference to the array's logical data type.
#[inline]
pub fn dtype(&self) -> &ArrowDataType {
&self.dtype
}
/// Returns a reference to the byte buffer holding all elements' bytes.
#[inline]
pub fn values(&self) -> &Buffer<u8> {
&self.values
}
/// Returns a reference to the offsets that delimit each element inside `values`.
#[inline]
pub fn offsets(&self) -> &OffsetsBuffer<O> {
&self.offsets
}
/// Returns the validity bitmap, or `None` when the array has none.
#[inline]
pub fn validity(&self) -> Option<&Bitmap> {
self.validity.as_ref()
}
/// Slices this array in place so that it covers `length` elements starting at `offset`.
///
/// # Panics
/// Panics if `offset + length` overflows `usize` or exceeds `self.len()`.
pub fn slice(&mut self, offset: usize, length: usize) {
    // `checked_add` keeps the bounds check meaningful: with a plain `+` the sum
    // could wrap in builds without overflow checks, pass the comparison, and
    // reach `slice_unchecked` with out-of-range arguments.
    assert!(
        offset
            .checked_add(length)
            .is_some_and(|end| end <= self.len()),
        "the offset of the new array cannot exceed the arrays' length"
    );
    // SAFETY: the assertion above guarantees `offset + length <= self.len()`.
    unsafe { self.slice_unchecked(offset, length) }
}
/// Slices this array in place without checking bounds.
///
/// The validity bitmap is sliced as well and dropped entirely when the sliced
/// bitmap reports no unset bits.
///
/// # Safety
/// The caller must ensure `offset + length <= self.len()`.
pub unsafe fn slice_unchecked(&mut self, offset: usize, length: usize) {
    let sliced_validity = match self.validity.take() {
        Some(bitmap) => {
            // SAFETY: the caller guarantees the range is within the array.
            let bitmap = unsafe { bitmap.sliced_unchecked(offset, length) };
            // Keep the bitmap only if it still marks at least one unset bit.
            (bitmap.unset_bits() > 0).then_some(bitmap)
        },
        None => None,
    };
    self.validity = sliced_validity;

    // `length` elements are delimited by `length + 1` offsets.
    // SAFETY: the caller guarantees the range is within the array.
    unsafe { self.offsets.slice_unchecked(offset, length + 1) };
}
// These macros expand to additional inherent methods; their definitions live
// outside this file.
// NOTE(review): `set_validity` (used by `apply_validity`) and the by-value
// `with_validity` (used by the `Array` impl) are presumably generated by
// `impl_mut_validity!` — confirm against the macro definitions.
impl_sliced!();
impl_mut_validity!();
impl_into_array!();
#[must_use]
pub fn into_inner(self) -> (ArrowDataType, OffsetsBuffer<O>, Buffer<u8>, Option<Bitmap>) {
let Self {
dtype,
offsets,
values,
validity,
} = self;
(dtype, offsets, values, validity)
}
/// Attempts to convert this array into a [`MutableUtf8Array`].
///
/// Returns `Right` only when the validity bitmap (if any), the values and the
/// offsets can all be converted into their mutable forms. Otherwise the array
/// is reassembled from its parts and handed back as `Left`.
#[must_use]
pub fn into_mut(self) -> Either<Self, MutableUtf8Array<O>> {
    use Either::*;

    let Self {
        dtype,
        offsets,
        values,
        validity,
    } = self;

    // Resolve the validity first: when it cannot be made mutable, the values
    // and offsets are not touched at all.
    let mutable_validity = match validity {
        Some(bitmap) => match bitmap.into_mut() {
            Left(bitmap) => {
                // SAFETY: the parts come unchanged from an existing array.
                return Left(unsafe {
                    Utf8Array::new_unchecked(dtype, offsets, values, Some(bitmap))
                });
            },
            Right(mutable_bitmap) => Some(mutable_bitmap),
        },
        None => None,
    };

    match (values.into_mut(), offsets.into_mut()) {
        (Right(values), Right(offsets)) => {
            // SAFETY: the parts come from an existing array; only their
            // representation changed.
            Right(unsafe {
                MutableUtf8Array::new_unchecked(dtype, offsets, values, mutable_validity)
            })
        },
        (values, offsets) => {
            // At least one buffer could not be made mutable: convert whatever
            // was converted back into its immutable form.
            let values: Buffer<u8> = match values {
                Left(immutable) => immutable,
                Right(mutable) => mutable.into(),
            };
            let offsets: OffsetsBuffer<O> = match offsets {
                Left(immutable) => immutable,
                Right(mutable) => mutable.into(),
            };
            let validity: Option<Bitmap> = mutable_validity.map(Into::into);
            // SAFETY: the parts come from an existing array; only their
            // representation changed.
            Left(unsafe { Utf8Array::new_unchecked(dtype, offsets, values, validity) })
        },
    }
}
#[inline]
pub fn new_empty(dtype: ArrowDataType) -> Self {
unsafe { Self::new_unchecked(dtype, OffsetsBuffer::new(), Buffer::new(), None) }
}
#[inline]
pub fn new_null(dtype: ArrowDataType, length: usize) -> Self {
Self::new(
dtype,
Offsets::new_zeroed(length).into(),
Buffer::new(),
Some(Bitmap::new_zeroed(length)),
)
}
/// Returns the data type matching the offset width `O`:
/// `LargeUtf8` when `O::IS_LARGE`, `Utf8` otherwise.
pub fn default_dtype() -> ArrowDataType {
    if !O::IS_LARGE {
        return ArrowDataType::Utf8;
    }
    ArrowDataType::LargeUtf8
}
/// Builds a [`Utf8Array`] without validating its parts.
///
/// The checks below run only in builds with debug assertions enabled.
///
/// # Safety
/// The caller must uphold what [`Self::try_new`] would otherwise verify; in
/// particular `value_unchecked` reads `values` through `offsets` and
/// reinterprets the bytes as `str` without any check.
pub unsafe fn new_unchecked(
    dtype: ArrowDataType,
    offsets: OffsetsBuffer<O>,
    values: Buffer<u8>,
    validity: Option<Bitmap>,
) -> Self {
    debug_assert!(
        values.len() >= offsets.last().to_usize(),
        "offsets must not exceed the values length"
    );
    debug_assert!(
        match &validity {
            Some(bitmap) => bitmap.len() == offsets.len_proxy(),
            None => true,
        },
        "validity mask length must match the number of values"
    );
    debug_assert!(
        Self::default_dtype().to_physical_type() == dtype.to_physical_type(),
        "Utf8Array can only be initialized with DataType::Utf8 or DataType::LargeUtf8"
    );

    Self {
        dtype,
        offsets,
        values,
        validity,
    }
}
/// Builds a [`Utf8Array`], validating its parts via [`Self::try_new`].
///
/// # Panics
/// Panics if [`Self::try_new`] returns an error.
pub fn new(
    dtype: ArrowDataType,
    offsets: OffsetsBuffer<O>,
    values: Buffer<u8>,
    validity: Option<Bitmap>,
) -> Self {
    let checked = Self::try_new(dtype, offsets, values, validity);
    checked.unwrap()
}
/// Builds an array from a [`TrustedLen`] iterator of string-like values by
/// going through [`MutableUtf8Array`].
#[inline]
pub fn from_trusted_len_values_iter<T: AsRef<str>, I: TrustedLen<Item = T>>(
    iterator: I,
) -> Self {
    let builder = MutableUtf8Array::<O>::from_trusted_len_values_iter(iterator);
    builder.into()
}
/// Builds an array from an iterator of string-like values by going through
/// [`MutableUtf8Array`].
pub fn from_iter_values<T: AsRef<str>, I: Iterator<Item = T>>(iterator: I) -> Self {
    let builder = MutableUtf8Array::<O>::from_iter_values(iterator);
    builder.into()
}
/// Builds an array from an iterator of optional string-like values by going
/// through [`MutableUtf8Array::from_trusted_len_iter_unchecked`].
///
/// # Safety
/// The caller must uphold the contract of
/// `MutableUtf8Array::from_trusted_len_iter_unchecked`.
/// NOTE(review): presumably the iterator's reported length must be exact — confirm.
#[inline]
pub unsafe fn from_trusted_len_iter_unchecked<I, P>(iterator: I) -> Self
where
    P: AsRef<str>,
    I: Iterator<Item = Option<P>>,
{
    let builder = MutableUtf8Array::<O>::from_trusted_len_iter_unchecked(iterator);
    builder.into()
}
/// Builds an array from a [`TrustedLen`] iterator of optional string-like
/// values by going through [`MutableUtf8Array`].
#[inline]
pub fn from_trusted_len_iter<I, P>(iterator: I) -> Self
where
    P: AsRef<str>,
    I: TrustedLen<Item = Option<P>>,
{
    let builder = MutableUtf8Array::<O>::from_trusted_len_iter(iterator);
    builder.into()
}
/// Fallible variant of [`Self::from_trusted_len_iter_unchecked`]: an error
/// returned by the underlying [`MutableUtf8Array`] constructor is propagated.
///
/// # Safety
/// The caller must uphold the contract of
/// `MutableUtf8Array::try_from_trusted_len_iter_unchecked`.
/// NOTE(review): presumably the iterator's reported length must be exact — confirm.
#[inline]
pub unsafe fn try_from_trusted_len_iter_unchecked<E, I, P>(
    iterator: I,
) -> std::result::Result<Self, E>
where
    P: AsRef<str>,
    I: IntoIterator<Item = std::result::Result<Option<P>, E>>,
{
    let built = MutableUtf8Array::<O>::try_from_trusted_len_iter_unchecked(iterator);
    match built {
        Ok(builder) => Ok(builder.into()),
        Err(error) => Err(error),
    }
}
/// Fallible variant of [`Self::from_trusted_len_iter`]: an error returned by
/// the underlying [`MutableUtf8Array`] constructor is propagated.
#[inline]
pub fn try_from_trusted_len_iter<E, I, P>(iter: I) -> std::result::Result<Self, E>
where
    P: AsRef<str>,
    I: TrustedLen<Item = std::result::Result<Option<P>, E>>,
{
    match MutableUtf8Array::<O>::try_from_trusted_len_iter(iter) {
        Ok(builder) => Ok(builder.into()),
        Err(error) => Err(error),
    }
}
/// Replaces the validity bitmap with `f(current_bitmap)`.
///
/// Does nothing, and never calls `f`, when the array has no validity bitmap.
/// The new bitmap is installed through `set_validity`.
/// NOTE(review): `set_validity` is defined outside this file and presumably
/// panics on a bitmap of the wrong length — confirm.
pub fn apply_validity<F: FnOnce(Bitmap) -> Bitmap>(&mut self, f: F) {
    let Some(current) = self.validity.take() else {
        return;
    };
    let updated = f(current);
    self.set_validity(Some(updated))
}
/// Returns a [`BinaryArray`] built from clones of this array's offsets, values
/// and validity, using `BinaryArray`'s default data type.
pub fn to_binary(&self) -> BinaryArray<O> {
    let dtype = BinaryArray::<O>::default_dtype();
    let offsets = self.offsets.clone();
    let values = self.values.clone();
    let validity = self.validity.clone();
    // SAFETY: NOTE(review) — the parts come unchanged from this array; presumably
    // `BinaryArray` requires no more of them than `Utf8Array` does — confirm.
    unsafe { BinaryArray::new_unchecked(dtype, offsets, values, validity) }
}
}
impl<O: Offset> Splitable for Utf8Array<O> {
#[inline(always)]
fn check_bound(&self, offset: usize) -> bool {
offset <= self.len()
}
unsafe fn _split_at_unchecked(&self, offset: usize) -> (Self, Self) {
let (lhs_validity, rhs_validity) = unsafe { self.validity.split_at_unchecked(offset) };
let (lhs_offsets, rhs_offsets) = unsafe { self.offsets.split_at_unchecked(offset) };
(
Self {
dtype: self.dtype.clone(),
offsets: lhs_offsets,
values: self.values.clone(),
validity: lhs_validity,
},
Self {
dtype: self.dtype.clone(),
offsets: rhs_offsets,
values: self.values.clone(),
validity: rhs_validity,
},
)
}
}
impl<O: Offset> Array for Utf8Array<O> {
    // Supplies the remaining `Array` trait methods; the macro is defined
    // outside this file.
    impl_common_array!();

    /// Returns the validity bitmap, or `None` when the array has none.
    fn validity(&self) -> Option<&Bitmap> {
        self.validity.as_ref()
    }

    /// Returns a boxed clone of this array with its validity replaced by `validity`.
    #[inline]
    fn with_validity(&self, validity: Option<Bitmap>) -> Box<dyn Array> {
        // The call on the owned clone resolves to the inherent, by-value
        // `with_validity`, not to this trait method.
        let replaced = self.clone().with_validity(validity);
        Box::new(replaced)
    }
}
// NOTE(review): the safety contract of `GenericBinaryArray` is declared outside
// this file; presumably it requires `offsets` to be valid for `values` — confirm.
unsafe impl<O: Offset> GenericBinaryArray<O> for Utf8Array<O> {
    /// Returns the values buffer as a byte slice.
    #[inline]
    fn values(&self) -> &[u8] {
        &self.values
    }

    /// Returns the offsets as a slice.
    #[inline]
    fn offsets(&self) -> &[O] {
        self.offsets.buffer()
    }
}
impl<O: Offset> Default for Utf8Array<O> {
fn default() -> Self {
let dtype = if O::IS_LARGE {
ArrowDataType::LargeUtf8
} else {
ArrowDataType::Utf8
};
Utf8Array::new(dtype, Default::default(), Default::default(), None)
}
}