use crate::{
bitmap::Bitmap,
buffer::Buffer,
datatypes::DataType,
error::{ArrowError, Result},
};
use either::Either;
use super::{
specification::{try_check_offsets_and_utf8, try_check_offsets_bounds},
Array, GenericBinaryArray, Offset,
};
mod ffi;
pub(super) mod fmt;
mod from;
mod iterator;
mod mutable;
pub use iterator::*;
pub use mutable::*;
/// An array of UTF-8 strings stored in a contiguous byte buffer, generic over the offset
/// type `O`.
///
/// The string at position `i` is the byte range `offsets[i]..offsets[i + 1]` of `values`,
/// so the array holds `offsets.len() - 1` entries.
#[derive(Clone)]
pub struct Utf8Array<O: Offset> {
// Logical type; constructors require its physical type to match `default_data_type()`.
data_type: DataType,
// Start/end positions into `values`; one more element than the number of strings.
offsets: Buffer<O>,
// Concatenated bytes of all strings.
values: Buffer<u8>,
// Optional bitmap with one bit per string; `new_null` uses an all-zero bitmap for an
// all-null array. `None` means no validity information is stored.
validity: Option<Bitmap>,
}
impl<O: Offset> Utf8Array<O> {
/// Creates a new [`Utf8Array`] after validating its components.
///
/// # Errors
/// Returns an error when, checked in this order:
/// * `try_check_offsets_and_utf8` rejects `offsets` / `values`;
/// * `validity` is `Some` and its length differs from `offsets.len() - 1`;
/// * the physical type of `data_type` differs from that of [`Self::default_data_type`].
pub fn try_new(
    data_type: DataType,
    offsets: Buffer<O>,
    values: Buffer<u8>,
    validity: Option<Bitmap>,
) -> Result<Self> {
    try_check_offsets_and_utf8(&offsets, &values)?;

    // The validity mask, when present, needs exactly one bit per string.
    if let Some(bitmap) = &validity {
        if bitmap.len() != offsets.len() - 1 {
            return Err(ArrowError::oos(
                "validity mask length must match the number of values",
            ));
        }
    }

    // Only the physical representation is compared, not the logical type itself.
    let actual = data_type.to_physical_type();
    let expected = Self::default_data_type().to_physical_type();
    if actual != expected {
        return Err(ArrowError::oos(
            "Utf8Array can only be initialized with DataType::Utf8 or DataType::LargeUtf8",
        ));
    }

    Ok(Self {
        data_type,
        offsets,
        values,
        validity,
    })
}
/// Creates a new [`Utf8Array`], delegating all validation to [`Self::try_new`].
///
/// # Panics
/// Panics if [`Self::try_new`] returns an error for the given components.
pub fn new(
data_type: DataType,
offsets: Buffer<O>,
values: Buffer<u8>,
validity: Option<Bitmap>,
) -> Self {
Self::try_new(data_type, offsets, values, validity).unwrap()
}
/// Alias for [`Self::new`]: forwards the arguments unchanged.
///
/// # Panics
/// Panics under the same conditions as [`Self::new`].
pub fn from_data(
data_type: DataType,
offsets: Buffer<O>,
values: Buffer<u8>,
validity: Option<Bitmap>,
) -> Self {
Self::new(data_type, offsets, values, validity)
}
/// Creates a [`Utf8Array`] with zero strings: a single zero offset, no value bytes and no
/// validity bitmap.
///
/// # Panics
/// Panics if the checks still performed by the unchecked constructor fail, e.g. when the
/// physical type of `data_type` does not match [`Self::default_data_type`].
#[inline]
pub fn new_empty(data_type: DataType) -> Self {
    let offsets = Buffer::from(vec![O::zero()]);
    let values = Buffer::new();
    // SAFETY: there are no value bytes, so no string can contain invalid UTF-8.
    unsafe { Self::from_data_unchecked(data_type, offsets, values, None) }
}
/// Creates a [`Utf8Array`] of `length` entries backed by all-zero offsets, no value bytes and
/// an all-zero validity bitmap.
///
/// # Panics
/// Panics if [`Self::new`] rejects the components, e.g. when the physical type of
/// `data_type` does not match [`Self::default_data_type`].
#[inline]
pub fn new_null(data_type: DataType, length: usize) -> Self {
    // `length` strings need `length + 1` offsets; all zero means every string is empty.
    let offsets = Buffer::new_zeroed(length + 1);
    let values = Buffer::new();
    let validity = Bitmap::new_zeroed(length);
    Self::new(data_type, offsets, values, Some(validity))
}
/// Returns the [`DataType`] that corresponds to the offset type `O`:
/// [`DataType::LargeUtf8`] when `O::is_large()` is true, [`DataType::Utf8`] otherwise.
pub fn default_data_type() -> DataType {
    match O::is_large() {
        true => DataType::LargeUtf8,
        false => DataType::Utf8,
    }
}
}
impl<O: Offset> Utf8Array<O> {
/// Creates a new [`Utf8Array`] without checking that `values` holds valid UTF-8.
///
/// Only `try_check_offsets_bounds`, the validity length and the data type are verified.
///
/// # Errors
/// Returns an error when, checked in this order:
/// * `try_check_offsets_bounds` rejects `offsets` for a values buffer of `values.len()`;
/// * `validity` is `Some` and its length differs from `offsets.len() - 1`;
/// * the physical type of `data_type` differs from that of [`Self::default_data_type`].
///
/// # Safety
/// The caller must guarantee everything the checked constructor would additionally verify:
/// every range `offsets[i]..offsets[i + 1]` must be a valid range of `values` containing
/// valid UTF-8, because `value` and `value_unchecked` read these ranges without checks.
pub unsafe fn try_new_unchecked(
    data_type: DataType,
    offsets: Buffer<O>,
    values: Buffer<u8>,
    validity: Option<Bitmap>,
) -> Result<Self> {
    try_check_offsets_bounds(&offsets, values.len())?;

    // The validity mask, when present, needs exactly one bit per string.
    if validity
        .as_ref()
        .map_or(false, |validity| validity.len() != offsets.len() - 1)
    {
        return Err(ArrowError::oos(
            "validity mask length must match the number of values",
        ));
    }

    if data_type.to_physical_type() != Self::default_data_type().to_physical_type() {
        // The message names `Utf8Array` (it used to say `BinaryArray`), matching `try_new`.
        return Err(ArrowError::oos(
            "Utf8Array can only be initialized with DataType::Utf8 or DataType::LargeUtf8",
        ));
    }

    Ok(Self {
        data_type,
        offsets,
        values,
        validity,
    })
}
/// Creates a new [`Utf8Array`] without checking that `values` holds valid UTF-8, delegating
/// to [`Self::try_new_unchecked`].
///
/// # Panics
/// Panics if [`Self::try_new_unchecked`] returns an error.
///
/// # Safety
/// Same contract as [`Self::try_new_unchecked`]; the arguments are forwarded unchanged.
pub unsafe fn new_unchecked(
data_type: DataType,
offsets: Buffer<O>,
values: Buffer<u8>,
validity: Option<Bitmap>,
) -> Self {
Self::try_new_unchecked(data_type, offsets, values, validity).unwrap()
}
/// Alias for [`Self::new_unchecked`]: forwards the arguments unchanged.
///
/// # Panics
/// Panics under the same conditions as [`Self::new_unchecked`].
///
/// # Safety
/// Same contract as [`Self::new_unchecked`].
pub unsafe fn from_data_unchecked(
data_type: DataType,
offsets: Buffer<O>,
values: Buffer<u8>,
validity: Option<Bitmap>,
) -> Self {
Self::new_unchecked(data_type, offsets, values, validity)
}
}
impl<O: Offset> Utf8Array<O> {
/// Returns a new [`Utf8Array`] containing the `length` entries starting at `offset`.
///
/// The values buffer is shared with `self` (cloned handle); only offsets and validity are
/// narrowed.
///
/// # Panics
/// Panics if `offset + length > self.len()`, including when `offset + length` overflows
/// `usize`.
#[must_use]
pub fn slice(&self, offset: usize, length: usize) -> Self {
    // A plain `offset + length` wraps on overflow in release builds, which would let an
    // out-of-range request pass the check and reach the unchecked slicing below.
    let in_bounds = offset
        .checked_add(length)
        .map_or(false, |end| end <= self.len());
    assert!(
        in_bounds,
        "the offset of the new Buffer cannot exceed the existing length"
    );
    // SAFETY: the assertion above guarantees `offset + length <= self.len()`.
    unsafe { self.slice_unchecked(offset, length) }
}
/// Returns a new [`Utf8Array`] containing the `length` entries starting at `offset`, without
/// checking bounds.
///
/// The values buffer is shared with `self` (cloned handle); the offsets are narrowed to
/// `length + 1` elements and the validity bitmap, if any, to `length` bits.
///
/// # Safety
/// The caller must ensure that `offset + length <= self.len()`.
#[must_use]
pub unsafe fn slice_unchecked(&self, offset: usize, length: usize) -> Self {
    // `length` strings are delimited by `length + 1` offsets.
    let offsets = self.offsets.clone().slice_unchecked(offset, length + 1);
    let validity = self
        .validity
        .as_ref()
        .map(|bitmap| bitmap.clone().slice_unchecked(offset, length));
    Self {
        data_type: self.data_type.clone(),
        offsets,
        values: self.values.clone(),
        validity,
    }
}
pub fn with_validity(&self, validity: Option<Bitmap>) -> Self {
if matches!(&validity, Some(bitmap) if bitmap.len() != self.len()) {
panic!("validity should be as least as large as the array")
}
let mut arr = self.clone();
arr.validity = validity;
arr
}
/// Attempts to convert this array into a [`MutableUtf8Array`].
///
/// Returns `Right` with the mutable array only when the validity bitmap (if any), the values
/// and the offsets all yield `Right` from their own `into_mut`. Otherwise returns `Left` with
/// an array rebuilt from the same components.
///
/// NOTE(review): presumably a component yields `Left` when its storage is shared — confirm
/// against `Bitmap::into_mut` / `Buffer::into_mut`.
pub fn into_mut(self) -> Either<Self, MutableUtf8Array<O>> {
    use Either::*;

    // Resolve the validity first: if it cannot be made mutable, values and offsets are
    // returned untouched without attempting their conversion.
    let validity = match self.validity {
        Some(bitmap) => match bitmap.into_mut() {
            Left(bitmap) => {
                // SAFETY: all components come unchanged from `self`, an existing array.
                return Left(unsafe {
                    Self::new_unchecked(
                        self.data_type,
                        self.offsets,
                        self.values,
                        Some(bitmap),
                    )
                });
            }
            Right(mutable_bitmap) => Some(mutable_bitmap),
        },
        None => None,
    };

    // SAFETY (all `unsafe` blocks below): the components come from `self`, an existing
    // array, and are only converted between their immutable and mutable containers, so
    // re-validating the UTF-8 content is unnecessary.
    match (self.values.into_mut(), self.offsets.into_mut()) {
        (Left(immutable_values), Left(immutable_offsets)) => Left(unsafe {
            Self::new_unchecked(
                self.data_type,
                immutable_offsets,
                immutable_values,
                validity.map(Into::into),
            )
        }),
        (Left(immutable_values), Right(mutable_offsets)) => Left(unsafe {
            Self::new_unchecked(
                self.data_type,
                mutable_offsets.into(),
                immutable_values,
                validity.map(Into::into),
            )
        }),
        // This arm previously used the checked `from_data` when there was no validity,
        // re-validating the UTF-8 content inside a needless `unsafe` block.
        (Right(mutable_values), Left(immutable_offsets)) => Left(unsafe {
            Self::new_unchecked(
                self.data_type,
                immutable_offsets,
                mutable_values.into(),
                validity.map(Into::into),
            )
        }),
        (Right(mutable_values), Right(mutable_offsets)) => Right(MutableUtf8Array::from_data(
            self.data_type,
            mutable_offsets,
            mutable_values,
            validity,
        )),
    }
}
}
impl<O: Offset> Utf8Array<O> {
    /// Returns the number of strings in this array, i.e. `offsets.len() - 1`.
    #[inline]
    pub fn len(&self) -> usize {
        self.offsets.len() - 1
    }

    /// Returns the string at position `i` without any bounds checks.
    ///
    /// # Safety
    /// The caller must ensure that `i < self.len()`.
    pub unsafe fn value_unchecked(&self, i: usize) -> &str {
        let begin = self.offsets.get_unchecked(i).to_usize();
        let finish = self.offsets.get_unchecked(i + 1).to_usize();
        std::str::from_utf8_unchecked(self.values.get_unchecked(begin..finish))
    }

    /// Returns the string at position `i`.
    ///
    /// # Panics
    /// Panics if `i >= self.len()` (the offset lookups are bounds-checked).
    pub fn value(&self, i: usize) -> &str {
        let (begin, finish) = (self.offsets[i].to_usize(), self.offsets[i + 1].to_usize());
        // SAFETY: relies on the invariant established at construction that every pair of
        // consecutive offsets delimits a valid UTF-8 range of `values`.
        unsafe {
            let bytes = self.values.get_unchecked(begin..finish);
            std::str::from_utf8_unchecked(bytes)
        }
    }

    /// Returns the optional validity bitmap.
    #[inline]
    pub fn validity(&self) -> Option<&Bitmap> {
        self.validity.as_ref()
    }

    /// Returns the offsets buffer.
    #[inline]
    pub fn offsets(&self) -> &Buffer<O> {
        &self.offsets
    }

    /// Returns the buffer holding the concatenated bytes of all strings.
    #[inline]
    pub fn values(&self) -> &Buffer<u8> {
        &self.values
    }
}
/// [`Array`] implementation that forwards to the inherent methods of [`Utf8Array`].
///
/// Calls such as `self.len()` or `self.slice(..)` below resolve to the inherent methods
/// (inherent methods take precedence over trait methods), so they do not recurse.
impl<O: Offset> Array for Utf8Array<O> {
#[inline]
fn as_any(&self) -> &dyn std::any::Any {
self
}
#[inline]
fn len(&self) -> usize {
// Inherent `Utf8Array::len`.
self.len()
}
#[inline]
fn data_type(&self) -> &DataType {
&self.data_type
}
fn validity(&self) -> Option<&Bitmap> {
self.validity.as_ref()
}
// Panics under the same conditions as the inherent `slice`.
fn slice(&self, offset: usize, length: usize) -> Box<dyn Array> {
Box::new(self.slice(offset, length))
}
// Safety contract is that of the inherent `slice_unchecked`.
unsafe fn slice_unchecked(&self, offset: usize, length: usize) -> Box<dyn Array> {
Box::new(self.slice_unchecked(offset, length))
}
// Panics under the same conditions as the inherent `with_validity`.
fn with_validity(&self, validity: Option<Bitmap>) -> Box<dyn Array> {
Box::new(self.with_validity(validity))
}
}
/// Exposes the raw values and offsets of [`Utf8Array`] as slices.
///
/// NOTE(review): the safety contract of `GenericBinaryArray` is defined with the trait and is
/// not visible here; presumably it requires the offsets to be valid for `values` — confirm.
unsafe impl<O: Offset> GenericBinaryArray<O> for Utf8Array<O> {
#[inline]
fn values(&self) -> &[u8] {
// Inherent `Utf8Array::values`; the returned `&Buffer<u8>` is coerced to `&[u8]`.
self.values()
}
#[inline]
fn offsets(&self) -> &[O] {
// Inherent `Utf8Array::offsets`; the returned `&Buffer<O>` is coerced to `&[O]`.
self.offsets()
}
}