mod vlen_codec;
mod vlen_partial_decoder;
use std::num::NonZeroU64;
use std::sync::Arc;
use super::bytes::reverse_endianness;
use crate::array::{
ArrayBytesRaw, ChunkShape, ChunkShapeTraits, CodecChain, Endianness, convert_from_bytes_slice,
data_type,
};
use itertools::Itertools;
pub use vlen_codec::VlenCodec;
use zarrs_codec::{
ArrayToBytesCodecTraits, Codec, CodecError, CodecOptions, CodecPluginV3, CodecTraitsV3,
InvalidBytesLengthError,
};
use zarrs_data_type::FillValue;
use zarrs_metadata::v3::MetadataV3;
pub use zarrs_metadata_ext::codec::vlen::{
VlenCodecConfiguration, VlenCodecConfigurationV0, VlenCodecConfigurationV0_1,
};
use zarrs_metadata_ext::codec::vlen::{VlenIndexDataType, VlenIndexLocation};
use zarrs_plugin::PluginCreateError;
// Declare the extension aliases of `VlenCodec` for Zarr V3: the `zarrs.vlen` name and the
// codec URL listed below.
// NOTE(review): how the macro treats the first string versus the bracketed list is defined in
// `zarrs_plugin` and is not visible here — confirm before changing either entry.
zarrs_plugin::impl_extension_aliases!(VlenCodec,
v3: "zarrs.vlen", ["https://codec.zarrs.dev/array_to_bytes/vlen"]
);
// Submit a V3 codec plugin entry for `VlenCodec` through `inventory`
// (presumably so the codec can be looked up by name at runtime — confirm against the registry).
inventory::submit! {
CodecPluginV3::new::<VlenCodec>()
}
impl CodecTraitsV3 for VlenCodec {
    /// Create a `vlen` codec from Zarr V3 `metadata`.
    ///
    /// Calls `crate::warn_experimental_extension` with the metadata name, parses the metadata
    /// configuration as a [`VlenCodecConfiguration`], and wraps the resulting [`VlenCodec`] in
    /// [`Codec::ArrayToBytes`].
    ///
    /// # Errors
    /// Returns a [`PluginCreateError`] if the configuration cannot be parsed or if
    /// `VlenCodec::new_with_configuration` rejects it.
    fn create(metadata: &MetadataV3) -> Result<Codec, PluginCreateError> {
        crate::warn_experimental_extension(metadata.name(), "codec");
        let parsed: VlenCodecConfiguration = metadata.to_typed_configuration()?;
        let vlen = VlenCodec::new_with_configuration(&parsed)?;
        Ok(Codec::ArrayToBytes(Arc::new(vlen)))
    }
}
/// Decode a `vlen` encoded chunk into its data bytes and the byte offsets of each element.
///
/// The encoded chunk consists of three parts: an 8-byte little-endian length of the encoded
/// index, the encoded index, and the encoded data. With [`VlenIndexLocation::Start`] the length
/// field and the index precede the data; with [`VlenIndexLocation::End`] the data comes first,
/// followed by the index and finally the length field.
///
/// The index is decoded with `index_codecs` as a one-dimensional array of
/// `shape.num_elements_u64() + 1` offsets of type `index_data_type`. Its last entry is the
/// expected length of the decoded data. The data is decoded with `data_codecs` as a
/// one-dimensional `uint8` array of that length; if that length is zero, `data_codecs` is not
/// invoked and the data is empty.
///
/// Returns `(data, offsets)`.
///
/// # Errors
/// Returns a [`CodecError`] if
/// - `bytes` is shorter than the 8-byte index length field,
/// - the stored index length does not fit in `usize` or exceeds the bytes that remain after
///   the length field,
/// - either codec chain fails to decode or `into_fixed` fails on its output,
/// - the decoded index is empty,
/// - the decoded data length differs from the last offset, or
/// - the offsets decrease or point past the end of the data.
fn get_vlen_bytes_and_offsets(
    bytes: &ArrayBytesRaw,
    shape: &[NonZeroU64],
    index_data_type: VlenIndexDataType,
    index_codecs: &CodecChain,
    data_codecs: &CodecChain,
    index_location: VlenIndexLocation,
    options: &CodecOptions,
) -> Result<(Vec<u8>, Vec<usize>), CodecError> {
    // One offset per element, plus a trailing offset that marks the end of the data.
    let index_shape = ChunkShape::from(vec![
        NonZeroU64::try_from(shape.num_elements_u64() + 1).unwrap(),
    ]);
    let (data_type, fill_value) = match index_data_type {
        VlenIndexDataType::UInt32 => (data_type::uint32(), FillValue::from(0u32)),
        VlenIndexDataType::UInt64 => (data_type::uint64(), FillValue::from(0u64)),
    };

    // Split off the fixed-size field holding the length of the encoded index.
    if bytes.len() < size_of::<u64>() {
        return Err(InvalidBytesLengthError::new(bytes.len(), size_of::<u64>()).into());
    }
    let (bytes_index_len, bytes_main) = match index_location {
        VlenIndexLocation::Start => bytes.split_at(size_of::<u64>()),
        VlenIndexLocation::End => {
            let (bytes_main, bytes_index_len) = bytes.split_at(bytes.len() - size_of::<u64>());
            (bytes_index_len, bytes_main)
        }
    };
    // `bytes_index_len` is exactly `size_of::<u64>()` bytes long, so this conversion cannot fail.
    let index_len = u64::from_le_bytes(bytes_index_len.try_into().unwrap());
    let index_len = usize::try_from(index_len)
        .map_err(|_| CodecError::Other("index length exceeds usize::MAX".to_string()))?;

    // The index length is read from the encoded chunk, so it may be corrupt. Reject a length
    // that does not fit in the remaining bytes here; otherwise `split_at` would panic and the
    // subtraction below would underflow.
    if index_len > bytes_main.len() {
        return Err(CodecError::Other(format!(
            "vlen index length {index_len} exceeds the {} encoded bytes available",
            bytes_main.len()
        )));
    }
    let (index_enc, data_enc) = match index_location {
        VlenIndexLocation::Start => bytes_main.split_at(index_len),
        VlenIndexLocation::End => {
            let (bytes_data, bytes_index) = bytes_main.split_at(bytes_main.len() - index_len);
            (bytes_index, bytes_data)
        }
    };

    // Decode the index into raw bytes, then reinterpret them as offsets.
    let mut index = index_codecs
        .decode(
            index_enc.into(),
            &index_shape,
            &data_type,
            &fill_value,
            options,
        )?
        .into_fixed()?;
    if Endianness::Big.is_native() {
        // NOTE(review): the element width used here is always `uint64`, even when the index
        // data type is `UInt32`. Confirm against the encoder before changing it.
        reverse_endianness(index.to_mut(), &data_type::uint64());
    }
    let index = match index_data_type {
        VlenIndexDataType::UInt32 => {
            let index = convert_from_bytes_slice::<u32>(&index);
            offsets_u32_to_usize(index)
        }
        VlenIndexDataType::UInt64 => {
            let index = convert_from_bytes_slice::<u64>(&index);
            offsets_u64_to_usize(index)
        }
    };

    // The final offset is the total length of the decoded data.
    let Some(&data_len_expected) = index.last() else {
        return Err(CodecError::Other(
            "Index is empty? It should have at least one element".to_string(),
        ));
    };
    // A zero-length array shape is not representable, so skip decoding when no data is expected.
    let data = if let Ok(data_len_expected) = NonZeroU64::try_from(data_len_expected as u64) {
        data_codecs
            .decode(
                data_enc.into(),
                &[data_len_expected],
                &data_type::uint8(),
                &0u8.into(),
                options,
            )?
            .into_fixed()?
            .into_owned()
    } else {
        vec![]
    };
    let data_len = data.len();
    if data_len != data_len_expected {
        return Err(CodecError::Other(format!(
            "Expected data length {data_len_expected} does not match data length {data_len}"
        )));
    }

    // Offsets must be non-decreasing and must stay within the decoded data.
    for (curr, next) in index.iter().tuple_windows() {
        if next < curr || *next > data_len {
            return Err(CodecError::Other(
                "Invalid bytes offsets in vlen Offset64 encoded chunk".to_string(),
            ));
        }
    }
    Ok((data, index))
}
/// Convert a vector of `u32` offsets into `usize` offsets.
///
/// When `u32` and `usize` have the same size, the vector is handed to
/// `bytemuck::allocation::cast_vec`; otherwise each offset is converted individually.
///
/// # Panics
/// Panics if an offset cannot be represented as a `usize`.
/// NOTE(review): `cast_vec` may also panic if it rejects the cast — see the bytemuck docs.
fn offsets_u32_to_usize(offsets: Vec<u32>) -> Vec<usize> {
    let same_width = size_of::<usize>() == size_of::<u32>();
    if !same_width {
        // Different widths: convert element by element.
        return offsets
            .into_iter()
            .map(|value| usize::try_from(value).unwrap())
            .collect();
    }
    bytemuck::allocation::cast_vec(offsets)
}
/// Convert a vector of `u64` offsets into `usize` offsets.
///
/// When `u64` and `usize` have the same size, the vector is handed to
/// `bytemuck::allocation::cast_vec`; otherwise each offset is converted individually.
///
/// # Panics
/// Panics if an offset cannot be represented as a `usize`, which can happen when `usize` is
/// narrower than 64 bits.
/// NOTE(review): `cast_vec` may also panic if it rejects the cast — see the bytemuck docs.
fn offsets_u64_to_usize(offsets: Vec<u64>) -> Vec<usize> {
    let same_width = size_of::<usize>() == size_of::<u64>();
    if !same_width {
        // Different widths: convert element by element.
        return offsets
            .into_iter()
            .map(|value| usize::try_from(value).unwrap())
            .collect();
    }
    bytemuck::allocation::cast_vec(offsets)
}