use std::mem::MaybeUninit;
use std::num::NonZeroUsize;
use std::sync::{Arc, OnceLock};
use lru::LruCache;
use ndarray::{ArrayD, IxDyn};
use parking_lot::Mutex;
#[cfg(feature = "rayon")]
use rayon::prelude::*;
use smallvec::SmallVec;
use crate::attribute_api::{
collect_attribute_messages_storage, decode_string, read_one_vlen_string_storage,
resolve_vlen_bytes_storage, Attribute,
};
use crate::cache::{ChunkCache, ChunkCacheStats, ChunkKey};
use crate::chunk_index;
use crate::datatype_api::{dtype_element_size, H5Type};
use crate::error::{ByteOrder, Error, Result};
use crate::filters::{self, FilterRegistry};
use crate::io::Cursor;
use crate::local_heap::LocalHeap;
use crate::messages::attribute::AttributeMessage;
use crate::messages::dataspace::{DataspaceMessage, DataspaceType};
use crate::messages::datatype::{Datatype, StringSize, VarLenKind};
use crate::messages::external_files::ExternalFilesMessage;
use crate::messages::fill_value::{FillTime, FillValueMessage};
use crate::messages::filter_pipeline::FilterPipelineMessage;
use crate::messages::layout::{ChunkIndexing, DataLayout};
use crate::messages::HdfMessage;
use crate::object_header::ObjectHeader;
use crate::storage::DynStorage;
use crate::FileContext;
/// Size limit of 32 MiB (`32 * 1024 * 1024` bytes).
/// NOTE(review): the code consulting this limit is outside this view; judging by the name it caps
/// how large a fully materialized dataset may be before it is kept in the hot cache — confirm.
const HOT_FULL_DATASET_CACHE_MAX_BYTES: usize = 32 * 1024 * 1024;
/// Raw pointer plus length describing a flat destination buffer.
///
/// The type is `Copy` and carries no lifetime, so nothing in the type system ties it to the
/// allocation it points at.
#[derive(Clone, Copy)]
struct FlatBufferPtr {
/// Start of the destination buffer.
ptr: *mut u8,
/// Length forwarded together with `ptr` to the copy helpers.
/// NOTE(review): presumably the buffer size in bytes — confirm at the construction sites.
len: usize,
}
/// Borrowed bundle of geometry arguments handed to the chunk-to-flat-buffer copy helper.
/// NOTE(review): field meanings below are inferred from names; the consuming function is not in view.
#[derive(Clone, Copy)]
struct ChunkCopyLayout<'a> {
/// Presumably the chunk's starting coordinates within the dataset.
chunk_offsets: &'a [u64],
/// Presumably the chunk extent per dimension.
chunk_shape: &'a [u64],
/// Presumably the dataset extent per dimension.
dataset_shape: &'a [u64],
/// Presumably element strides of the destination (dataset) buffer.
dataset_strides: &'a [usize],
/// Presumably element strides inside one chunk.
chunk_strides: &'a [usize],
/// Presumably the size of one element in bytes.
elem_size: usize,
}
/// Borrowed bundle of arguments handed to the unit-stride chunk-overlap copy helper.
/// NOTE(review): field meanings below are inferred from names; the consuming function is not in view.
#[derive(Clone, Copy)]
struct UnitStrideCopyLayout<'a> {
/// Presumably the chunk's starting coordinates within the dataset.
chunk_offsets: &'a [u64],
/// Presumably the chunk extent per dimension.
chunk_shape: &'a [u64],
/// Presumably the dataset extent per dimension.
dataset_shape: &'a [u64],
/// The normalized selection being read.
resolved: &'a ResolvedSelection,
/// Presumably element strides inside one chunk.
chunk_strides: &'a [usize],
/// Presumably element strides of the result buffer.
result_strides: &'a [usize],
/// Presumably the size of one element in bytes.
elem_size: usize,
}
/// Borrowed bundle of stride and size arguments for a direct contiguous-slice read.
/// NOTE(review): field meanings below are inferred from names; the consuming function is not in view.
#[derive(Clone, Copy)]
struct ContiguousSliceDirectLayout<'a> {
/// Presumably element strides of the stored dataset.
dataset_strides: &'a [usize],
/// Presumably element strides of the result buffer.
result_strides: &'a [usize],
/// Presumably the size of one element in bytes.
elem_size: usize,
/// Presumably the total size of the result buffer in bytes.
result_total_bytes: usize,
}
/// One resolved slot of externally stored raw data.
/// NOTE(review): field meanings below are inferred from names; the code filling and reading
/// these slots is not in view.
#[derive(Clone)]
struct ResolvedExternalRawSlot {
/// Presumably where this slot starts within the dataset's logical byte stream.
logical_offset: u64,
/// Storage handle the slot's bytes are read from.
storage: DynStorage,
/// Presumably the byte offset of the slot inside `storage`.
file_offset: u64,
/// Presumably the slot length in bytes.
size: u64,
}
/// Input wrapper for [`Dataset::from_parsed_header`], carrying the shared file context.
pub(crate) struct DatasetParseContext {
/// File-level state (storage, superblock, caches, filter registry) the dataset will share.
pub(crate) context: Arc<FileContext>,
}
/// Borrowed description of which chunk entries to collect, passed to `collect_chunk_entries`.
#[derive(Clone, Copy)]
struct ChunkEntrySelection<'a> {
/// Dataset extent per dimension.
shape: &'a [u64],
/// Dataset rank.
ndim: usize,
/// Raw element size in bytes.
elem_size: usize,
/// Optional pair of bound slices; `None` is what `iter_chunks` passes.
/// NOTE(review): presumably (first chunk index, last chunk index) per dimension — confirm.
chunk_bounds: Option<(&'a [u64], &'a [u64])>,
}
// SAFETY: NOTE(review) — `FlatBufferPtr` wraps a raw `*mut u8`, which is neither `Send` nor `Sync`
// on its own. These impls assert that moving/sharing the pointer across threads is sound; the
// visible code does not show what upholds that (presumably concurrent writers touch disjoint
// regions of the buffer and the allocation outlives them) — confirm at the call sites.
unsafe impl Send for FlatBufferPtr {}
unsafe impl Sync for FlatBufferPtr {}
impl FlatBufferPtr {
    /// Copies one decoded chunk into this buffer by forwarding to
    /// `copy_chunk_to_flat_with_strides_ptr`.
    ///
    /// # Safety
    /// NOTE(review): the callee is outside this view; the caller must uphold whatever contract
    /// it places on the pointer and length held by `self`.
    #[cfg(feature = "rayon")]
    #[inline(always)]
    unsafe fn copy_chunk(self, chunk_data: &[u8], layout: ChunkCopyLayout<'_>) -> Result<()> {
        let destination = self;
        copy_chunk_to_flat_with_strides_ptr(chunk_data, destination, layout)
    }

    /// Copies selected elements of a chunk into this buffer by forwarding the raw pointer and
    /// length to `copy_selected_elements_ptr`.
    ///
    /// # Safety
    /// NOTE(review): the callee is outside this view; the caller must uphold whatever contract
    /// it places on the pointer and length held by `self`.
    #[cfg(feature = "rayon")]
    #[inline(always)]
    unsafe fn copy_selected(
        self,
        chunk_data: &[u8],
        dim_indices: &[Vec<(usize, usize)>],
        chunk_strides: &[usize],
        result_strides: &[usize],
        elem_size: usize,
        ndim: usize,
    ) -> Result<()> {
        let Self { ptr, len } = self;
        copy_selected_elements_ptr(
            chunk_data,
            ptr,
            len,
            dim_indices,
            chunk_strides,
            result_strides,
            elem_size,
            ndim,
        )
    }

    /// Copies the part of a chunk overlapping a unit-stride selection by forwarding to
    /// `copy_unit_stride_chunk_overlap_ptr`.
    ///
    /// # Safety
    /// NOTE(review): the callee is outside this view; the caller must uphold whatever contract
    /// it places on the pointer and length held by `self`.
    #[cfg(feature = "rayon")]
    #[inline(always)]
    unsafe fn copy_unit_stride_chunk_overlap(
        self,
        chunk_data: &[u8],
        layout: UnitStrideCopyLayout<'_>,
    ) -> Result<()> {
        let destination = self;
        copy_unit_stride_chunk_overlap_ptr(chunk_data, destination, layout)
    }
}
/// A hyperslab-style selection over a dataset: one [`SliceInfoElem`] per dataset dimension.
///
/// Reads reject a selection whose length differs from the dataset rank.
#[derive(Debug, Clone)]
pub struct SliceInfo {
/// Per-dimension selections, in dimension order.
pub selections: Vec<SliceInfoElem>,
}
/// Selection along a single dimension.
#[derive(Debug, Clone)]
pub enum SliceInfoElem {
/// Selects one position; the dimension is dropped from the result shape.
Index(u64),
/// Selects positions `start, start + step, ...` below `end`. `end` is clamped to the dimension
/// size (so `u64::MAX` means "to the end") and `step` must be non-zero.
Slice { start: u64, end: u64, step: u64 },
}
/// One dimension of a selection after validation against the dataset shape.
#[derive(Clone, Debug)]
struct ResolvedSelectionDim {
/// First selected position.
start: u64,
/// Exclusive upper bound, already clamped to the dimension size.
end: u64,
/// Distance between selected positions (1 for an index selection).
step: u64,
/// Number of selected positions (1 for an index selection, possibly 0 for a slice).
count: usize,
}
/// Hashable key type of the per-dataset `chunk_entry_cache` LRU.
/// NOTE(review): presumably identifies a chunk-index lookup by index address plus the inclusive
/// range of chunk-grid coordinates requested — confirm where keys are built.
#[derive(Clone, Debug, PartialEq, Eq, Hash)]
struct ChunkEntryCacheKey {
/// Address of the chunk index.
index_address: u64,
/// Per-dimension values for the first chunk of the range.
first_chunk: SmallVec<[u64; 4]>,
/// Per-dimension values for the last chunk of the range.
last_chunk: SmallVec<[u64; 4]>,
}
impl ResolvedSelectionDim {
    /// Returns the inclusive range of chunk-grid indices touched by this dimension, or `None`
    /// when nothing is selected.
    ///
    /// The upper index is derived from `end - 1`, so for `step > 1` it may name a chunk past the
    /// last selected position. `chunk_extent` must be non-zero (division by zero otherwise).
    fn chunk_index_range(&self, chunk_extent: u64) -> Option<(u64, u64)> {
        match self.count {
            0 => None,
            _ => {
                let first = self.start / chunk_extent;
                let last = (self.end - 1) / chunk_extent;
                Some((first, last))
            }
        }
    }
}
/// A [`SliceInfo`] validated and normalized against a concrete dataset shape.
#[derive(Clone, Debug)]
struct ResolvedSelection {
/// One entry per dataset dimension, including index selections.
dims: Vec<ResolvedSelectionDim>,
/// Extents of the result array; index selections contribute no entry.
result_shape: Vec<usize>,
/// Product of the slice-selection counts (1 when every dimension is an index).
result_elements: usize,
}
impl ResolvedSelection {
    /// Returns the selected count of every dimension, keeping index-selected dimensions
    /// (which report a count of 1) instead of dropping them as `result_shape` does.
    fn result_dims_with_collapsed(&self) -> Vec<usize> {
        let mut counts = Vec::with_capacity(self.dims.len());
        counts.extend(self.dims.iter().map(|dim| dim.count));
        counts
    }

    /// Reports whether every dimension is selected with a step of exactly 1.
    fn is_unit_stride(&self) -> bool {
        !self.dims.iter().any(|dim| dim.step != 1)
    }
}
impl SliceInfo {
    /// Builds a selection covering every element of an `ndim`-dimensional dataset.
    ///
    /// Each dimension gets `Slice { start: 0, end: u64::MAX, step: 1 }`; the open-ended `end`
    /// is clamped to the real extent when the selection is normalized.
    pub fn all(ndim: usize) -> Self {
        let whole_dimension = SliceInfoElem::Slice {
            start: 0,
            end: u64::MAX,
            step: 1,
        };
        Self {
            selections: vec![whole_dimension; ndim],
        }
    }
}
fn checked_usize(value: u64, context: &str) -> Result<usize> {
usize::try_from(value).map_err(|_| {
Error::InvalidData(format!(
"{context} value {value} exceeds platform usize capacity"
))
})
}
fn checked_mul_usize(lhs: usize, rhs: usize, context: &str) -> Result<usize> {
lhs.checked_mul(rhs)
.ok_or_else(|| Error::InvalidData(format!("{context} exceeds platform usize capacity")))
}
fn checked_add_usize(lhs: usize, rhs: usize, context: &str) -> Result<usize> {
lhs.checked_add(rhs)
.ok_or_else(|| Error::InvalidData(format!("{context} exceeds platform usize capacity")))
}
fn checked_mul_u64(lhs: u64, rhs: u64, context: &str) -> Result<u64> {
lhs.checked_mul(rhs)
.ok_or_else(|| Error::InvalidData(format!("{context} exceeds u64 capacity")))
}
fn checked_add_u64(lhs: u64, rhs: u64, context: &str) -> Result<u64> {
lhs.checked_add(rhs)
.ok_or_else(|| Error::InvalidData(format!("{context} exceeds u64 capacity")))
}
/// Returns the number of elements described by `shape` as a `usize`.
///
/// An empty shape yields 1. Fails with `Error::InvalidData` if any extent, or the running
/// product, does not fit in `usize`.
fn checked_shape_elements_usize(shape: &[u64], context: &str) -> Result<usize> {
    shape.iter().try_fold(1usize, |total, &dim| {
        let extent = checked_usize(dim, context)?;
        checked_mul_usize(total, extent, context)
    })
}
/// Computes the inclusive chunk-grid bounds that cover the whole dataset.
///
/// Returns `Ok(None)` when any dataset extent is zero; otherwise the first chunk index is all
/// zeros and the last is `ceil(dim / chunk) - 1` per dimension. Fails if the chunk shape is
/// rejected by `validate_chunk_shape`.
fn full_dataset_chunk_bounds(
    shape: &[u64],
    chunk_shape: &[u64],
) -> Result<Option<(Vec<u64>, Vec<u64>)>> {
    validate_chunk_shape(shape, chunk_shape)?;
    if shape.iter().any(|&extent| extent == 0) {
        return Ok(None);
    }
    let mut last_chunk: Vec<u64> = Vec::with_capacity(shape.len());
    for (&dim, &chunk) in shape.iter().zip(chunk_shape) {
        // `dim` is non-zero here, so `div_ceil` is at least 1 and the subtraction cannot wrap.
        last_chunk.push(dim.div_ceil(chunk) - 1);
    }
    let first_chunk = vec![0u64; shape.len()];
    Ok(Some((first_chunk, last_chunk)))
}
fn validate_chunk_shape(shape: &[u64], chunk_shape: &[u64]) -> Result<()> {
if chunk_shape.len() != shape.len() {
return Err(Error::InvalidData(format!(
"chunk rank {} does not match dataset rank {}",
chunk_shape.len(),
shape.len()
)));
}
if let Some((dim, _)) = chunk_shape
.iter()
.enumerate()
.find(|(_, chunk)| **chunk == 0)
{
return Err(Error::InvalidData(format!(
"chunk dimension {dim} has zero extent"
)));
}
Ok(())
}
fn validate_decoded_chunk_len(
entry: &chunk_index::ChunkEntry,
chunk_shape: &[u64],
elem_size: usize,
actual_len: usize,
) -> Result<()> {
let chunk_elements = checked_shape_elements_usize(chunk_shape, "decoded chunk element count")?;
let expected_len = checked_mul_usize(chunk_elements, elem_size, "decoded chunk byte length")?;
if actual_len != expected_len {
return Err(Error::InvalidData(format!(
"chunk at offsets {:?} decoded to {} bytes, expected {} bytes",
entry.offsets, actual_len, expected_len
)));
}
Ok(())
}
fn validate_chunk_grid_coverage(
entries: &mut [chunk_index::ChunkEntry],
shape: &[u64],
chunk_shape: &[u64],
first_chunk: &[u64],
last_chunk: &[u64],
) -> Result<bool> {
validate_chunk_shape(shape, chunk_shape)?;
if first_chunk.len() != shape.len() || last_chunk.len() != shape.len() {
return Err(Error::InvalidData(format!(
"chunk grid bounds rank does not match dataset rank {}",
shape.len()
)));
}
if shape.contains(&0) {
if entries.is_empty() {
return Ok(true);
}
return Err(Error::InvalidData(
"chunk index contains entries for an empty dataset".into(),
));
}
for dim in 0..shape.len() {
if first_chunk[dim] > last_chunk[dim] {
return Err(Error::InvalidData(format!(
"invalid chunk grid bounds for dimension {dim}: {} > {}",
first_chunk[dim], last_chunk[dim]
)));
}
}
entries.sort_by(|a, b| a.offsets.cmp(&b.offsets));
for i in 0..entries.len() {
validate_chunk_entry_offsets(&entries[i], shape, chunk_shape, first_chunk, last_chunk)?;
if i > 0 && entries[i].offsets == entries[i - 1].offsets {
return Err(Error::InvalidData(format!(
"duplicate chunk output offsets {:?} (addresses {:#x} and {:#x})",
entries[i].offsets,
entries[i - 1].address,
entries[i].address
)));
}
}
let mut entry_idx = 0usize;
let mut expected = first_chunk.to_vec();
loop {
let expected_offsets: Vec<u64> = expected
.iter()
.enumerate()
.map(|(dim, chunk_index)| chunk_index * chunk_shape[dim])
.collect();
if entry_idx >= entries.len() || entries[entry_idx].offsets != expected_offsets {
return Ok(false);
}
entry_idx += 1;
if !advance_chunk_index(&mut expected, first_chunk, last_chunk) {
break;
}
}
Ok(entry_idx == entries.len())
}
/// Checks a single chunk entry's offsets against the dataset and the requested grid window.
///
/// For every dimension the offset must be inside the dataset, a multiple of the chunk extent,
/// and map to a chunk index within `first_chunk..=last_chunk`. Any violation, or a rank
/// mismatch, yields `Error::InvalidData`. `chunk_shape` extents must be non-zero.
fn validate_chunk_entry_offsets(
    entry: &chunk_index::ChunkEntry,
    shape: &[u64],
    chunk_shape: &[u64],
    first_chunk: &[u64],
    last_chunk: &[u64],
) -> Result<()> {
    let rank = shape.len();
    if entry.offsets.len() != rank {
        return Err(Error::InvalidData(format!(
            "chunk at address {:#x} has rank {}, expected {}",
            entry.address,
            entry.offsets.len(),
            rank
        )));
    }
    for (dim, &offset) in entry.offsets.iter().enumerate() {
        let dim_size = shape[dim];
        let chunk_extent = chunk_shape[dim];
        if offset >= dim_size {
            return Err(Error::InvalidData(format!(
                "chunk at address {:#x} has out-of-bounds offset {} for dimension {} of size {}",
                entry.address, offset, dim, dim_size
            )));
        }
        if offset % chunk_extent != 0 {
            return Err(Error::InvalidData(format!(
                "chunk at address {:#x} has non-grid offset {} for dimension {} with chunk extent {}",
                entry.address, offset, dim, chunk_extent
            )));
        }
        let chunk_index = offset / chunk_extent;
        let inside_window = first_chunk[dim] <= chunk_index && chunk_index <= last_chunk[dim];
        if !inside_window {
            return Err(Error::InvalidData(format!(
                "chunk at address {:#x} has offset {:?} outside requested chunk grid",
                entry.address, entry.offsets
            )));
        }
    }
    Ok(())
}
/// Advances `index` to the next chunk-grid cell in row-major order within the inclusive window
/// `first_chunk..=last_chunk`.
///
/// The last dimension that can still grow is incremented and every dimension after it is reset
/// to its `first_chunk` value. Returns `false` (leaving `index` untouched) when `index` is
/// empty or already at the last cell.
fn advance_chunk_index(index: &mut [u64], first_chunk: &[u64], last_chunk: &[u64]) -> bool {
    let rank = index.len();
    let Some(dim) = (0..rank).rev().find(|&d| index[d] < last_chunk[d]) else {
        return false;
    };
    index[dim] += 1;
    let tail = dim + 1;
    if tail < rank {
        index[tail..].copy_from_slice(&first_chunk[tail..]);
    }
    true
}
/// Computes row-major (C-order) element strides for `shape`.
///
/// The last dimension has stride 1 and each earlier stride is the product of all later
/// extents. An empty shape yields an empty vector. Fails with `Error::InvalidData` when an
/// extent or a stride does not fit in `usize`.
fn row_major_strides(shape: &[u64], context: &str) -> Result<Vec<usize>> {
    let ndim = shape.len();
    let mut strides = vec![1usize; ndim];
    // Fill from the innermost dimension outwards; `axis` is the already-known stride.
    for axis in (1..ndim).rev() {
        let extent = checked_usize(shape[axis], context)?;
        strides[axis - 1] = checked_mul_usize(strides[axis], extent, context)?;
    }
    Ok(strides)
}
/// Reinterprets a `Vec<MaybeUninit<u8>>` as `Vec<u8>` without copying, reusing the allocation.
///
/// The caller is responsible for having initialized every one of the `len` elements; this
/// function cannot check that.
fn assume_init_u8_vec(buffer: Vec<MaybeUninit<u8>>) -> Vec<u8> {
    // Suppress the destructor so ownership of the allocation can move to the new vector.
    let mut parked = std::mem::ManuallyDrop::new(buffer);
    let raw = parked.as_mut_ptr().cast::<u8>();
    let (len, capacity) = (parked.len(), parked.capacity());
    // SAFETY: pointer, length and capacity come from a live `Vec` that will not be dropped, and
    // `MaybeUninit<u8>` has the same size and alignment as `u8`. Reading the elements is only
    // sound if the caller initialized all of them.
    unsafe { Vec::from_raw_parts(raw, len, capacity) }
}
/// Reinterprets a `Vec<MaybeUninit<T>>` as `Vec<T>` without copying, reusing the allocation.
///
/// The caller is responsible for having initialized every one of the `len` elements; this
/// function cannot check that.
fn assume_init_vec<T>(buffer: Vec<MaybeUninit<T>>) -> Vec<T> {
    // Suppress the destructor so ownership of the allocation can move to the new vector.
    let mut parked = std::mem::ManuallyDrop::new(buffer);
    let raw = parked.as_mut_ptr().cast::<T>();
    let (len, capacity) = (parked.len(), parked.capacity());
    // SAFETY: pointer, length and capacity come from a live `Vec` that will not be dropped, and
    // `MaybeUninit<T>` has the same size and alignment as `T`. Reading the elements is only
    // sound if the caller initialized all of them.
    unsafe { Vec::from_raw_parts(raw, len, capacity) }
}
/// Validates `selection` against `shape` and resolves it into concrete per-dimension ranges.
///
/// - The selection must have exactly one element per dataset dimension.
/// - `Index(i)` requires `i < dim_size` and is dropped from the result shape.
/// - `Slice` requires `step != 0` and `start <= dim_size`; `end` is clamped to `dim_size`
///   (so `u64::MAX` means "to the end") and an empty range yields a count of 0.
///
/// Errors are `Error::InvalidData` or `Error::SliceOutOfBounds`.
fn normalize_selection(selection: &SliceInfo, shape: &[u64]) -> Result<ResolvedSelection> {
    let rank = shape.len();
    if selection.selections.len() != rank {
        return Err(Error::InvalidData(format!(
            "slice has {} dimensions but dataset has {}",
            selection.selections.len(),
            rank
        )));
    }
    let mut dims = Vec::with_capacity(rank);
    let mut result_shape = Vec::new();
    let mut result_elements = 1usize;
    for (i, (sel, &dim_size)) in selection.selections.iter().zip(shape).enumerate() {
        let resolved_dim = match *sel {
            SliceInfoElem::Index(idx) => {
                if idx >= dim_size {
                    return Err(Error::SliceOutOfBounds {
                        dim: i,
                        index: idx,
                        size: dim_size,
                    });
                }
                ResolvedSelectionDim {
                    start: idx,
                    end: idx + 1,
                    step: 1,
                    count: 1,
                }
            }
            SliceInfoElem::Slice { start, end, step } => {
                if step == 0 {
                    return Err(Error::InvalidData("slice step cannot be 0".into()));
                }
                if start > dim_size {
                    return Err(Error::SliceOutOfBounds {
                        dim: i,
                        index: start,
                        size: dim_size,
                    });
                }
                // Clamping also resolves the open-ended `u64::MAX` sentinel to `dim_size`.
                let actual_end = end.min(dim_size);
                let count_u64 = match actual_end.checked_sub(start) {
                    Some(span) if span > 0 => span.div_ceil(step),
                    _ => 0,
                };
                let count = checked_usize(count_u64, "slice element count")?;
                result_shape.push(count);
                result_elements =
                    checked_mul_usize(result_elements, count, "slice result element count")?;
                ResolvedSelectionDim {
                    start,
                    end: actual_end,
                    step,
                    count,
                }
            }
        };
        dims.push(resolved_dim);
    }
    Ok(ResolvedSelection {
        dims,
        result_shape,
        result_elements,
    })
}
/// Reports whether `dim` selects the whole dimension of size `dim_size` with step 1, i.e. it
/// starts at 0, ends at `dim_size` and its count equals `dim_size`.
fn selection_dim_is_full_unit(dim: &ResolvedSelectionDim, dim_size: u64) -> bool {
    let spans_everything = dim.start == 0 && dim.end == dim_size;
    let count_matches = matches!(u64::try_from(dim.count), Ok(count) if count == dim_size);
    dim.step == 1 && spans_everything && count_matches
}
/// Reports whether `resolved` selects every element of a dataset with the given `shape`:
/// no dimension was collapsed by an index selection and each one is a full unit-step slice.
fn selection_covers_full_dataset(resolved: &ResolvedSelection, shape: &[u64]) -> bool {
    if resolved.result_shape.len() != shape.len() {
        return false;
    }
    resolved
        .dims
        .iter()
        .zip(shape)
        .all(|(dim, &dim_size)| selection_dim_is_full_unit(dim, dim_size))
}
/// Finds the first dimension of the trailing run that forms one contiguous block in storage.
///
/// Starting from the last dimension (included only when its step is 1), the run is extended
/// to an earlier dimension while that dimension has step 1 and every dimension already in the
/// run is a full unit-step selection. Returns `ndim` when even the last dimension is strided
/// and 0 for a rank-0 shape. `resolved.dims` must have at least `shape.len()` entries.
fn contiguous_slice_tail_start(shape: &[u64], resolved: &ResolvedSelection) -> usize {
    let ndim = shape.len();
    if ndim == 0 {
        return 0;
    }
    let dims = &resolved.dims;
    let mut tail_start = ndim;
    if dims[ndim - 1].step == 1 {
        tail_start -= 1;
    }
    while tail_start > 0 {
        let candidate = tail_start - 1;
        if dims[candidate].step != 1 {
            break;
        }
        let tail_is_full = dims[tail_start..ndim]
            .iter()
            .zip(&shape[tail_start..ndim])
            .all(|(dim, &dim_size)| selection_dim_is_full_unit(dim, dim_size));
        if !tail_is_full {
            break;
        }
        tail_start = candidate;
    }
    tail_start
}
/// Returns the number of selected elements in dimensions `tail_start..`, i.e. the product of
/// their counts (1 when the tail is empty). Fails with `Error::InvalidData` on overflow.
fn contiguous_slice_block_elements(
    resolved: &ResolvedSelection,
    tail_start: usize,
) -> Result<usize> {
    resolved.dims[tail_start..]
        .iter()
        .try_fold(1usize, |elements, dim| {
            checked_mul_usize(elements, dim.count, "contiguous slice block elements")
        })
}
/// Computes row-major element strides for a result array with extents `result_dims`.
///
/// The last stride is 1 and each earlier one is the product of all later extents. Fails with
/// `Error::InvalidData` when a stride overflows `usize`.
fn result_strides_for_dims(result_dims: &[usize]) -> Result<Vec<usize>> {
    let rank = result_dims.len();
    let mut strides = vec![1usize; rank];
    for axis in (1..rank).rev() {
        strides[axis - 1] = checked_mul_usize(strides[axis], result_dims[axis], "result stride")?;
    }
    Ok(strides)
}
/// Handle to one dataset of an open file: its parsed header messages plus shared caches.
///
/// Cloning is cheap with respect to the caches: they are `Arc`-held and shared between clones.
#[derive(Clone)]
pub struct Dataset {
/// Shared file-level state (storage, superblock, ...).
pub(crate) context: Arc<FileContext>,
/// Dataset name; also attached to errors as context.
pub(crate) name: String,
/// Address passed to `from_parsed_header` and returned by `address()`.
pub(crate) data_address: u64,
/// Dataspace message (dimensions, optional maximum dimensions, dataspace type).
pub(crate) dataspace: DataspaceMessage,
/// Element datatype.
pub(crate) datatype: Datatype,
/// Storage layout: compact, contiguous or chunked.
pub(crate) layout: DataLayout,
/// Fill value: the header's fill-value message when it carries a value, otherwise one
/// derived from a single-element `_FillValue` attribute, if present.
pub(crate) fill_value: Option<FillValueMessage>,
/// Filter pipeline message, if the header has one.
pub(crate) filters: Option<FilterPipelineMessage>,
/// External-files message, if the header has one; when set, contiguous reads go through
/// the external-range path.
pub(crate) external_files: Option<ExternalFilesMessage>,
/// Attribute messages collected from the object header.
pub(crate) attributes: Vec<AttributeMessage>,
/// Chunk cache shared with the file context.
pub(crate) chunk_cache: Arc<ChunkCache>,
/// LRU of chunk-entry lists keyed by `ChunkEntryCacheKey` (created with capacity 32).
chunk_entry_cache: Arc<Mutex<LruCache<ChunkEntryCacheKey, Arc<Vec<chunk_index::ChunkEntry>>>>>,
/// Write-once slot for a chunk-entry list.
/// NOTE(review): presumably the entries covering the full dataset — populated outside this view.
full_chunk_entries: Arc<OnceLock<Arc<Vec<chunk_index::ChunkEntry>>>>,
/// Write-once slot for a byte buffer.
/// NOTE(review): presumably the fully materialized dataset bytes — populated outside this view.
full_dataset_bytes: Arc<OnceLock<Arc<Vec<u8>>>>,
/// Write-once slot for resolved external raw-data slots.
external_slots: Arc<OnceLock<Arc<Vec<ResolvedExternalRawSlot>>>>,
/// Filter registry shared with the file context.
pub(crate) filter_registry: Arc<FilterRegistry>,
}
/// One chunk yielded by [`DatasetChunkIterator`]: its position, shape, filter mask and bytes.
pub struct DatasetChunk {
    offsets: Vec<u64>,
    shape: Vec<u64>,
    filter_mask: u32,
    bytes: Arc<Vec<u8>>,
}

impl DatasetChunk {
    /// Offsets recorded for this chunk in the chunk index, one per dimension.
    pub fn offsets(&self) -> &[u64] {
        self.offsets.as_slice()
    }

    /// Chunk shape, one extent per dimension.
    pub fn shape(&self) -> &[u64] {
        self.shape.as_slice()
    }

    /// Filter mask recorded for this chunk in the chunk index.
    pub fn filter_mask(&self) -> u32 {
        self.filter_mask
    }

    /// The chunk's bytes as loaded by the iterator.
    pub fn bytes(&self) -> &[u8] {
        self.bytes.as_slice()
    }
}
/// Iterator over the chunks of a chunked dataset, created by [`Dataset::iter_chunks`].
///
/// Chunk entries are collected up front; each chunk's bytes are loaded when it is yielded.
pub struct DatasetChunkIterator {
/// Clone of the dataset the chunks belong to.
dataset: Dataset,
/// Chunk index entries to visit, in iteration order.
entries: Vec<chunk_index::ChunkEntry>,
/// Address of the chunk index the entries came from.
index_address: u64,
/// Chunk extent per dimension.
chunk_shape: Vec<u64>,
/// Raw element size in bytes.
elem_size: usize,
/// Position of the next entry to yield.
next: usize,
}
impl Iterator for DatasetChunkIterator {
    type Item = Result<DatasetChunk>;

    /// Loads and yields the next chunk, or `None` once every entry has been visited.
    ///
    /// The position advances even when loading fails, so an error does not stall iteration.
    fn next(&mut self) -> Option<Self::Item> {
        let position = self.next;
        let entry = self.entries.get(position)?;
        self.next = position + 1;
        let loaded = self.dataset.load_exact_chunk_data(
            entry,
            self.index_address,
            &self.chunk_shape,
            self.elem_size,
        );
        let item = match loaded {
            Ok(bytes) => Ok(DatasetChunk {
                offsets: entry.offsets.clone(),
                shape: self.chunk_shape.clone(),
                filter_mask: entry.filter_mask,
                bytes,
            }),
            Err(err) => Err(err),
        };
        Some(item)
    }
}
/// Context-independent snapshot of a [`Dataset`]: every field except the file context, the
/// chunk cache and the filter registry, which `Dataset::from_template` takes from the context
/// it is given. The `Arc`-held caches are shared with datasets built from the template.
pub(crate) struct DatasetTemplate {
name: String,
data_address: u64,
dataspace: DataspaceMessage,
datatype: Datatype,
layout: DataLayout,
fill_value: Option<FillValueMessage>,
filters: Option<FilterPipelineMessage>,
external_files: Option<ExternalFilesMessage>,
attributes: Vec<AttributeMessage>,
chunk_entry_cache: Arc<Mutex<LruCache<ChunkEntryCacheKey, Arc<Vec<chunk_index::ChunkEntry>>>>>,
full_chunk_entries: Arc<OnceLock<Arc<Vec<chunk_index::ChunkEntry>>>>,
full_dataset_bytes: Arc<OnceLock<Arc<Vec<u8>>>>,
external_slots: Arc<OnceLock<Arc<Vec<ResolvedExternalRawSlot>>>>,
}
impl Dataset {
pub(crate) fn from_template(context: Arc<FileContext>, template: Arc<DatasetTemplate>) -> Self {
Dataset {
chunk_cache: context.chunk_cache.clone(),
filter_registry: context.filter_registry.clone(),
context,
name: template.name.clone(),
data_address: template.data_address,
dataspace: template.dataspace.clone(),
datatype: template.datatype.clone(),
layout: template.layout.clone(),
fill_value: template.fill_value.clone(),
filters: template.filters.clone(),
external_files: template.external_files.clone(),
attributes: template.attributes.clone(),
chunk_entry_cache: template.chunk_entry_cache.clone(),
full_chunk_entries: template.full_chunk_entries.clone(),
full_dataset_bytes: template.full_dataset_bytes.clone(),
external_slots: template.external_slots.clone(),
}
}
pub(crate) fn template(&self) -> Arc<DatasetTemplate> {
Arc::new(DatasetTemplate {
name: self.name.clone(),
data_address: self.data_address,
dataspace: self.dataspace.clone(),
datatype: self.datatype.clone(),
layout: self.layout.clone(),
fill_value: self.fill_value.clone(),
filters: self.filters.clone(),
external_files: self.external_files.clone(),
attributes: self.attributes.clone(),
chunk_entry_cache: self.chunk_entry_cache.clone(),
full_chunk_entries: self.full_chunk_entries.clone(),
full_dataset_bytes: self.full_dataset_bytes.clone(),
external_slots: self.external_slots.clone(),
})
}
pub(crate) fn from_parsed_header(
context: DatasetParseContext,
address: u64,
name: String,
header: &ObjectHeader,
) -> Result<Self> {
let mut dataspace: Option<DataspaceMessage> = None;
let mut datatype: Option<Datatype> = None;
let mut layout: Option<DataLayout> = None;
let mut fill_value: Option<FillValueMessage> = None;
let mut filter_pipeline: Option<FilterPipelineMessage> = None;
let mut external_files: Option<ExternalFilesMessage> = None;
let attributes = collect_attribute_messages_storage(
header,
context.context.storage.as_ref(),
context.context.superblock.offset_size,
context.context.superblock.length_size,
Some(context.context.filter_registry.as_ref()),
)?;
for msg in &header.messages {
match msg {
HdfMessage::Dataspace(ds) => dataspace = Some(ds.clone()),
HdfMessage::Datatype(dt) => datatype = Some(dt.datatype.clone()),
HdfMessage::DataLayout(dl) => layout = Some(dl.layout.clone()),
HdfMessage::FillValue(fv) => fill_value = Some(fv.clone()),
HdfMessage::FilterPipeline(fp) => filter_pipeline = Some(fp.clone()),
HdfMessage::ExternalFiles(ef) => external_files = Some(ef.clone()),
_ => {}
}
}
let dataspace =
dataspace.ok_or_else(|| Error::InvalidData("dataset missing dataspace".into()))?;
let dt = datatype.ok_or_else(|| Error::InvalidData("dataset missing datatype".into()))?;
let layout =
layout.ok_or_else(|| Error::InvalidData("dataset missing data layout".into()))?;
let layout = normalize_layout(layout, &dataspace);
let attr_fill_value = attributes
.iter()
.find(|attr| attr.name == "_FillValue" && attr.dataspace.num_elements() == 1)
.map(|attr| FillValueMessage {
defined: !attr.raw_data.is_empty(),
fill_time: FillTime::IfSet,
value: Some(attr.raw_data.clone()),
});
let fill_value = match fill_value {
Some(existing) if existing.value.is_some() => Some(existing),
_ => attr_fill_value,
};
Ok(Dataset {
context: context.context.clone(),
name,
data_address: address,
dataspace,
datatype: dt,
layout,
fill_value,
filters: filter_pipeline,
external_files,
attributes,
chunk_cache: context.context.chunk_cache.clone(),
chunk_entry_cache: Arc::new(Mutex::new(LruCache::new(NonZeroUsize::new(32).unwrap()))),
full_chunk_entries: Arc::new(OnceLock::new()),
full_dataset_bytes: Arc::new(OnceLock::new()),
external_slots: Arc::new(OnceLock::new()),
filter_registry: context.context.filter_registry.clone(),
})
}
/// Returns the dataset's name.
pub fn name(&self) -> &str {
    self.name.as_str()
}
/// Returns the address stored in `data_address`, i.e. the address the dataset was built with
/// in `from_parsed_header`.
pub fn address(&self) -> u64 {
self.data_address
}
/// Returns the dataset's current dimensions as recorded in its dataspace.
pub fn shape(&self) -> &[u64] {
    &self.dataspace.dims[..]
}
/// Returns the dataset's element datatype.
pub fn dtype(&self) -> &Datatype {
&self.datatype
}
/// Returns the dataset's rank: the number of dimensions in its dataspace.
pub fn ndim(&self) -> usize {
self.dataspace.dims.len()
}
/// Returns the superblock's `offset_size`, used when reading file offsets.
fn offset_size(&self) -> u8 {
self.context.superblock.offset_size
}
/// Returns the superblock's `length_size`, used when reading lengths.
fn length_size(&self) -> u8 {
self.context.superblock.length_size
}
/// Returns the dataspace's maximum dimensions, or `None` when the dataspace records none.
pub fn max_dims(&self) -> Option<&[u64]> {
    match &self.dataspace.max_dims {
        Some(limits) => Some(&limits[..]),
        None => None,
    }
}
/// Returns a copy of the chunk dimensions for a chunked dataset, or `None` for any other
/// layout.
pub fn chunks(&self) -> Option<Vec<u32>> {
    if let DataLayout::Chunked { dims, .. } = &self.layout {
        Some(dims.clone())
    } else {
        None
    }
}
/// Creates an iterator over the dataset's stored chunks.
///
/// When the chunk index address is undefined the iterator is empty. Otherwise the chunk shape
/// is validated against the dataset shape and the chunk entries are collected up front
/// (without chunk bounds); chunk data is loaded lazily as the iterator advances.
///
/// # Errors
/// `Error::InvalidData` when the dataset is not chunked or the chunk shape is invalid, plus
/// any error from collecting the chunk entries.
pub fn iter_chunks(&self) -> Result<DatasetChunkIterator> {
    let (index_address, dims, chunk_indexing) = match &self.layout {
        DataLayout::Chunked {
            address,
            dims,
            chunk_indexing,
            ..
        } => (*address, dims, chunk_indexing),
        _ => {
            return Err(Error::InvalidData(format!(
                "dataset '{}' is not chunked",
                self.name
            )))
        }
    };
    let chunk_shape: Vec<u64> = dims.iter().map(|&d| u64::from(d)).collect();
    let elem_size = self.raw_element_size();

    let entries = if Cursor::is_undefined_offset(index_address, self.offset_size()) {
        // No chunk index has been allocated, so there is nothing to iterate.
        Vec::new()
    } else {
        let shape = &self.dataspace.dims;
        validate_chunk_shape(shape, &chunk_shape)?;
        self.collect_chunk_entries(
            index_address,
            dims,
            chunk_indexing.as_ref(),
            ChunkEntrySelection {
                shape,
                ndim: self.ndim(),
                elem_size,
                chunk_bounds: None,
            },
        )?
    };

    Ok(DatasetChunkIterator {
        dataset: self.clone(),
        entries,
        index_address,
        chunk_shape,
        elem_size,
        next: 0,
    })
}
/// Returns the statistics reported by the chunk cache this dataset uses (the cache handle is
/// taken from the file context when the dataset is built).
pub fn chunk_cache_stats(&self) -> ChunkCacheStats {
self.chunk_cache.stats()
}
/// Returns the dataset's fill-value message, if it has one.
pub fn fill_value(&self) -> Option<&FillValueMessage> {
    match &self.fill_value {
        Some(message) => Some(message),
        None => None,
    }
}
/// Returns all attributes of the dataset, converting each stored attribute message with
/// `attribute_from_message_storage`.
pub fn attributes(&self) -> Vec<Attribute> {
    let mut converted = Vec::with_capacity(self.attributes.len());
    for message in &self.attributes {
        converted.push(attribute_from_message_storage(
            message,
            self.context.as_ref(),
        ));
    }
    converted
}
pub fn attribute(&self, name: &str) -> Result<Attribute> {
self.attributes
.iter()
.find(|a| a.name == name)
.map(|a| attribute_from_message_storage(a, self.context.as_ref()))
.ok_or_else(|| Error::AttributeNotFound(name.to_string()))
}
pub fn read_string(&self) -> Result<String> {
let mut strings = self.read_strings()?;
match strings.len() {
1 => Ok(strings.swap_remove(0)),
0 => Err(Error::InvalidData(format!(
"dataset '{}' contains no string elements",
self.name
))),
count => Err(Error::InvalidData(format!(
"dataset '{}' contains {count} string elements; use read_strings()",
self.name
))),
}
}
/// Reads every element of a string dataset.
///
/// Three datatypes are supported:
/// - fixed-length strings, decoded from consecutive `len`-byte slots of the raw data;
/// - variable-length strings, resolved one reference at a time with
///   `read_one_vlen_string_storage`;
/// - variable-length sequences of kind `String` over a 1-byte fixed-point base, resolved with
///   `resolve_vlen_bytes_storage` and then decoded.
///
/// # Errors
/// `Error::TypeMismatch` for any other datatype; `Error::InvalidData` when the raw data is
/// shorter than the element count requires; plus read and decode errors.
pub fn read_strings(&self) -> Result<Vec<String>> {
    match &self.datatype {
        Datatype::String {
            size: StringSize::Fixed(len),
            encoding,
            padding,
        } => {
            let raw = self.read_raw_bytes()?;
            let width = *len as usize;
            let count = checked_usize(self.num_elements(), "dataset string element count")?;
            let needed = checked_mul_usize(count, width, "dataset string byte size")?;
            if raw.len() < needed {
                return Err(Error::InvalidData(format!(
                    "dataset '{}' string data too short: need {} bytes, have {}",
                    self.name,
                    needed,
                    raw.len()
                )));
            }
            let mut strings = Vec::with_capacity(count);
            let mut start = 0usize;
            for _ in 0..count {
                let end = start + width;
                strings.push(decode_string(&raw[start..end], *padding, *encoding)?);
                start = end;
            }
            Ok(strings)
        }
        Datatype::String {
            size: StringSize::Variable,
            encoding,
            padding,
        } => {
            let raw = self.read_raw_bytes()?;
            let count = checked_usize(self.num_elements(), "dataset string element count")?;
            let ref_size = self.vlen_reference_size();
            let needed =
                checked_mul_usize(count, ref_size, "dataset string reference byte size")?;
            if raw.len() < needed {
                return Err(Error::InvalidData(format!(
                    "dataset '{}' vlen string data too short: need {} bytes, have {}",
                    self.name,
                    needed,
                    raw.len()
                )));
            }
            let mut strings = Vec::with_capacity(count);
            for offset in (0..count).map(|i| i * ref_size) {
                let text = read_one_vlen_string_storage(
                    &raw,
                    offset,
                    self.context.storage.as_ref(),
                    self.offset_size(),
                    self.length_size(),
                    *padding,
                    *encoding,
                )?;
                strings.push(text);
            }
            Ok(strings)
        }
        Datatype::VarLen {
            base,
            kind: VarLenKind::String,
            encoding,
            padding,
        } => {
            // Only byte-sized fixed-point bases can be decoded as text.
            let Datatype::FixedPoint { size: 1, .. } = base.as_ref() else {
                return Err(Error::TypeMismatch {
                    expected: "String dataset".into(),
                    actual: format!("{:?}", self.datatype),
                });
            };
            let raw = self.read_raw_bytes()?;
            let count = checked_usize(self.num_elements(), "dataset string element count")?;
            let ref_size = self.vlen_reference_size();
            let needed =
                checked_mul_usize(count, ref_size, "dataset string reference byte size")?;
            if raw.len() < needed {
                return Err(Error::InvalidData(format!(
                    "dataset '{}' vlen byte string data too short: need {} bytes, have {}",
                    self.name,
                    needed,
                    raw.len()
                )));
            }
            let mut strings = Vec::with_capacity(count);
            // `ref_size` is at least 8, so `chunks_exact` never sees a zero chunk size.
            for ref_bytes in raw[..needed].chunks_exact(ref_size) {
                // NOTE(review): an unresolved reference silently becomes an empty string here,
                // unlike the variable-length string branch, which propagates errors — confirm
                // this is intended.
                let value = resolve_vlen_bytes_storage(
                    ref_bytes,
                    self.context.storage.as_ref(),
                    self.offset_size(),
                    self.length_size(),
                )
                .unwrap_or_default();
                strings.push(decode_string(&value, *padding, *encoding)?);
            }
            Ok(strings)
        }
        _ => Err(Error::TypeMismatch {
            expected: "String dataset".into(),
            actual: format!("{:?}", self.datatype),
        }),
    }
}
/// Returns the number of elements in the dataset.
///
/// With no dimensions this is 1 for a scalar dataspace and 0 for null or simple dataspaces.
/// Otherwise it is the product of the dimensions.
///
/// The dimensions come from the file, so their product can exceed `u64`. The multiplication
/// saturates at `u64::MAX` instead of overflowing (which would panic in debug builds and wrap
/// to a small, wrong count in release builds); a zero extent still yields 0. Callers that size
/// buffers from this value use checked conversions and reject the saturated count.
pub fn num_elements(&self) -> u64 {
    let dims = &self.dataspace.dims;
    if dims.is_empty() {
        return match self.dataspace.dataspace_type {
            DataspaceType::Scalar => 1,
            DataspaceType::Null => 0,
            DataspaceType::Simple => 0,
        };
    }
    dims.iter()
        .fold(1u64, |total, &extent| total.saturating_mul(extent))
}
/// Reads the whole dataset into an `ArrayD<T>`, dispatching on the storage layout.
///
/// # Errors
/// Any read or decode error, wrapped with the dataset name as context.
pub fn read_array<T: H5Type>(&self) -> Result<ArrayD<T>> {
    let decoded = match &self.layout {
        DataLayout::Compact { data } => self.read_compact::<T>(data),
        DataLayout::Contiguous { address, size } => self.read_contiguous::<T>(*address, *size),
        DataLayout::Chunked {
            address,
            dims,
            element_size,
            chunk_indexing,
        } => self.read_chunked::<T>(*address, dims, *element_size, chunk_indexing.as_ref()),
    };
    match decoded {
        Ok(array) => Ok(array),
        Err(e) => Err(e.with_context(&self.name)),
    }
}
pub fn read_into<T: H5Type>(&self, dst: &mut [T]) -> Result<()> {
let result = (|| {
let element_count = checked_usize(self.num_elements(), "dataset element count")?;
if dst.len() != element_count {
return Err(Error::InvalidData(format!(
"destination has {} elements, dataset requires {}",
dst.len(),
element_count
)));
}
let elem_size = self.raw_element_size();
if T::native_copy_compatible(&self.datatype) && std::mem::size_of::<T>() == elem_size {
let dst_bytes = unsafe {
std::slice::from_raw_parts_mut(
dst.as_mut_ptr() as *mut u8,
checked_mul_usize(dst.len(), elem_size, "destination byte length")?,
)
};
return self.read_raw_bytes_into_inner(dst_bytes);
}
let array = self.read_array::<T>()?;
let values = array.as_slice_memory_order().ok_or_else(|| {
Error::InvalidData("decoded array is not contiguous in memory order".into())
})?;
dst.clone_from_slice(values);
Ok(())
})();
result.map_err(|e| e.with_context(&self.name))
}
/// Reads the whole dataset, using the parallel chunk reader for chunked layouts and falling
/// back to `read_array` for every other layout.
#[cfg(feature = "rayon")]
pub fn read_array_parallel<T: H5Type>(&self) -> Result<ArrayD<T>> {
    if let DataLayout::Chunked {
        address,
        dims,
        element_size,
        chunk_indexing,
    } = &self.layout
    {
        return self.read_chunked_parallel::<T>(
            *address,
            dims,
            *element_size,
            chunk_indexing.as_ref(),
        );
    }
    self.read_array::<T>()
}
/// Reads the whole dataset, running the parallel chunk reader inside `pool` for chunked
/// layouts and falling back to `read_array` (outside the pool) for every other layout.
#[cfg(feature = "rayon")]
pub fn read_array_in_pool<T: H5Type>(&self, pool: &rayon::ThreadPool) -> Result<ArrayD<T>> {
    if let DataLayout::Chunked {
        address,
        dims,
        element_size,
        chunk_indexing,
    } = &self.layout
    {
        let read_chunks = || {
            self.read_chunked_parallel::<T>(
                *address,
                dims,
                *element_size,
                chunk_indexing.as_ref(),
            )
        };
        return pool.install(read_chunks);
    }
    self.read_array::<T>()
}
/// Reads a selection of the dataset, using the parallel chunked slice reader for chunked
/// layouts and falling back to `read_slice` for every other layout.
///
/// The selection is validated against the dataset shape before the layout is inspected.
#[cfg(feature = "rayon")]
pub fn read_slice_parallel<T: H5Type>(&self, selection: &SliceInfo) -> Result<ArrayD<T>> {
    let resolved = normalize_selection(selection, &self.dataspace.dims)?;
    let DataLayout::Chunked {
        address,
        dims,
        element_size,
        chunk_indexing,
    } = &self.layout
    else {
        return self.read_slice::<T>(selection);
    };
    self.read_chunked_slice_parallel::<T>(
        *address,
        dims,
        *element_size,
        chunk_indexing.as_ref(),
        selection,
        &resolved,
    )
}
/// Reads a selection of the dataset into an `ArrayD<T>`, dispatching on the storage layout.
///
/// The selection is validated against the dataset shape first; invalid selections fail before
/// any data is read.
pub fn read_slice<T: H5Type>(&self, selection: &SliceInfo) -> Result<ArrayD<T>> {
    let resolved = normalize_selection(selection, &self.dataspace.dims)?;
    match &self.layout {
        DataLayout::Compact { data } => self.read_compact_slice::<T>(data, selection),
        DataLayout::Contiguous { address, size } => {
            self.read_contiguous_slice::<T>(*address, *size, &resolved)
        }
        DataLayout::Chunked {
            address,
            dims,
            element_size,
            chunk_indexing,
        } => {
            let indexing = chunk_indexing.as_ref();
            self.read_chunked_slice::<T>(
                *address,
                dims,
                *element_size,
                indexing,
                selection,
                &resolved,
            )
        }
    }
}
/// Decodes a compact-layout dataset from the bytes stored in its layout message, after
/// validating the stored length.
fn read_compact<T: H5Type>(&self, data: &[u8]) -> Result<ArrayD<T>> {
    let stored_len = data.len();
    self.validate_allocated_raw_data_len("compact", stored_len)?;
    let decoded = self.decode_raw_data::<T>(data);
    decoded
}
/// Reads the dataset's raw bytes (`raw_byte_len()` of them) into a new vector.
///
/// # Errors
/// Any sizing or read error, wrapped with the dataset name as context.
pub fn read_raw_bytes(&self) -> Result<Vec<u8>> {
    let read = || -> Result<Vec<u8>> {
        let mut output = vec![0u8; self.raw_byte_len()?];
        self.read_raw_bytes_into_inner(&mut output)?;
        Ok(output)
    };
    read().map_err(|e| e.with_context(&self.name))
}
/// Returns the dataset's raw size in bytes: element count times raw element size.
///
/// # Errors
/// `Error::InvalidData` when the element count or the byte size does not fit in `usize`.
pub fn raw_byte_len(&self) -> Result<usize> {
    let elem_size = self.raw_element_size();
    checked_usize(self.num_elements(), "dataset element count").and_then(|total_elements| {
        checked_mul_usize(total_elements, elem_size, "dataset size in bytes")
    })
}
pub fn read_raw_bytes_into(&self, dst: &mut [u8]) -> Result<()> {
let result: Result<()> = (|| {
let total_bytes = self.raw_byte_len()?;
if dst.len() != total_bytes {
return Err(Error::InvalidData(format!(
"destination has {} bytes, dataset requires {}",
dst.len(),
total_bytes
)));
}
self.read_raw_bytes_into_inner(dst)
})();
result.map_err(|e| e.with_context(&self.name))
}
/// Reads the dataset's raw bytes into a new vector and converts them with
/// `convert_to_native_endian`.
///
/// # Errors
/// Any sizing, read or conversion error, wrapped with the dataset name as context.
pub fn read_native_bytes(&self) -> Result<Vec<u8>> {
    let read = || -> Result<Vec<u8>> {
        let mut output = vec![0u8; self.raw_byte_len()?];
        self.read_raw_bytes_into_inner(&mut output)?;
        self.convert_to_native_endian(&mut output)?;
        Ok(output)
    };
    read().map_err(|e| e.with_context(&self.name))
}
pub fn read_native_bytes_into(&self, dst: &mut [u8]) -> Result<()> {
let result: Result<()> = (|| {
let total_bytes = self.raw_byte_len()?;
if dst.len() != total_bytes {
return Err(Error::InvalidData(format!(
"destination has {} bytes, dataset requires {}",
dst.len(),
total_bytes
)));
}
self.read_raw_bytes_into_inner(dst)?;
self.convert_to_native_endian(dst)
})();
result.map_err(|e| e.with_context(&self.name))
}
/// Fills `dst` with the dataset's raw bytes, dispatching on the storage layout.
///
/// Callers size `dst` from `raw_byte_len()`. For compact layouts the stored bytes are
/// validated and then copied; `copy_from_slice` panics if `dst` and the stored data differ in
/// length.
fn read_raw_bytes_into_inner(&self, dst: &mut [u8]) -> Result<()> {
    match &self.layout {
        DataLayout::Chunked {
            address,
            dims,
            chunk_indexing,
            ..
        } => self.read_chunked_bytes_into(*address, dims, chunk_indexing.as_ref(), dst),
        DataLayout::Contiguous { address, size } => {
            self.read_contiguous_bytes_into(*address, *size, dst)
        }
        DataLayout::Compact { data } => {
            let stored_len = data.len();
            self.validate_allocated_raw_data_len("compact", stored_len)?;
            dst.copy_from_slice(data);
            Ok(())
        }
    }
}
/// Returns the size in bytes of one variable-length element reference:
/// `4 + offset_size + 4`.
///
/// This matches what `resolve_vlen_reference_bytes` reads from a reference: a `u32`
/// sequence length, a heap address of `offset_size` bytes, and a `u32` object index.
pub fn vlen_reference_size(&self) -> usize {
4 + self.offset_size() as usize + 4
}
/// Returns the size in bytes of one stored element, as computed by
/// `raw_element_size_for_datatype` from the datatype and the vlen reference size.
pub fn raw_element_size(&self) -> usize {
raw_element_size_for_datatype(&self.datatype, self.vlen_reference_size())
}
/// Resolves one variable-length reference to the bytes of the sequence it points at.
///
/// `reference` holds a `u32` sequence length, a heap address of `offset_size` bytes and a
/// `u32` heap object index. A reference with an undefined heap address or an object index of 0
/// is a null reference and yields an empty vector. Otherwise the global heap collection at the
/// address is parsed and the first `seq_len * base_element_size` bytes of the object are
/// returned.
///
/// The stored index is 32 bits wide but heap objects are looked up by a 16-bit index. An index
/// above `u16::MAX` is rejected as invalid data instead of being truncated, which would
/// otherwise resolve to an unrelated object or be mistaken for a null reference.
///
/// # Errors
/// `Error::InvalidData` when the reference is too short, the object index is out of range, the
/// heap object is missing or too short, or the byte size overflows; plus any heap parse error.
pub fn resolve_vlen_reference_bytes(
    &self,
    reference: &[u8],
    base_element_size: usize,
) -> Result<Vec<u8>> {
    let reference_size = self.vlen_reference_size();
    if reference.len() < reference_size {
        return Err(Error::InvalidData(format!(
            "dataset '{}' vlen reference too short: need {} bytes, have {}",
            self.name,
            reference_size,
            reference.len()
        )));
    }
    let offset_size = self.offset_size();
    let mut cursor = Cursor::new(reference);
    let seq_len = cursor.read_u32_le()? as usize;
    let heap_addr = cursor.read_offset(offset_size)?;
    let raw_index = cursor.read_u32_le()?;
    if Cursor::is_undefined_offset(heap_addr, offset_size) || raw_index == 0 {
        return Ok(Vec::new());
    }
    let obj_index = u16::try_from(raw_index).map_err(|_| {
        Error::InvalidData(format!(
            "dataset '{}' vlen reference has out-of-range heap object index {}",
            self.name, raw_index
        ))
    })?;
    let expected_bytes =
        checked_mul_usize(seq_len, base_element_size, "vlen sequence byte size")?;
    let collection = crate::global_heap::GlobalHeapCollection::parse_at_storage(
        self.context.storage.as_ref(),
        heap_addr,
        offset_size,
        self.length_size(),
    )?;
    let object = collection.get_object(obj_index).ok_or_else(|| {
        Error::InvalidData(format!(
            "dataset '{}' references missing vlen heap object {}",
            self.name, obj_index
        ))
    })?;
    if object.data.len() < expected_bytes {
        return Err(Error::InvalidData(format!(
            "dataset '{}' vlen heap object too short: need {} bytes, have {}",
            self.name,
            expected_bytes,
            object.data.len()
        )));
    }
    Ok(object.data[..expected_bytes].to_vec())
}
/// Reads a whole contiguous-layout dataset and decodes it as `T`.
///
/// When external raw files are configured, the layout `address`/`size` are not
/// consulted: the full logical byte range is read through
/// `read_external_range`. Otherwise an undefined address or a zero `size`
/// yields the fill array, and any other case reads `size` bytes at `address`.
fn read_contiguous<T: H5Type>(&self, address: u64, size: u64) -> Result<ArrayD<T>> {
    if self.external_files.is_some() {
        let bytes_per_element = self.raw_element_size();
        let element_count = checked_usize(self.num_elements(), "dataset element count")?;
        let byte_count =
            checked_mul_usize(element_count, bytes_per_element, "dataset size in bytes")?;
        let external_bytes = self.read_external_range(0, byte_count)?;
        return self.decode_raw_data::<T>(&external_bytes);
    }

    let unallocated = Cursor::is_undefined_offset(address, self.offset_size()) || size == 0;
    if unallocated {
        return self.make_fill_array::<T>();
    }

    let stored_len = checked_usize(size, "contiguous dataset size")?;
    self.validate_allocated_raw_data_len("contiguous", stored_len)?;
    let stored_bytes = self.context.read_range(address, stored_len)?;
    self.decode_raw_data::<T>(stored_bytes.as_ref())
}
fn read_contiguous_bytes_into(&self, address: u64, size: u64, dst: &mut [u8]) -> Result<()> {
if self.external_files.is_some() {
return self.read_external_range_into(0, dst);
}
if Cursor::is_undefined_offset(address, self.offset_size()) || size == 0 {
self.fill_output_buffer(dst);
return Ok(());
}
let sz = checked_usize(size, "contiguous dataset size")?;
self.validate_allocated_raw_data_len("contiguous", sz)?;
if dst.is_empty() {
return Ok(());
}
let raw = self.context.read_range(address, sz)?;
dst.copy_from_slice(raw.as_ref());
Ok(())
}
/// Reads `len` bytes starting `logical_offset` bytes into the dataset's raw data.
///
/// With external raw files the request is served by `read_external_range`.
/// Otherwise the logical offset is added to `address` (with overflow checks)
/// and the range is read from the file context.
fn read_contiguous_logical_range(
    &self,
    address: u64,
    logical_offset: usize,
    len: usize,
) -> Result<Vec<u8>> {
    if self.external_files.is_some() {
        return self.read_external_range(logical_offset, len);
    }

    let offset_in_dataset = u64::try_from(logical_offset).map_err(|_| {
        Error::InvalidData("contiguous logical offset exceeds u64 capacity".to_string())
    })?;
    let absolute_offset =
        checked_add_u64(address, offset_in_dataset, "contiguous read file offset")?;
    let bytes = self.context.read_range(absolute_offset, len)?;
    Ok(bytes.to_vec())
}
/// Allocating wrapper around `read_external_range_into`.
///
/// Creates a zeroed buffer of `len` bytes, lets `read_external_range_into`
/// populate it for the range starting at `logical_offset`, and returns it.
fn read_external_range(&self, logical_offset: usize, len: usize) -> Result<Vec<u8>> {
    let mut buffer = vec![0u8; len];
    self.read_external_range_into(logical_offset, buffer.as_mut_slice())
        .map(|()| buffer)
}
/// Fills `dst` with the external raw data for the logical byte window
/// `[logical_offset, logical_offset + dst.len())`.
///
/// `dst` is first passed to `fill_output_buffer`; then, for every resolved
/// external slot that overlaps the window, the overlapping bytes are read from
/// that slot's storage and copied over the matching part of `dst`. Parts of the
/// window not covered by any slot keep whatever `fill_output_buffer` wrote.
fn read_external_range_into(&self, logical_offset: usize, dst: &mut [u8]) -> Result<()> {
    self.fill_output_buffer(dst);
    if dst.is_empty() {
        return Ok(());
    }

    let window_start = u64::try_from(logical_offset).map_err(|_| {
        Error::InvalidData("external dataset offset exceeds u64 capacity".to_string())
    })?;
    let window_len = u64::try_from(dst.len()).map_err(|_| {
        Error::InvalidData("external dataset length exceeds u64 capacity".to_string())
    })?;
    let window_end = window_start
        .checked_add(window_len)
        .ok_or_else(|| Error::InvalidData("external dataset range overflows".into()))?;

    let slots = self.external_raw_slots()?;
    for slot in slots.iter() {
        let slot_start = slot.logical_offset;
        let slot_end = slot_start.saturating_add(slot.size);

        // Intersection of the requested window with this slot's logical extent.
        let from = window_start.max(slot_start);
        let until = window_end.min(slot_end);
        if from >= until {
            continue;
        }

        let offset_within_slot = from - slot_start;
        let file_position = slot
            .file_offset
            .checked_add(offset_within_slot)
            .ok_or_else(|| Error::InvalidData("external file read offset overflows".into()))?;
        let span = checked_usize(until - from, "external read length")?;
        let dst_from = checked_usize(from - window_start, "external read dst")?;
        let dst_until = checked_add_usize(dst_from, span, "external read dst end")?;

        let piece = slot.storage.read_range(file_position, span)?;
        dst[dst_from..dst_until].copy_from_slice(piece.as_ref());
    }
    Ok(())
}
/// Returns the resolved external raw-data slots, loading them on first use.
///
/// The slot list is memoized in `self.external_slots`. A failed load is not
/// cached, so a later call will try again.
fn external_raw_slots(&self) -> Result<Arc<Vec<ResolvedExternalRawSlot>>> {
    match self.external_slots.get() {
        Some(cached) => Ok(Arc::clone(cached)),
        None => {
            let resolved = Arc::new(self.load_external_raw_slots()?);
            // `set` fails only if the cell was filled in the meantime; either
            // way the cell holds a value afterwards, and that value is returned.
            let _ = self.external_slots.set(resolved);
            let published = self
                .external_slots
                .get()
                .expect("external slot cache must exist after initialization");
            Ok(Arc::clone(published))
        }
    }
}
/// Builds the list of external raw-data slots described by the external-files message.
///
/// Returns an empty list when the dataset has no external-files message.
/// Otherwise the local heap at `heap_address` is parsed, each slot's file name
/// is looked up in it and resolved to a storage handle, and slots are assigned
/// consecutive logical offsets starting at zero. A slot whose size is the
/// undefined marker absorbs the rest of the logical address space and ends the
/// list.
///
/// # Errors
/// Propagates heap and name lookup errors, returns `Error::Other` when a file
/// cannot be resolved, and `Error::InvalidData` when the running logical offset
/// overflows.
fn load_external_raw_slots(&self) -> Result<Vec<ResolvedExternalRawSlot>> {
    let external_files = match self.external_files.as_ref() {
        Some(message) => message,
        None => return Ok(Vec::new()),
    };
    let heap = LocalHeap::parse_at_storage(
        self.context.storage.as_ref(),
        external_files.heap_address,
        self.offset_size(),
        self.length_size(),
    )?;

    let mut resolved = Vec::with_capacity(external_files.slots.len());
    let mut next_logical_offset = 0u64;
    for slot in &external_files.slots {
        let filename =
            heap.get_string_storage(slot.name_offset, self.context.storage.as_ref())?;
        let storage = self
            .context
            .resolve_external_file(&filename)?
            .ok_or_else(|| {
                Error::Other(format!(
                    "external raw data file '{filename}' could not be resolved"
                ))
            })?;

        // An undefined size marks a slot with no fixed length.
        let unbounded = Cursor::is_undefined_offset(slot.size, self.length_size());
        let size = if unbounded {
            u64::MAX.saturating_sub(next_logical_offset)
        } else {
            slot.size
        };
        resolved.push(ResolvedExternalRawSlot {
            logical_offset: next_logical_offset,
            storage,
            file_offset: slot.offset,
            size,
        });
        if unbounded {
            // Nothing can follow a slot that extends to the end of the address space.
            break;
        }
        next_logical_offset = next_logical_offset.checked_add(slot.size).ok_or_else(|| {
            Error::InvalidData("external raw data logical offset overflows".into())
        })?;
    }
    Ok(resolved)
}
/// Reads every element of a chunked dataset and decodes it as `T`.
///
/// Flow, as visible in this function:
/// 1. An undefined `index_address` returns the fill array.
/// 2. If the dataset is at most `HOT_FULL_DATASET_CACHE_MAX_BYTES` and
///    `full_dataset_bytes` is already populated, the cached bytes are decoded.
/// 3. Otherwise all chunk entries are collected and checked with
///    `validate_chunk_grid_coverage`.
/// 4. When that check reports full coverage, chunks are copied into an
///    uninitialized buffer (typed `T` buffer if `T` is native-copy compatible
///    and has the stored element size, raw byte buffer otherwise) and small
///    results are stored in `full_dataset_bytes`.
/// 5. Without full coverage, chunks are copied over a buffer obtained from
///    `make_output_buffer` and the result is decoded; nothing is cached.
///
/// `_element_size` is unused; the element size comes from `raw_element_size()`.
fn read_chunked<T: H5Type>(
&self,
index_address: u64,
chunk_dims: &[u32],
_element_size: u32,
chunk_indexing: Option<&ChunkIndexing>,
) -> Result<ArrayD<T>> {
// No chunk index has been allocated: the result is the fill array.
if Cursor::is_undefined_offset(index_address, self.offset_size()) {
return self.make_fill_array::<T>();
}
let ndim = self.ndim();
let shape = &self.dataspace.dims;
let elem_size = self.raw_element_size();
let total_elements = checked_usize(self.num_elements(), "dataset element count")?;
let total_bytes = checked_mul_usize(total_elements, elem_size, "dataset size in bytes")?;
// Small datasets may already have their assembled raw bytes cached.
if total_bytes <= HOT_FULL_DATASET_CACHE_MAX_BYTES {
if let Some(cached_bytes) = self.full_dataset_bytes.get() {
return self.decode_raw_data::<T>(cached_bytes);
}
}
let chunk_shape: Vec<u64> = chunk_dims.iter().map(|&d| d as u64).collect();
validate_chunk_shape(shape, &chunk_shape)?;
let dataset_strides = row_major_strides(shape, "dataset stride")?;
let chunk_strides = row_major_strides(&chunk_shape, "chunk stride")?;
// `chunk_bounds: None` requests the entries for the whole dataset.
let mut entries = self.collect_chunk_entries(
index_address,
chunk_dims,
chunk_indexing,
ChunkEntrySelection {
shape,
ndim,
elem_size,
chunk_bounds: None,
},
)?;
// When `full_dataset_chunk_bounds` yields no bounds, an empty entry list counts as
// fully covered and any entry at all is rejected as invalid.
let full_chunk_coverage = match full_dataset_chunk_bounds(shape, &chunk_shape)? {
Some((first_chunk, last_chunk)) => validate_chunk_grid_coverage(
&mut entries,
shape,
&chunk_shape,
&first_chunk,
&last_chunk,
)?,
None if entries.is_empty() => true,
None => {
return Err(Error::InvalidData(
"chunk index contains entries for an empty dataset".into(),
))
}
};
if full_chunk_coverage {
// Typed path: write chunk bytes straight into the `T` buffer, skipping the decode step.
if T::native_copy_compatible(&self.datatype) && std::mem::size_of::<T>() == elem_size {
let mut result_values: Vec<MaybeUninit<T>> =
std::iter::repeat_with(MaybeUninit::<T>::uninit)
.take(total_elements)
.collect();
let result_ptr = result_values.as_mut_ptr() as *mut u8;
let result_len = checked_mul_usize(
result_values.len(),
std::mem::size_of::<T>(),
"typed dataset size in bytes",
)?;
for entry in &entries {
let chunk_data =
self.load_exact_chunk_data(entry, index_address, &chunk_shape, elem_size)?;
// SAFETY (as far as visible here): `result_ptr`/`result_len` describe the live
// `result_values` allocation. The callee's bounds handling is defined elsewhere
// — TODO confirm it never writes past `len`.
unsafe {
copy_chunk_to_flat_with_strides_ptr(
&chunk_data,
FlatBufferPtr {
ptr: result_ptr,
len: result_len,
},
ChunkCopyLayout {
chunk_offsets: &entry.offsets,
chunk_shape: &chunk_shape,
dataset_shape: shape,
dataset_strides: &dataset_strides,
chunk_strides: &chunk_strides,
elem_size,
},
)?;
}
}
// Snapshot the assembled bytes for later full reads of this small dataset.
if total_bytes <= HOT_FULL_DATASET_CACHE_MAX_BYTES {
let mut cached_bytes = vec![0u8; total_bytes];
// SAFETY (as far as visible here): `total_bytes` equals `result_len` because
// `size_of::<T>() == elem_size` on this path, and the two buffers are distinct
// allocations. Presumes every source byte was written above — TODO confirm that
// full coverage guarantees it.
unsafe {
std::ptr::copy_nonoverlapping(
result_ptr,
cached_bytes.as_mut_ptr(),
total_bytes,
);
}
// A failed `set` means the cache was already populated; that is ignored.
let _ = self.full_dataset_bytes.set(Arc::new(cached_bytes));
}
let mut result_shape = Vec::with_capacity(shape.len());
for &dim in shape {
result_shape.push(checked_usize(dim, "dataset dimension")?);
}
// NOTE(review): treating the buffer as initialized relies on the coverage check
// above implying that every element was written — confirm.
let result_values = assume_init_vec(result_values);
return ArrayD::from_shape_vec(IxDyn(&result_shape), result_values)
.map_err(|e| Error::InvalidData(format!("array shape error: {e}")));
}
// Untyped path: assemble raw bytes, then decode them into `T`.
let mut flat_data = vec![MaybeUninit::<u8>::uninit(); total_bytes];
let flat_ptr = flat_data.as_mut_ptr() as *mut u8;
let flat_len = flat_data.len();
for entry in &entries {
let chunk_data =
self.load_exact_chunk_data(entry, index_address, &chunk_shape, elem_size)?;
// SAFETY (as far as visible here): `flat_ptr`/`flat_len` describe the live `flat_data`
// allocation; see the note on the typed path about the callee's bounds handling.
unsafe {
copy_chunk_to_flat_with_strides_ptr(
&chunk_data,
FlatBufferPtr {
ptr: flat_ptr,
len: flat_len,
},
ChunkCopyLayout {
chunk_offsets: &entry.offsets,
chunk_shape: &chunk_shape,
dataset_shape: shape,
dataset_strides: &dataset_strides,
chunk_strides: &chunk_strides,
elem_size,
},
)?;
}
}
let flat_data = assume_init_u8_vec(flat_data);
if total_bytes <= HOT_FULL_DATASET_CACHE_MAX_BYTES {
let _ = self.full_dataset_bytes.set(Arc::new(flat_data.clone()));
}
return self.decode_raw_data::<T>(&flat_data);
}
// Partial coverage: start from `make_output_buffer` (presumably pre-filled with the fill
// value — confirm) and overwrite only the regions whose chunks exist. Not cached.
let mut flat_data = self.make_output_buffer(total_bytes);
for entry in &entries {
let chunk_data =
self.load_exact_chunk_data(entry, index_address, &chunk_shape, elem_size)?;
copy_chunk_to_flat_with_strides(
&chunk_data,
&mut flat_data,
ChunkCopyLayout {
chunk_offsets: &entry.offsets,
chunk_shape: &chunk_shape,
dataset_shape: shape,
dataset_strides: &dataset_strides,
chunk_strides: &chunk_strides,
elem_size,
},
)?;
}
self.decode_raw_data::<T>(&flat_data)
}
/// Assembles the raw bytes of a chunked dataset into `dst`.
///
/// An undefined `index_address` fills `dst` via `fill_output_buffer`. If `dst`
/// is no larger than `HOT_FULL_DATASET_CACHE_MAX_BYTES` and the cached
/// full-dataset bytes have exactly `dst.len()` bytes, they are copied out
/// directly. Otherwise `dst` is filled, every stored chunk is copied over it,
/// and, when the chunk grid is fully covered and `dst` is small enough, a copy
/// of `dst` is offered to the cache.
///
/// NOTE(review): the cache is populated from `dst` as-is; this presumes callers
/// pass a buffer of exactly the dataset's raw size — confirm at the call site.
fn read_chunked_bytes_into(
    &self,
    index_address: u64,
    chunk_dims: &[u32],
    chunk_indexing: Option<&ChunkIndexing>,
    dst: &mut [u8],
) -> Result<()> {
    if Cursor::is_undefined_offset(index_address, self.offset_size()) {
        self.fill_output_buffer(dst);
        return Ok(());
    }

    let rank = self.ndim();
    let dataset_dims = &self.dataspace.dims;
    let bytes_per_element = self.raw_element_size();
    let small_enough_to_cache = dst.len() <= HOT_FULL_DATASET_CACHE_MAX_BYTES;

    if small_enough_to_cache {
        if let Some(hot_bytes) = self.full_dataset_bytes.get() {
            if hot_bytes.len() == dst.len() {
                dst.copy_from_slice(hot_bytes.as_slice());
                return Ok(());
            }
        }
    }

    let chunk_extent: Vec<u64> = chunk_dims.iter().map(|&d| d as u64).collect();
    validate_chunk_shape(dataset_dims, &chunk_extent)?;
    let dataset_strides = row_major_strides(dataset_dims, "dataset stride")?;
    let chunk_strides = row_major_strides(&chunk_extent, "chunk stride")?;

    let mut entries = self.collect_chunk_entries(
        index_address,
        chunk_dims,
        chunk_indexing,
        ChunkEntrySelection {
            shape: dataset_dims,
            ndim: rank,
            elem_size: bytes_per_element,
            chunk_bounds: None,
        },
    )?;

    let grid_bounds = full_dataset_chunk_bounds(dataset_dims, &chunk_extent)?;
    let every_chunk_present = if let Some((first_chunk, last_chunk)) = grid_bounds {
        validate_chunk_grid_coverage(
            &mut entries,
            dataset_dims,
            &chunk_extent,
            &first_chunk,
            &last_chunk,
        )?
    } else if entries.is_empty() {
        true
    } else {
        return Err(Error::InvalidData(
            "chunk index contains entries for an empty dataset".into(),
        ));
    };

    // Fill first so regions without a stored chunk keep the fill bytes.
    self.fill_output_buffer(dst);
    for entry in &entries {
        let chunk_bytes =
            self.load_exact_chunk_data(entry, index_address, &chunk_extent, bytes_per_element)?;
        copy_chunk_to_flat_with_strides(
            &chunk_bytes,
            dst,
            ChunkCopyLayout {
                chunk_offsets: &entry.offsets,
                chunk_shape: &chunk_extent,
                dataset_shape: dataset_dims,
                dataset_strides: &dataset_strides,
                chunk_strides: &chunk_strides,
                elem_size: bytes_per_element,
            },
        )?;
    }

    if every_chunk_present && small_enough_to_cache {
        // A failed `set` means another read already populated the cache.
        let _ = self.full_dataset_bytes.set(Arc::new(dst.to_vec()));
    }
    Ok(())
}
/// Rayon-parallel counterpart of `read_chunked`: reads every element of a
/// chunked dataset and decodes it as `T`, loading and copying chunks through
/// `par_iter`.
///
/// The control flow mirrors `read_chunked`: fill array for an undefined index
/// address, cache lookup for small datasets, chunk collection plus coverage
/// validation, then one of three assembly paths (typed uninitialized buffer,
/// raw uninitialized byte buffer, or a `make_output_buffer` buffer when
/// coverage is partial). Only the fully covered paths populate
/// `full_dataset_bytes`.
///
/// `_element_size` is unused; the element size comes from `raw_element_size()`.
#[cfg(feature = "rayon")]
fn read_chunked_parallel<T: H5Type>(
&self,
index_address: u64,
chunk_dims: &[u32],
_element_size: u32,
chunk_indexing: Option<&ChunkIndexing>,
) -> Result<ArrayD<T>> {
// No chunk index has been allocated: the result is the fill array.
if Cursor::is_undefined_offset(index_address, self.offset_size()) {
return self.make_fill_array::<T>();
}
let ndim = self.ndim();
let shape = &self.dataspace.dims;
let elem_size = self.raw_element_size();
let total_elements = checked_usize(self.num_elements(), "dataset element count")?;
let total_bytes = checked_mul_usize(total_elements, elem_size, "dataset size in bytes")?;
// Small datasets may already have their assembled raw bytes cached.
if total_bytes <= HOT_FULL_DATASET_CACHE_MAX_BYTES {
if let Some(cached_bytes) = self.full_dataset_bytes.get() {
return self.decode_raw_data::<T>(cached_bytes);
}
}
let chunk_shape: Vec<u64> = chunk_dims.iter().map(|&d| d as u64).collect();
validate_chunk_shape(shape, &chunk_shape)?;
let dataset_strides = row_major_strides(shape, "dataset stride")?;
let chunk_strides = row_major_strides(&chunk_shape, "chunk stride")?;
// `chunk_bounds: None` requests the entries for the whole dataset.
let mut entries = self.collect_chunk_entries(
index_address,
chunk_dims,
chunk_indexing,
ChunkEntrySelection {
shape,
ndim,
elem_size,
chunk_bounds: None,
},
)?;
// When `full_dataset_chunk_bounds` yields no bounds, an empty entry list counts as
// fully covered and any entry at all is rejected as invalid.
let full_chunk_coverage = match full_dataset_chunk_bounds(shape, &chunk_shape)? {
Some((first_chunk, last_chunk)) => validate_chunk_grid_coverage(
&mut entries,
shape,
&chunk_shape,
&first_chunk,
&last_chunk,
)?,
None if entries.is_empty() => true,
None => {
return Err(Error::InvalidData(
"chunk index contains entries for an empty dataset".into(),
))
}
};
if full_chunk_coverage {
// Typed path: write chunk bytes straight into the `T` buffer, skipping the decode step.
if T::native_copy_compatible(&self.datatype) && std::mem::size_of::<T>() == elem_size {
let mut result_values: Vec<MaybeUninit<T>> =
std::iter::repeat_with(MaybeUninit::<T>::uninit)
.take(total_elements)
.collect();
// `FlatBufferPtr` is `Copy`, so each parallel task uses its own copy of the pointer.
let flat = FlatBufferPtr {
ptr: result_values.as_mut_ptr() as *mut u8,
len: checked_mul_usize(
result_values.len(),
std::mem::size_of::<T>(),
"typed dataset size in bytes",
)?,
};
entries
.par_iter()
.map(|entry| {
self.load_exact_chunk_data(entry, index_address, &chunk_shape, elem_size)
// SAFETY (as far as visible here): `flat` describes the live `result_values`
// allocation. Concurrent writes presume distinct chunks target disjoint
// regions — TODO confirm the coverage validation enforces that.
.and_then(|data| unsafe {
flat.copy_chunk(
&data,
ChunkCopyLayout {
chunk_offsets: &entry.offsets,
chunk_shape: &chunk_shape,
dataset_shape: shape,
dataset_strides: &dataset_strides,
chunk_strides: &chunk_strides,
elem_size,
},
)
})
})
// Collecting into `Result` surfaces an error from any task.
.collect::<std::result::Result<Vec<_>, Error>>()?;
let mut result_shape = Vec::with_capacity(shape.len());
for &dim in shape {
result_shape.push(checked_usize(dim, "dataset dimension")?);
}
// Snapshot the assembled bytes for later full reads of this small dataset.
if total_bytes <= HOT_FULL_DATASET_CACHE_MAX_BYTES {
let mut cached_bytes = vec![0u8; total_bytes];
// SAFETY (as far as visible here): `total_bytes` equals `flat.len` because
// `size_of::<T>() == elem_size` on this path, and the two buffers are distinct
// allocations. Presumes every source byte was written above — TODO confirm.
unsafe {
std::ptr::copy_nonoverlapping(
flat.ptr,
cached_bytes.as_mut_ptr(),
total_bytes,
);
}
// A failed `set` means the cache was already populated; that is ignored.
let _ = self.full_dataset_bytes.set(Arc::new(cached_bytes));
}
// NOTE(review): treating the buffer as initialized relies on the coverage check
// above implying that every element was written — confirm.
let result_values = assume_init_vec(result_values);
return ArrayD::from_shape_vec(IxDyn(&result_shape), result_values)
.map_err(|e| Error::InvalidData(format!("array shape error: {e}")));
}
// Untyped path: assemble raw bytes in parallel, then decode them into `T`.
let mut flat_data = vec![MaybeUninit::<u8>::uninit(); total_bytes];
let flat = FlatBufferPtr {
ptr: flat_data.as_mut_ptr() as *mut u8,
len: flat_data.len(),
};
entries
.par_iter()
.map(|entry| {
self.load_exact_chunk_data(entry, index_address, &chunk_shape, elem_size)
// SAFETY: same assumptions as the typed path, with `flat` describing `flat_data`.
.and_then(|data| unsafe {
flat.copy_chunk(
&data,
ChunkCopyLayout {
chunk_offsets: &entry.offsets,
chunk_shape: &chunk_shape,
dataset_shape: shape,
dataset_strides: &dataset_strides,
chunk_strides: &chunk_strides,
elem_size,
},
)
})
})
.collect::<std::result::Result<Vec<_>, Error>>()?;
let flat_data = assume_init_u8_vec(flat_data);
if total_bytes <= HOT_FULL_DATASET_CACHE_MAX_BYTES {
let _ = self.full_dataset_bytes.set(Arc::new(flat_data.clone()));
}
return self.decode_raw_data::<T>(&flat_data);
}
// Partial coverage: start from `make_output_buffer` (presumably pre-filled with the fill
// value — confirm) and overwrite only the regions whose chunks exist. Not cached.
let mut flat_data = self.make_output_buffer(total_bytes);
let flat = FlatBufferPtr {
ptr: flat_data.as_mut_ptr(),
len: flat_data.len(),
};
entries
.par_iter()
.map(|entry| {
self.load_exact_chunk_data(entry, index_address, &chunk_shape, elem_size)
// SAFETY (as far as visible here): `flat` describes the live, initialized `flat_data`
// buffer; disjointness of concurrent writes is presumed as above — TODO confirm.
.and_then(|data| unsafe {
flat.copy_chunk(
&data,
ChunkCopyLayout {
chunk_offsets: &entry.offsets,
chunk_shape: &chunk_shape,
dataset_shape: shape,
dataset_strides: &dataset_strides,
chunk_strides: &chunk_strides,
elem_size,
},
)
})
})
.collect::<std::result::Result<Vec<_>, Error>>()?;
self.decode_raw_data::<T>(&flat_data)
}
/// Returns the chunk index entries for the whole dataset or for a chunk-grid window.
///
/// With `selection.chunk_bounds == None` the full entry list is requested; it
/// is memoized in `full_chunk_entries`. With bounds, results are memoized in
/// the `chunk_entry_cache` LRU keyed by index address and the bounds. On a miss
/// the entries are gathered by the collector matching `chunk_indexing`
/// (`None` selects the version 1 B-tree collector), stored in the appropriate
/// cache, and returned.
fn collect_chunk_entries(
    &self,
    index_address: u64,
    chunk_dims: &[u32],
    chunk_indexing: Option<&ChunkIndexing>,
    selection: ChunkEntrySelection<'_>,
) -> Result<Vec<chunk_index::ChunkEntry>> {
    let bounds = selection.chunk_bounds;

    // Probe whichever cache applies; on a miss remember the LRU key (if any) for later.
    let cache_key = match bounds {
        None => {
            if let Some(cached) = self.full_chunk_entries.get() {
                return Ok(cached.to_vec());
            }
            None
        }
        Some((first_chunk, last_chunk)) => {
            let key = ChunkEntryCacheKey {
                index_address,
                first_chunk: SmallVec::from_slice(first_chunk),
                last_chunk: SmallVec::from_slice(last_chunk),
            };
            {
                let mut lru = self.chunk_entry_cache.lock();
                if let Some(cached) = lru.get(&key) {
                    return Ok(cached.to_vec());
                }
            }
            Some(key)
        }
    };

    let collected = match chunk_indexing {
        None => self.collect_btree_v1_entries(index_address, selection.ndim, chunk_dims, bounds),
        Some(ChunkIndexing::SingleChunk {
            filtered_size,
            filters,
        }) => {
            let only = chunk_index::single_chunk_entry(
                index_address,
                *filtered_size,
                *filters,
                selection.ndim,
            );
            Ok(vec![only])
        }
        Some(ChunkIndexing::BTreeV2) => chunk_index::collect_v2_chunk_entries_storage(
            self.context.storage.as_ref(),
            index_address,
            self.offset_size(),
            self.length_size(),
            selection.ndim as u32,
            chunk_dims,
            bounds,
        ),
        Some(ChunkIndexing::Implicit) => Ok(chunk_index::collect_implicit_chunk_entries(
            index_address,
            selection.shape,
            chunk_dims,
            selection.elem_size,
            bounds,
        )),
        Some(ChunkIndexing::FixedArray { .. }) => {
            crate::fixed_array::collect_fixed_array_chunk_entries_storage(
                self.context.storage.as_ref(),
                index_address,
                self.offset_size(),
                self.length_size(),
                selection.shape,
                chunk_dims,
                bounds,
            )
        }
        Some(ChunkIndexing::ExtensibleArray { .. }) => {
            crate::extensible_array::collect_extensible_array_chunk_entries_storage(
                self.context.storage.as_ref(),
                index_address,
                self.offset_size(),
                self.length_size(),
                selection.shape,
                chunk_dims,
                bounds,
            )
        }
    };
    let entries = collected?;

    let shared = Arc::new(entries.clone());
    match cache_key {
        Some(key) => {
            self.chunk_entry_cache.lock().put(key, shared);
        }
        None => {
            // A failed `set` means the full list was cached in the meantime.
            let _ = self.full_chunk_entries.set(shared);
        }
    }
    Ok(entries)
}
fn collect_btree_v1_entries(
&self,
btree_address: u64,
ndim: usize,
chunk_dims: &[u32],
chunk_bounds: Option<(&[u64], &[u64])>,
) -> Result<Vec<chunk_index::ChunkEntry>> {
let leaves = crate::btree_v1::collect_btree_v1_leaves_storage(
self.context.storage.as_ref(),
btree_address,
self.offset_size(),
self.length_size(),
Some(ndim as u32),
chunk_dims,
chunk_bounds,
)?;
let mut entries = Vec::with_capacity(leaves.len());
for (key, chunk_addr) in &leaves {
match key {
crate::btree_v1::BTreeV1Key::RawData {
chunk_size,
filter_mask,
offsets,
} => {
entries.push(chunk_index::ChunkEntry {
address: *chunk_addr,
size: *chunk_size as u64,
filter_mask: *filter_mask,
offsets: offsets[..ndim].to_vec(),
});
}
_ => {
return Err(Error::InvalidData(
"expected raw data key in chunk B-tree".into(),
))
}
}
}
Ok(entries)
}
/// Returns the decoded bytes of one chunk, going through the chunk cache.
///
/// The cache key is the pair (`dataset_addr`, chunk offsets). On a miss the
/// chunk is read from `entry.address`; the read length is `entry.size` when it
/// is non-zero and otherwise the byte size of a full chunk
/// (`chunk_shape` elements times `elem_size`). If the dataset has a filter
/// pipeline the bytes are run through `filters::apply_pipeline`; otherwise
/// they are used as read.
fn load_chunk_data(
    &self,
    entry: &chunk_index::ChunkEntry,
    dataset_addr: u64,
    chunk_shape: &[u64],
    elem_size: usize,
) -> Result<Arc<Vec<u8>>> {
    let key = ChunkKey {
        dataset_addr,
        chunk_offsets: smallvec::SmallVec::from_slice(&entry.offsets),
    };
    self.chunk_cache.get_or_insert_with(key, || {
        let stored_len = match entry.size {
            // No recorded size: fall back to the size of a full chunk.
            0 => {
                let elements_per_chunk =
                    checked_shape_elements_usize(chunk_shape, "chunk element count")?;
                checked_mul_usize(elements_per_chunk, elem_size, "chunk byte size")?
            }
            encoded => checked_usize(encoded, "encoded chunk size")?,
        };
        let stored = self.context.read_range(entry.address, stored_len)?;
        match &self.filters {
            Some(pipeline) => filters::apply_pipeline(
                stored.as_ref(),
                &pipeline.filters,
                entry.filter_mask,
                elem_size,
                Some(&self.filter_registry),
            ),
            None => Ok(stored.to_vec()),
        }
    })
}
/// Loads a chunk via `load_chunk_data` and checks its decoded length.
///
/// The decoded byte count is passed to `validate_decoded_chunk_len` together
/// with the entry, chunk shape and element size; the chunk is returned only if
/// that validation succeeds.
fn load_exact_chunk_data(
    &self,
    entry: &chunk_index::ChunkEntry,
    dataset_addr: u64,
    chunk_shape: &[u64],
    elem_size: usize,
) -> Result<Arc<Vec<u8>>> {
    self.load_chunk_data(entry, dataset_addr, chunk_shape, elem_size)
        .and_then(|decoded| {
            validate_decoded_chunk_len(entry, chunk_shape, elem_size, decoded.len())?;
            Ok(decoded)
        })
}
/// Reads a hyperslab selection from a chunked dataset and decodes it as `T`.
///
/// Flow, as visible in this function:
/// 1. An empty selection or an undefined `index_address` returns a fill array
///    of the result shape.
/// 2. The range of chunk indices touched by the selection is computed per
///    dimension and the matching chunk entries are collected and validated.
/// 3. Unit-stride selections whose chunk window is fully covered are copied
///    into an uninitialized buffer (typed or raw bytes).
/// 4. Everything else starts from a `make_output_buffer` buffer; unit-stride
///    selections use the overlap copy, strided selections build per-dimension
///    index pairs and copy element by element.
///
/// `_element_size` and `_selection` are unused; the selection is taken from
/// `resolved`.
fn read_chunked_slice<T: H5Type>(
&self,
index_address: u64,
chunk_dims: &[u32],
_element_size: u32,
chunk_indexing: Option<&ChunkIndexing>,
_selection: &SliceInfo,
resolved: &ResolvedSelection,
) -> Result<ArrayD<T>> {
if resolved.result_elements == 0 {
return self.make_fill_array_from_shape::<T>(0, &resolved.result_shape);
}
// No chunk index has been allocated: every selected element is the fill value.
if Cursor::is_undefined_offset(index_address, self.offset_size()) {
return self
.make_fill_array_from_shape::<T>(resolved.result_elements, &resolved.result_shape);
}
let ndim = self.ndim();
let shape = &self.dataspace.dims;
// NOTE(review): `read_chunked` derives the element size from `self.raw_element_size()`
// while this uses `dtype_element_size`; confirm the two agree for variable-length types.
let elem_size = dtype_element_size(&self.datatype);
let chunk_shape: Vec<u64> = chunk_dims.iter().map(|&d| d as u64).collect();
validate_chunk_shape(shape, &chunk_shape)?;
// Per-dimension first/last chunk index touched by the selection.
let mut first_chunk = vec![0u64; ndim];
let mut last_chunk = vec![0u64; ndim];
for d in 0..ndim {
let (first, last) = resolved.dims[d]
.chunk_index_range(chunk_shape[d])
.expect("zero-sized result handled above");
first_chunk[d] = first;
last_chunk[d] = last;
}
let mut overlapping = self.collect_chunk_entries(
index_address,
chunk_dims,
chunk_indexing,
ChunkEntrySelection {
shape,
ndim,
elem_size,
chunk_bounds: Some((&first_chunk, &last_chunk)),
},
)?;
let fully_covered_grid = validate_chunk_grid_coverage(
&mut overlapping,
shape,
&chunk_shape,
&first_chunk,
&last_chunk,
)?;
let result_total_bytes = checked_mul_usize(
resolved.result_elements,
elem_size,
"slice result size in bytes",
)?;
let result_dims = resolved.result_dims_with_collapsed();
// Row-major strides (in elements) of the result and of a single chunk.
let mut result_strides = vec![1usize; ndim];
for d in (0..ndim.saturating_sub(1)).rev() {
result_strides[d] =
checked_mul_usize(result_strides[d + 1], result_dims[d + 1], "result stride")?;
}
let mut chunk_strides = vec![1usize; ndim];
for d in (0..ndim.saturating_sub(1)).rev() {
chunk_strides[d] = checked_mul_usize(
chunk_strides[d + 1],
chunk_shape[d + 1] as usize,
"chunk stride",
)?;
}
let use_unit_stride_fast_path = resolved.is_unit_stride();
let fully_covered_unit_stride = use_unit_stride_fast_path && fully_covered_grid;
if fully_covered_unit_stride {
// Typed path: write chunk bytes straight into the `T` buffer, skipping the decode step.
if T::native_copy_compatible(&self.datatype) && std::mem::size_of::<T>() == elem_size {
let mut result_values: Vec<MaybeUninit<T>> =
std::iter::repeat_with(MaybeUninit::<T>::uninit)
.take(resolved.result_elements)
.collect();
let result_ptr = result_values.as_mut_ptr() as *mut u8;
let result_len = checked_mul_usize(
result_values.len(),
std::mem::size_of::<T>(),
"typed slice result size in bytes",
)?;
for entry in &overlapping {
let chunk_data =
self.load_exact_chunk_data(entry, index_address, &chunk_shape, elem_size)?;
// SAFETY (as far as visible here): `result_ptr`/`result_len` describe the live
// `result_values` allocation. The callee's bounds handling is defined elsewhere
// — TODO confirm it never writes past `len`.
unsafe {
copy_unit_stride_chunk_overlap_ptr(
&chunk_data,
FlatBufferPtr {
ptr: result_ptr,
len: result_len,
},
UnitStrideCopyLayout {
chunk_offsets: &entry.offsets,
chunk_shape: &chunk_shape,
dataset_shape: shape,
resolved,
chunk_strides: &chunk_strides,
result_strides: &result_strides,
elem_size,
},
)?;
}
}
// NOTE(review): treating the buffer as initialized relies on full grid coverage
// implying that every selected element was written — confirm.
let result_values = assume_init_vec(result_values);
return ArrayD::from_shape_vec(IxDyn(&resolved.result_shape), result_values)
.map_err(|e| Error::InvalidData(format!("array shape error: {e}")));
}
// Untyped path: assemble raw bytes, then decode them with the result shape.
let mut result_buf = vec![MaybeUninit::<u8>::uninit(); result_total_bytes];
let result_ptr = result_buf.as_mut_ptr() as *mut u8;
let result_len = result_buf.len();
for entry in &overlapping {
let chunk_data =
self.load_exact_chunk_data(entry, index_address, &chunk_shape, elem_size)?;
// SAFETY (as far as visible here): `result_ptr`/`result_len` describe the live
// `result_buf` allocation; see the note on the typed path.
unsafe {
copy_unit_stride_chunk_overlap_ptr(
&chunk_data,
FlatBufferPtr {
ptr: result_ptr,
len: result_len,
},
UnitStrideCopyLayout {
chunk_offsets: &entry.offsets,
chunk_shape: &chunk_shape,
dataset_shape: shape,
resolved,
chunk_strides: &chunk_strides,
result_strides: &result_strides,
elem_size,
},
)?;
}
}
let result_buf = assume_init_u8_vec(result_buf);
return self.decode_buffer_with_shape::<T>(
&result_buf,
resolved.result_elements,
&resolved.result_shape,
);
}
// General path: start from `make_output_buffer` (presumably pre-filled with the fill
// value — confirm) and overwrite the parts covered by stored chunks.
let mut result_buf = self.make_output_buffer(result_total_bytes);
for entry in &overlapping {
let chunk_data =
self.load_exact_chunk_data(entry, index_address, &chunk_shape, elem_size)?;
if use_unit_stride_fast_path {
copy_unit_stride_chunk_overlap(
&chunk_data,
&mut result_buf,
UnitStrideCopyLayout {
chunk_offsets: &entry.offsets,
chunk_shape: &chunk_shape,
dataset_shape: shape,
resolved,
chunk_strides: &chunk_strides,
result_strides: &result_strides,
elem_size,
},
)?;
continue;
}
// Strided selection: for each dimension list the selected coordinates that fall inside
// this chunk as (index within the chunk, index within the result) pairs.
let mut dim_indices: Vec<Vec<(usize, usize)>> = Vec::with_capacity(ndim);
for d in 0..ndim {
let chunk_start = entry.offsets[d];
// Edge chunks are clipped to the dataset extent.
let chunk_end = (chunk_start + chunk_shape[d]).min(shape[d]);
let dim = &resolved.dims[d];
let sel_start = dim.start;
let sel_end = dim.end;
let sel_step = dim.step;
let mut indices = Vec::new();
// First selected coordinate at or after the chunk start: advance from `sel_start`
// by whole steps, rounding the distance up.
let first_sel = if sel_start >= chunk_start {
sel_start
} else {
let steps_to_skip = (chunk_start - sel_start).div_ceil(sel_step);
sel_start + steps_to_skip * sel_step
};
let mut sel_idx = first_sel;
while sel_idx < sel_end && sel_idx < chunk_end {
let chunk_local = checked_usize(sel_idx - chunk_start, "chunk-local index")?;
let result_dim_idx =
checked_usize((sel_idx - dim.start) / sel_step, "result index")?;
indices.push((chunk_local, result_dim_idx));
sel_idx += sel_step;
}
dim_indices.push(indices);
}
copy_selected_elements(
&chunk_data,
&mut result_buf,
&dim_indices,
&chunk_strides,
&result_strides,
elem_size,
ndim,
)?;
}
self.decode_buffer_with_shape::<T>(
&result_buf,
resolved.result_elements,
&resolved.result_shape,
)
}
/// Rayon-parallel counterpart of `read_chunked_slice`: reads a hyperslab
/// selection from a chunked dataset and decodes it as `T`, loading and copying
/// the overlapping chunks through `par_iter`.
///
/// The control flow mirrors `read_chunked_slice`: fill array for an empty
/// selection or undefined index address, chunk window computation, entry
/// collection and validation, then either an uninitialized-buffer path
/// (unit-stride selection with a fully covered chunk window) or a general path
/// over a `make_output_buffer` buffer.
///
/// `_element_size` and `_selection` are unused; the selection is taken from
/// `resolved`.
#[cfg(feature = "rayon")]
fn read_chunked_slice_parallel<T: H5Type>(
&self,
index_address: u64,
chunk_dims: &[u32],
_element_size: u32,
chunk_indexing: Option<&ChunkIndexing>,
_selection: &SliceInfo,
resolved: &ResolvedSelection,
) -> Result<ArrayD<T>> {
if resolved.result_elements == 0 {
return self.make_fill_array_from_shape::<T>(0, &resolved.result_shape);
}
// No chunk index has been allocated: every selected element is the fill value.
if Cursor::is_undefined_offset(index_address, self.offset_size()) {
return self
.make_fill_array_from_shape::<T>(resolved.result_elements, &resolved.result_shape);
}
let ndim = self.ndim();
let shape = &self.dataspace.dims;
// NOTE(review): `read_chunked_parallel` derives the element size from
// `self.raw_element_size()` while this uses `dtype_element_size`; confirm the two agree
// for variable-length types.
let elem_size = dtype_element_size(&self.datatype);
let chunk_shape: Vec<u64> = chunk_dims.iter().map(|&d| d as u64).collect();
validate_chunk_shape(shape, &chunk_shape)?;
// Per-dimension first/last chunk index touched by the selection.
let mut first_chunk = vec![0u64; ndim];
let mut last_chunk = vec![0u64; ndim];
for d in 0..ndim {
let (first, last) = resolved.dims[d]
.chunk_index_range(chunk_shape[d])
.expect("zero-sized result handled above");
first_chunk[d] = first;
last_chunk[d] = last;
}
let mut overlapping = self.collect_chunk_entries(
index_address,
chunk_dims,
chunk_indexing,
ChunkEntrySelection {
shape,
ndim,
elem_size,
chunk_bounds: Some((&first_chunk, &last_chunk)),
},
)?;
let fully_covered_grid = validate_chunk_grid_coverage(
&mut overlapping,
shape,
&chunk_shape,
&first_chunk,
&last_chunk,
)?;
let result_total_bytes = checked_mul_usize(
resolved.result_elements,
elem_size,
"slice result size in bytes",
)?;
let result_dims = resolved.result_dims_with_collapsed();
// Row-major strides (in elements) of the result and of a single chunk.
let mut result_strides = vec![1usize; ndim];
for d in (0..ndim.saturating_sub(1)).rev() {
result_strides[d] =
checked_mul_usize(result_strides[d + 1], result_dims[d + 1], "result stride")?;
}
let mut chunk_strides = vec![1usize; ndim];
for d in (0..ndim.saturating_sub(1)).rev() {
chunk_strides[d] = checked_mul_usize(
chunk_strides[d + 1],
chunk_shape[d + 1] as usize,
"chunk stride",
)?;
}
let use_unit_stride_fast_path = resolved.is_unit_stride();
let fully_covered_unit_stride = use_unit_stride_fast_path && fully_covered_grid;
if fully_covered_unit_stride {
// Typed path: write chunk bytes straight into the `T` buffer, skipping the decode step.
if T::native_copy_compatible(&self.datatype) && std::mem::size_of::<T>() == elem_size {
let mut result_values: Vec<MaybeUninit<T>> =
std::iter::repeat_with(MaybeUninit::<T>::uninit)
.take(resolved.result_elements)
.collect();
// `FlatBufferPtr` is `Copy`, so each parallel task uses its own copy of the pointer.
let flat = FlatBufferPtr {
ptr: result_values.as_mut_ptr() as *mut u8,
len: checked_mul_usize(
result_values.len(),
std::mem::size_of::<T>(),
"typed slice result size in bytes",
)?,
};
overlapping
.par_iter()
.map(|entry| {
let chunk_data = self.load_exact_chunk_data(
entry,
index_address,
&chunk_shape,
elem_size,
)?;
// SAFETY (as far as visible here): `flat` describes the live `result_values`
// allocation. Concurrent writes presume distinct chunks target disjoint result
// regions — TODO confirm the coverage validation enforces that.
unsafe {
flat.copy_unit_stride_chunk_overlap(
&chunk_data,
UnitStrideCopyLayout {
chunk_offsets: &entry.offsets,
chunk_shape: &chunk_shape,
dataset_shape: shape,
resolved,
chunk_strides: &chunk_strides,
result_strides: &result_strides,
elem_size,
},
)?;
}
Ok(())
})
// Collecting into `Result` surfaces an error from any task.
.collect::<std::result::Result<Vec<_>, Error>>()?;
// NOTE(review): treating the buffer as initialized relies on full grid coverage
// implying that every selected element was written — confirm.
let result_values = assume_init_vec(result_values);
return ArrayD::from_shape_vec(IxDyn(&resolved.result_shape), result_values)
.map_err(|e| Error::InvalidData(format!("array shape error: {e}")));
}
// Untyped path: assemble raw bytes in parallel, then decode them with the result shape.
let mut result_buf = vec![MaybeUninit::<u8>::uninit(); result_total_bytes];
let flat = FlatBufferPtr {
ptr: result_buf.as_mut_ptr() as *mut u8,
len: result_buf.len(),
};
overlapping
.par_iter()
.map(|entry| {
let chunk_data =
self.load_exact_chunk_data(entry, index_address, &chunk_shape, elem_size)?;
// SAFETY: same assumptions as the typed path, with `flat` describing `result_buf`.
unsafe {
flat.copy_unit_stride_chunk_overlap(
&chunk_data,
UnitStrideCopyLayout {
chunk_offsets: &entry.offsets,
chunk_shape: &chunk_shape,
dataset_shape: shape,
resolved,
chunk_strides: &chunk_strides,
result_strides: &result_strides,
elem_size,
},
)?;
}
Ok(())
})
.collect::<std::result::Result<Vec<_>, Error>>()?;
let result_buf = assume_init_u8_vec(result_buf);
return self.decode_buffer_with_shape::<T>(
&result_buf,
resolved.result_elements,
&resolved.result_shape,
);
}
// General path: start from `make_output_buffer` (presumably pre-filled with the fill
// value — confirm) and overwrite the parts covered by stored chunks.
let mut result_buf = self.make_output_buffer(result_total_bytes);
let flat = FlatBufferPtr {
ptr: result_buf.as_mut_ptr(),
len: result_buf.len(),
};
overlapping
.par_iter()
.map(|entry| {
let chunk_data =
self.load_exact_chunk_data(entry, index_address, &chunk_shape, elem_size)?;
if use_unit_stride_fast_path {
// SAFETY (as far as visible here): `flat` describes the live, initialized
// `result_buf`; disjointness of concurrent writes is presumed — TODO confirm.
unsafe {
flat.copy_unit_stride_chunk_overlap(
&chunk_data,
UnitStrideCopyLayout {
chunk_offsets: &entry.offsets,
chunk_shape: &chunk_shape,
dataset_shape: shape,
resolved,
chunk_strides: &chunk_strides,
result_strides: &result_strides,
elem_size,
},
)?;
}
return Ok(());
}
// Strided selection: for each dimension list the selected coordinates that fall inside
// this chunk as (index within the chunk, index within the result) pairs.
let mut dim_indices: Vec<Vec<(usize, usize)>> = Vec::with_capacity(ndim);
for d in 0..ndim {
let chunk_start = entry.offsets[d];
// Edge chunks are clipped to the dataset extent.
let chunk_end = (chunk_start + chunk_shape[d]).min(shape[d]);
let dim = &resolved.dims[d];
let sel_start = dim.start;
let sel_end = dim.end;
let sel_step = dim.step;
let mut indices = Vec::new();
// First selected coordinate at or after the chunk start: advance from `sel_start`
// by whole steps, rounding the distance up.
let first_sel = if sel_start >= chunk_start {
sel_start
} else {
let steps_to_skip = (chunk_start - sel_start).div_ceil(sel_step);
sel_start + steps_to_skip * sel_step
};
let mut sel_idx = first_sel;
while sel_idx < sel_end && sel_idx < chunk_end {
let chunk_local =
checked_usize(sel_idx - chunk_start, "chunk-local index")?;
let result_dim_idx =
checked_usize((sel_idx - dim.start) / sel_step, "result index")?;
indices.push((chunk_local, result_dim_idx));
sel_idx += sel_step;
}
dim_indices.push(indices);
}
// SAFETY (as far as visible here): `flat` describes the live `result_buf`; the index
// pairs are derived from the chunk and selection bounds above. The callee's bounds
// handling is defined elsewhere — TODO confirm.
unsafe {
flat.copy_selected(
&chunk_data,
&dim_indices,
&chunk_strides,
&result_strides,
elem_size,
ndim,
)?;
}
Ok(())
})
.collect::<std::result::Result<Vec<_>, Error>>()?;
self.decode_buffer_with_shape::<T>(
&result_buf,
resolved.result_elements,
&resolved.result_shape,
)
}
/// Reads a hyperslab selection from a contiguous-layout dataset and decodes it as `T`.
///
/// * An empty selection returns an empty fill array of the result shape.
/// * Without external files, an undefined `address` or zero `size` returns a
///   fill array; otherwise `size` is validated.
/// * A selection covering the whole dataset is forwarded to `read_contiguous`.
/// * Any other selection is gathered by `read_contiguous_slice_bytes_direct`
///   and decoded with the result shape.
fn read_contiguous_slice<T: H5Type>(
    &self,
    address: u64,
    size: u64,
    resolved: &ResolvedSelection,
) -> Result<ArrayD<T>> {
    let selected_elements = resolved.result_elements;
    let out_shape = &resolved.result_shape;
    if selected_elements == 0 {
        return self.make_fill_array_from_shape::<T>(0, out_shape);
    }

    // The layout address/size only matter when the data lives in this file.
    if self.external_files.is_none() {
        let unallocated = Cursor::is_undefined_offset(address, self.offset_size()) || size == 0;
        if unallocated {
            return self.make_fill_array_from_shape::<T>(selected_elements, out_shape);
        }
        let stored_len = checked_usize(size, "contiguous dataset size")?;
        self.validate_allocated_raw_data_len("contiguous", stored_len)?;
    }

    let dataset_dims = &self.dataspace.dims;
    if selection_covers_full_dataset(resolved, dataset_dims) {
        return self.read_contiguous::<T>(address, size);
    }

    let elem_size = self.raw_element_size();
    let result_total_bytes = checked_mul_usize(
        selected_elements,
        elem_size,
        "contiguous slice result size in bytes",
    )?;
    let dataset_strides = row_major_strides(dataset_dims, "contiguous dataset stride")?;
    let collapsed_dims = resolved.result_dims_with_collapsed();
    let result_strides = result_strides_for_dims(&collapsed_dims)?;

    let layout = ContiguousSliceDirectLayout {
        dataset_strides: &dataset_strides,
        result_strides: &result_strides,
        elem_size,
        result_total_bytes,
    };
    let gathered = self.read_contiguous_slice_bytes_direct(address, size, resolved, layout)?;
    self.decode_buffer_with_shape::<T>(&gathered, selected_elements, out_shape)
}
/// Gathers the raw bytes of a selection from contiguous storage into a new
/// buffer of `layout.result_total_bytes` bytes.
///
/// The selection is copied block by block: the dimensions before
/// `tail_start` are enumerated one combination at a time, and for each
/// combination one block of `block_elements` elements is read with
/// `read_contiguous_logical_range` and copied into the result.
///
/// # Errors
/// Returns `Error::InvalidData` when the layout rank does not match the
/// dataset rank, when a source or destination range falls outside its
/// buffer, or when a read returns an unexpected number of bytes. Overflow
/// and read errors from the helpers are propagated.
fn read_contiguous_slice_bytes_direct(
&self,
address: u64,
size: u64,
resolved: &ResolvedSelection,
layout: ContiguousSliceDirectLayout<'_>,
) -> Result<Vec<u8>> {
let shape = &self.dataspace.dims;
let ndim = shape.len();
// All per-dimension tables are indexed by `d < ndim` below, so reject any
// rank mismatch up front.
if resolved.dims.len() != ndim
|| layout.dataset_strides.len() != ndim
|| layout.result_strides.len() != ndim
{
return Err(Error::InvalidData(format!(
"contiguous slice layout rank does not match dataset rank {ndim}"
)));
}
// Logical storage length used for bounds checks: derived from the element
// count for external files, taken from the layout `size` otherwise.
let storage_len = if self.external_files.is_some() {
checked_mul_usize(
checked_usize(self.num_elements(), "dataset element count")?,
layout.elem_size,
"external dataset size",
)?
} else {
checked_usize(size, "contiguous dataset size")?
};
// NOTE(review): `tail_start` presumably marks the first dimension of the
// trailing run that can be read as a single block — confirm against
// `contiguous_slice_tail_start`.
let tail_start = contiguous_slice_tail_start(shape, resolved);
let block_elements = contiguous_slice_block_elements(resolved, tail_start)?;
let block_bytes = checked_mul_usize(
block_elements,
layout.elem_size,
"contiguous slice block size in bytes",
)?;
// The output starts out filled by `make_output_buffer` and is overwritten
// block by block.
let mut result_buf = self.make_output_buffer(layout.result_total_bytes);
// Number of blocks = product of the selection counts of the leading dims.
let prefix_blocks =
resolved.dims[..tail_start]
.iter()
.try_fold(1usize, |acc, dim| -> Result<usize> {
checked_mul_usize(acc, dim.count, "contiguous slice block count")
})?;
// Multi-dimensional counter over the leading dims (one entry per dim).
let mut counters = vec![0usize; tail_start];
for _ in 0..prefix_blocks {
let mut source_elem = 0usize;
let mut result_elem = 0usize;
// Leading dims: source coordinate is start + ordinal * step; the result
// coordinate is the ordinal itself.
for (d, &counter) in counters.iter().enumerate().take(tail_start) {
let ordinal = u64::try_from(counter).map_err(|_| {
Error::InvalidData("contiguous slice ordinal exceeds u64".to_string())
})?;
let coord = checked_add_u64(
resolved.dims[d].start,
checked_mul_u64(
ordinal,
resolved.dims[d].step,
"contiguous slice coordinate",
)?,
"contiguous slice coordinate",
)?;
let coord = checked_usize(coord, "contiguous slice source index")?;
let source_term =
checked_mul_usize(coord, layout.dataset_strides[d], "contiguous slice source")?;
let result_term = checked_mul_usize(
counter,
layout.result_strides[d],
"contiguous slice result",
)?;
source_elem =
checked_add_usize(source_elem, source_term, "contiguous slice source")?;
result_elem =
checked_add_usize(result_elem, result_term, "contiguous slice result")?;
}
// Trailing dims contribute only their selection start to the source
// offset; they add nothing to the result offset.
for (d, &dataset_stride) in layout
.dataset_strides
.iter()
.enumerate()
.take(ndim)
.skip(tail_start)
{
let coord = checked_usize(resolved.dims[d].start, "contiguous slice source index")?;
let source_term =
checked_mul_usize(coord, dataset_stride, "contiguous slice source")?;
source_elem =
checked_add_usize(source_elem, source_term, "contiguous slice source")?;
}
// Convert element offsets to byte ranges and bounds-check both sides
// before touching storage.
let source_start = checked_mul_usize(
source_elem,
layout.elem_size,
"contiguous slice source byte offset",
)?;
let source_end = checked_add_usize(
source_start,
block_bytes,
"contiguous slice source byte end",
)?;
if source_end > storage_len {
return Err(Error::InvalidData(format!(
"contiguous slice range {}..{} exceeds dataset storage size {}",
source_start, source_end, storage_len
)));
}
let dst_start = checked_mul_usize(
result_elem,
layout.elem_size,
"contiguous slice destination byte offset",
)?;
let dst_end = checked_add_usize(
dst_start,
block_bytes,
"contiguous slice destination byte end",
)?;
if dst_end > result_buf.len() {
return Err(Error::InvalidData(format!(
"contiguous slice destination range {}..{} exceeds result size {}",
dst_start,
dst_end,
result_buf.len()
)));
}
let block = self.read_contiguous_logical_range(address, source_start, block_bytes)?;
// `copy_from_slice` panics on a length mismatch, so verify the read size.
if block.len() != block_bytes {
return Err(Error::InvalidData(format!(
"contiguous slice read returned {} bytes, expected {}",
block.len(),
block_bytes
)));
}
result_buf[dst_start..dst_end].copy_from_slice(&block);
// Advance the counter with carry, last leading dimension fastest.
let mut carry = true;
for d in (0..tail_start).rev() {
if carry {
counters[d] += 1;
if counters[d] < resolved.dims[d].count {
carry = false;
} else {
counters[d] = 0;
}
}
}
}
Ok(result_buf)
}
/// Reads a selection from compact storage by decoding the complete dataset
/// from `data` and then slicing the decoded array with `selection`.
fn read_compact_slice<T: H5Type>(
    &self,
    data: &[u8],
    selection: &SliceInfo,
) -> Result<ArrayD<T>> {
    let dataset_dims = &self.dataspace.dims;
    self.read_compact::<T>(data)
        .and_then(|decoded| slice_array(&decoded, selection, dataset_dims))
}
/// Decodes `n` elements of `T` from `raw` and arranges them as an array of
/// the given `shape`.
///
/// `raw` must contain exactly `n * raw_element_size()` bytes. The bulk
/// decoder `T::decode_vec` is used when it returns `Some`; otherwise every
/// element is decoded individually with `T::from_bytes`.
///
/// # Errors
/// `Error::InvalidData` on a byte-length mismatch or when the decoded
/// elements do not fit `shape`; decode errors are propagated.
fn decode_buffer_with_shape<T: H5Type>(
    &self,
    raw: &[u8],
    n: usize,
    shape: &[usize],
) -> Result<ArrayD<T>> {
    let width = self.raw_element_size();
    let required = checked_mul_usize(n, width, "decoded buffer byte length")?;
    if raw.len() != required {
        return Err(Error::InvalidData(format!(
            "decoded buffer has {} bytes, expected {} bytes",
            raw.len(),
            required
        )));
    }

    let values = match T::decode_vec(raw, &self.datatype, n) {
        // Bulk decoding is available for this type/datatype combination.
        Some(bulk) => bulk?,
        // Fall back to decoding one fixed-width element at a time.
        None => {
            let mut decoded = Vec::with_capacity(n);
            for index in 0..n {
                let lo = checked_mul_usize(index, width, "decoded element byte offset")?;
                let hi = checked_mul_usize(index + 1, width, "decoded element end offset")?;
                decoded.push(T::from_bytes(&raw[lo..hi], &self.datatype)?);
            }
            decoded
        }
    };

    ArrayD::from_shape_vec(IxDyn(shape), values)
        .map_err(|e| Error::InvalidData(format!("array shape error: {e}")))
}
fn decode_raw_data<T: H5Type>(&self, raw: &[u8]) -> Result<ArrayD<T>> {
let n = checked_usize(self.num_elements(), "dataset element count")?;
let mut shape = Vec::with_capacity(self.dataspace.dims.len());
for &dim in &self.dataspace.dims {
shape.push(checked_usize(dim, "dataset dimension")?);
}
self.decode_buffer_with_shape::<T>(raw, n, &shape)
}
fn make_fill_array<T: H5Type>(&self) -> Result<ArrayD<T>> {
let n = checked_usize(self.num_elements(), "dataset element count")?;
let mut shape = Vec::with_capacity(self.dataspace.dims.len());
for &dim in &self.dataspace.dims {
shape.push(checked_usize(dim, "dataset dimension")?);
}
self.make_fill_array_from_shape::<T>(n, &shape)
}
/// Builds an array of `element_count` elements with the given `shape`,
/// decoded from a buffer produced by `make_output_buffer`.
///
/// NOTE(review): the buffer is sized with `dtype_element_size`, while
/// `decode_buffer_with_shape` validates against `raw_element_size()`. If the
/// two can differ (e.g. for variable-length types) this call fails with a
/// length error — confirm they always agree.
fn make_fill_array_from_shape<T: H5Type>(
    &self,
    element_count: usize,
    shape: &[usize],
) -> Result<ArrayD<T>> {
    let fill_bytes = checked_mul_usize(
        element_count,
        dtype_element_size(&self.datatype),
        "fill result size in bytes",
    )
    .map(|total_bytes| self.make_output_buffer(total_bytes))?;
    self.decode_buffer_with_shape::<T>(&fill_bytes, element_count, shape)
}
/// Allocates a buffer of `total_bytes` bytes and initialises it through
/// `fill_output_buffer`.
fn make_output_buffer(&self, total_bytes: usize) -> Vec<u8> {
    let mut output = vec![0u8; total_bytes];
    self.fill_output_buffer(output.as_mut_slice());
    output
}
fn fill_output_buffer(&self, buf: &mut [u8]) {
buf.fill(0);
if let Some(ref fv) = self.fill_value {
if let Some(ref fill_bytes) = fv.value {
if !fill_bytes.is_empty() {
for chunk in buf.chunks_exact_mut(fill_bytes.len()) {
chunk.copy_from_slice(fill_bytes);
}
}
}
}
}
fn validate_allocated_raw_data_len(&self, storage_kind: &str, actual_len: usize) -> Result<()> {
let expected_len = self.raw_byte_len()?;
if actual_len != expected_len {
return Err(Error::InvalidData(format!(
"{storage_kind} raw data has {actual_len} bytes, expected {expected_len} bytes"
)));
}
Ok(())
}
/// Converts `bytes`, holding every element of this dataset, to the host
/// byte order in place.
fn convert_to_native_endian(&self, bytes: &mut [u8]) -> Result<()> {
    let element_count = checked_usize(self.num_elements(), "dataset element count")?;
    let reference_width = self.vlen_reference_size();
    convert_datatype_to_native_endian(&self.datatype, reference_width, bytes, element_count)
}
}
/// Returns the byte order of the compilation target.
fn native_byte_order() -> ByteOrder {
    let target_is_little = cfg!(target_endian = "little");
    match target_is_little {
        true => ByteOrder::LittleEndian,
        false => ByteOrder::BigEndian,
    }
}
/// Converts `count` elements of type `dtype` stored in `bytes` to the host
/// byte order, in place.
///
/// Numeric leaves (fixed-point, floating-point, bitfield) are byte-swapped
/// by `swap_elements_to_native`. Enums and arrays recurse into their base
/// type, compounds recurse into every field of every record, and strings,
/// variable-length data, opaque data and references are left untouched.
///
/// `vlen_reference_size` is only used to compute the raw size of compound
/// fields via `raw_element_size_for_datatype`.
///
/// # Errors
/// `Error::InvalidData` when `bytes` is too short for `count` compound
/// records or when a compound field extends past its record or the buffer;
/// overflow and length errors from the helpers are propagated.
fn convert_datatype_to_native_endian(
dtype: &Datatype,
vlen_reference_size: usize,
bytes: &mut [u8],
count: usize,
) -> Result<()> {
match dtype {
Datatype::FixedPoint {
size, byte_order, ..
}
| Datatype::FloatingPoint { size, byte_order }
| Datatype::Bitfield { size, byte_order } => {
swap_elements_to_native(bytes, count, *size as usize, *byte_order)
}
// An enum is stored as its base type.
Datatype::Enum { base, .. } => {
convert_datatype_to_native_endian(base, vlen_reference_size, bytes, count)
}
Datatype::Array { base, dims } => {
// Treat `count` arrays as one flat run of `count * product(dims)` base
// elements.
let array_count = dims.iter().try_fold(1usize, |acc, &dim| {
checked_mul_usize(
acc,
checked_usize(dim, "array datatype dimension")?,
"array datatype element count",
)
})?;
let total_count =
checked_mul_usize(count, array_count, "array datatype total element count")?;
convert_datatype_to_native_endian(base, vlen_reference_size, bytes, total_count)
}
Datatype::Compound { size, fields } => {
let record_size = *size as usize;
let required = checked_mul_usize(count, record_size, "compound byte length")?;
if bytes.len() < required {
return Err(Error::InvalidData(format!(
"compound native-endian conversion needs {required} bytes, got {}",
bytes.len()
)));
}
for record in 0..count {
let record_start =
checked_mul_usize(record, record_size, "compound record byte offset")?;
for field in fields {
let field_offset = field.byte_offset as usize;
let field_size =
raw_element_size_for_datatype(&field.datatype, vlen_reference_size);
let field_start = checked_add_usize(
record_start,
field_offset,
"compound field byte offset",
)?;
let field_end =
checked_add_usize(field_start, field_size, "compound field byte end")?;
// The field must lie inside both the buffer and its own record.
if field_end > bytes.len() || field_offset + field_size > record_size {
return Err(Error::InvalidData(format!(
"compound field '{}' range exceeds record size",
field.name
)));
}
// Each field occurrence is converted as a single element of its type.
convert_datatype_to_native_endian(
&field.datatype,
vlen_reference_size,
&mut bytes[field_start..field_end],
1,
)?;
}
}
Ok(())
}
// These types are returned unchanged by this function.
Datatype::String { .. }
| Datatype::VarLen { .. }
| Datatype::Opaque { .. }
| Datatype::Reference { .. } => Ok(()),
}
}
/// Reverses the bytes of each of the first `count` elements of `bytes`
/// when `byte_order` differs from the host order.
///
/// Elements of at most one byte, or data already in host order, are left
/// untouched. The buffer length is validated in every case.
///
/// # Errors
/// `Error::InvalidData` when `bytes` holds fewer than `count * elem_size`
/// bytes; the overflow error of that product is propagated.
fn swap_elements_to_native(
    bytes: &mut [u8],
    count: usize,
    elem_size: usize,
    byte_order: ByteOrder,
) -> Result<()> {
    let required = checked_mul_usize(count, elem_size, "native-endian byte length")?;
    let available = bytes.len();
    if available < required {
        return Err(Error::InvalidData(format!(
            "native-endian conversion needs {required} bytes, got {}",
            available
        )));
    }

    let needs_swap = elem_size > 1 && byte_order != native_byte_order();
    if needs_swap {
        bytes[..required]
            .chunks_exact_mut(elem_size)
            .for_each(<[u8]>::reverse);
    }
    Ok(())
}
/// Builds an [`Attribute`] from a parsed attribute message.
///
/// For a single-element variable-length string whose base type is a
/// one-byte fixed-point value, the raw data is replaced by the bytes that
/// `resolve_vlen_bytes_storage` returns; if that returns `None`, or for any
/// other datatype, the message's raw data is cloned unchanged.
///
/// The shape is empty for scalar dataspaces, `[0]` for null dataspaces and
/// the dataspace dimensions otherwise.
fn attribute_from_message_storage(message: &AttributeMessage, context: &FileContext) -> Attribute {
    let is_single_vlen_string = match &message.datatype {
        Datatype::VarLen {
            base,
            kind: VarLenKind::String,
            ..
        } => {
            matches!(base.as_ref(), Datatype::FixedPoint { size: 1, .. })
                && message.dataspace.num_elements() == 1
        }
        _ => false,
    };

    let resolved_payload = if is_single_vlen_string {
        resolve_vlen_bytes_storage(
            &message.raw_data,
            context.storage.as_ref(),
            context.superblock.offset_size,
            context.superblock.length_size,
        )
    } else {
        None
    };
    let raw_data = resolved_payload.unwrap_or_else(|| message.raw_data.clone());

    let shape = match message.dataspace.dataspace_type {
        DataspaceType::Scalar => Vec::new(),
        DataspaceType::Null => vec![0],
        DataspaceType::Simple => message.dataspace.dims.clone(),
    };

    Attribute {
        name: message.name.clone(),
        datatype: message.datatype.clone(),
        shape,
        raw_data,
    }
}
/// Normalises a chunked layout whose dimension list has one more entry
/// than the dataspace rank.
///
/// The extra trailing entry is removed from `dims`; it becomes the element
/// size only when the layout's own `element_size` is zero. Every other
/// layout is returned unchanged.
fn normalize_layout(layout: DataLayout, dataspace: &DataspaceMessage) -> DataLayout {
    let rank_with_trailing_entry = dataspace.dims.len() + 1;
    match layout {
        DataLayout::Chunked {
            address,
            mut dims,
            element_size,
            chunk_indexing,
        } if dims.len() == rank_with_trailing_entry => {
            let element_size = match dims.pop() {
                Some(legacy_element_size) if element_size == 0 => legacy_element_size,
                _ => element_size,
            };
            DataLayout::Chunked {
                address,
                dims,
                element_size,
                chunk_indexing,
            }
        }
        unchanged => unchanged,
    }
}
/// Returns the number of bytes one element of `dtype` occupies in raw
/// dataset storage.
///
/// Variable-length strings and variable-length sequences occupy
/// `vlen_reference_size` bytes. Arrays occupy the base element size times
/// the product of their dimensions; enums use their base type; every other
/// type reports its declared size.
///
/// Array sizes are computed with saturating arithmetic: a dimension product
/// that does not fit in `usize` yields `usize::MAX` instead of wrapping to a
/// small value, so later checked size arithmetic rejects it.
fn raw_element_size_for_datatype(dtype: &Datatype, vlen_reference_size: usize) -> usize {
    match dtype {
        Datatype::String {
            size: StringSize::Variable,
            ..
        }
        | Datatype::VarLen { .. } => vlen_reference_size,
        Datatype::Array { base, dims } => {
            let base_size = raw_element_size_for_datatype(base, vlen_reference_size);
            // Dimensions come from the file; never let their product wrap.
            dims.iter().fold(base_size, |bytes, &extent| {
                let extent = usize::try_from(extent).unwrap_or(usize::MAX);
                bytes.saturating_mul(extent)
            })
        }
        Datatype::Enum { base, .. } => raw_element_size_for_datatype(base, vlen_reference_size),
        Datatype::FixedPoint { size, .. }
        | Datatype::FloatingPoint { size, .. }
        | Datatype::Bitfield { size, .. }
        | Datatype::Reference { size, .. } => *size as usize,
        Datatype::String {
            size: StringSize::Fixed(len),
            ..
        } => *len as usize,
        Datatype::Compound { size, .. } | Datatype::Opaque { size, .. } => *size as usize,
    }
}
/// Test helper: copies one chunk into a flat row-major dataset buffer,
/// deriving both stride tables from the given shapes.
///
/// Panics if either stride table cannot be computed.
#[cfg(test)]
fn copy_chunk_to_flat(
    chunk_data: &[u8],
    flat: &mut [u8],
    chunk_offsets: &[u64],
    chunk_shape: &[u64],
    dataset_shape: &[u64],
    elem_size: usize,
) -> Result<()> {
    let dataset_strides = row_major_strides(dataset_shape, "dataset stride")
        .expect("dataset strides should fit in usize");
    let chunk_strides =
        row_major_strides(chunk_shape, "chunk stride").expect("chunk strides should fit in usize");
    let layout = ChunkCopyLayout {
        elem_size,
        chunk_offsets,
        chunk_shape,
        dataset_shape,
        chunk_strides: &chunk_strides,
        dataset_strides: &dataset_strides,
    };
    copy_chunk_to_flat_with_strides(chunk_data, flat, layout)
}
/// Safe entry point for copying one chunk into a flat dataset buffer using
/// precomputed strides.
fn copy_chunk_to_flat_with_strides(
    chunk_data: &[u8],
    flat: &mut [u8],
    layout: ChunkCopyLayout<'_>,
) -> Result<()> {
    let destination = FlatBufferPtr {
        len: flat.len(),
        ptr: flat.as_mut_ptr(),
    };
    // SAFETY: `destination` describes exactly the exclusively borrowed `flat`
    // slice, which cannot overlap the shared borrow `chunk_data`; the callee
    // bounds-checks every write against `destination.len`.
    unsafe { copy_chunk_to_flat_with_strides_ptr(chunk_data, destination, layout) }
}
/// Copies the in-bounds part of one chunk into a flat row-major dataset
/// buffer addressed through a raw pointer.
///
/// The chunk is clipped to the dataset extent in every dimension and copied
/// one row (a run along the last dimension) at a time. Every source and
/// destination range is bounds-checked before it is copied.
///
/// # Safety
/// `flat.ptr` must be valid for writes of `flat.len` bytes, and that region
/// must not overlap `chunk_data` (the copies use
/// `std::ptr::copy_nonoverlapping`).
/// NOTE(review): the raw-pointer form presumably exists so several callers
/// can write disjoint regions of one buffer concurrently — confirm against
/// the call sites.
///
/// # Errors
/// `Error::InvalidData` on a rank mismatch, a chunk offset outside the
/// dataset, or any out-of-bounds source/destination range; overflow errors
/// from the checked helpers are propagated.
#[inline(always)]
unsafe fn copy_chunk_to_flat_with_strides_ptr(
chunk_data: &[u8],
flat: FlatBufferPtr,
layout: ChunkCopyLayout<'_>,
) -> Result<()> {
let ndim = layout.dataset_shape.len();
// All layout tables are indexed by dimension below.
if layout.chunk_offsets.len() != ndim
|| layout.chunk_shape.len() != ndim
|| layout.dataset_strides.len() != ndim
|| layout.chunk_strides.len() != ndim
{
return Err(Error::InvalidData(format!(
"chunk copy layout rank does not match dataset rank {ndim}"
)));
}
// Rank 0: a single element is copied.
if ndim == 0 {
if chunk_data.len() < layout.elem_size || flat.len < layout.elem_size {
return Err(Error::InvalidData(format!(
"scalar chunk copy requires {} bytes, got source {} and destination {}",
layout.elem_size,
chunk_data.len(),
flat.len
)));
}
std::ptr::copy_nonoverlapping(chunk_data.as_ptr(), flat.ptr, layout.elem_size);
return Ok(());
}
// Clip the chunk extent in each dimension to what remains of the dataset.
let mut actual_chunk_shape = Vec::with_capacity(ndim);
for i in 0..ndim {
if layout.chunk_offsets[i] >= layout.dataset_shape[i] {
return Err(Error::InvalidData(format!(
"chunk offset {} is outside dimension {} of size {}",
layout.chunk_offsets[i], i, layout.dataset_shape[i]
)));
}
let remaining = layout.dataset_shape[i] - layout.chunk_offsets[i];
actual_chunk_shape.push(checked_usize(
remaining.min(layout.chunk_shape[i]),
"actual chunk extent",
)?);
}
// A row is the clipped extent along the last dimension.
let row_elems = *actual_chunk_shape.last().unwrap_or(&1);
let row_bytes = checked_mul_usize(row_elems, layout.elem_size, "chunk row bytes")?;
// Element offset of the chunk's first element within the flat dataset.
let mut dataset_origin = 0usize;
for (d, offset) in layout.chunk_offsets.iter().enumerate() {
let offset = checked_usize(*offset, "chunk offset")?;
let term = checked_mul_usize(offset, layout.dataset_strides[d], "chunk origin")?;
dataset_origin = checked_add_usize(dataset_origin, term, "chunk origin")?;
}
// Rank 1: the whole clipped chunk is one row starting at the chunk's first byte.
if ndim == 1 {
let dst_start = checked_mul_usize(dataset_origin, layout.elem_size, "chunk dst offset")?;
let dst_end = checked_add_usize(dst_start, row_bytes, "chunk dst end")?;
if row_bytes > chunk_data.len() || dst_end > flat.len {
return Err(Error::InvalidData(format!(
"chunk copy out of bounds: source row needs {} bytes from {} bytes, destination range {}..{} exceeds {} bytes",
row_bytes,
chunk_data.len(),
dst_start,
dst_end,
flat.len
)));
}
std::ptr::copy_nonoverlapping(chunk_data.as_ptr(), flat.ptr.add(dst_start), row_bytes);
return Ok(());
}
// General case: iterate over every combination of the outer (non-last)
// dimensions and copy one row per combination.
let outer_dims = &actual_chunk_shape[..ndim - 1];
let total_rows = checked_product_usize(outer_dims, "chunk row count")?;
let mut outer_idx = vec![0usize; ndim - 1];
for _ in 0..total_rows {
let mut chunk_row = 0usize;
let mut dataset_row = dataset_origin;
for (d, outer) in outer_idx.iter().copied().enumerate() {
let chunk_term = checked_mul_usize(outer, layout.chunk_strides[d], "chunk row")?;
let dataset_term = checked_mul_usize(outer, layout.dataset_strides[d], "dataset row")?;
chunk_row = checked_add_usize(chunk_row, chunk_term, "chunk row")?;
dataset_row = checked_add_usize(dataset_row, dataset_term, "dataset row")?;
}
let src_start = checked_mul_usize(chunk_row, layout.elem_size, "chunk src offset")?;
let dst_start = checked_mul_usize(dataset_row, layout.elem_size, "chunk dst offset")?;
let src_end = checked_add_usize(src_start, row_bytes, "chunk src end")?;
let dst_end = checked_add_usize(dst_start, row_bytes, "chunk dst end")?;
// Both ranges must be in bounds before the raw copy below.
if src_end > chunk_data.len() || dst_end > flat.len {
return Err(Error::InvalidData(format!(
"chunk copy out of bounds: source range {}..{} of {} bytes, destination range {}..{} of {} bytes",
src_start,
src_end,
chunk_data.len(),
dst_start,
dst_end,
flat.len
)));
}
std::ptr::copy_nonoverlapping(
chunk_data.as_ptr().add(src_start),
flat.ptr.add(dst_start),
row_bytes,
);
// Advance the outer index with carry, last outer dimension fastest.
let mut carry = true;
for d in (0..outer_idx.len()).rev() {
if carry {
outer_idx[d] += 1;
if outer_idx[d] < outer_dims[d] {
carry = false;
} else {
outer_idx[d] = 0;
}
}
}
}
Ok(())
}
/// Multiplies all `values` together, failing through `checked_mul_usize`
/// (labelled with `context`) on overflow. An empty slice yields 1.
fn checked_product_usize(values: &[usize], context: &str) -> Result<usize> {
    values
        .iter()
        .try_fold(1usize, |running, &factor| checked_mul_usize(running, factor, context))
}
/// Computes, per dimension, how a chunk overlaps a unit-stride selection.
///
/// Returns `(overlap_counts, chunk_local_start, result_start)`:
/// the number of overlapping elements, the overlap start relative to the
/// chunk origin, and the overlap start relative to the selection start.
/// When the chunk and the selection do not overlap in some dimension, three
/// empty vectors are returned.
///
/// All slices and `resolved.dims` are indexed up to `dataset_shape.len()`;
/// callers must pass inputs of matching rank.
///
/// # Errors
/// Propagates the `checked_usize` error when an overlap value does not fit
/// in `usize`.
fn unit_stride_chunk_overlap_plan(
    chunk_offsets: &[u64],
    chunk_shape: &[u64],
    dataset_shape: &[u64],
    resolved: &ResolvedSelection,
) -> Result<(Vec<usize>, Vec<usize>, Vec<usize>)> {
    let ndim = dataset_shape.len();
    let mut overlap_counts = Vec::with_capacity(ndim);
    let mut chunk_local_start = Vec::with_capacity(ndim);
    let mut result_start = Vec::with_capacity(ndim);
    for d in 0..ndim {
        let chunk_start = chunk_offsets[d];
        // Offsets and extents come from the file: saturate instead of
        // overflowing. The result is clamped to the dataset extent anyway, so
        // saturation yields the mathematically correct end.
        let chunk_end = chunk_start
            .saturating_add(chunk_shape[d])
            .min(dataset_shape[d]);
        let dim = &resolved.dims[d];
        let overlap_start = chunk_start.max(dim.start);
        let overlap_end = chunk_end.min(dim.end);
        if overlap_start >= overlap_end {
            // No overlap in this dimension means no overlap at all.
            return Ok((Vec::new(), Vec::new(), Vec::new()));
        }
        overlap_counts.push(checked_usize(
            overlap_end - overlap_start,
            "chunk overlap size",
        )?);
        chunk_local_start.push(checked_usize(
            overlap_start - chunk_start,
            "chunk overlap start",
        )?);
        result_start.push(checked_usize(
            overlap_start - dim.start,
            "slice result overlap start",
        )?);
    }
    Ok((overlap_counts, chunk_local_start, result_start))
}
/// Safe entry point for copying the part of a chunk that overlaps a
/// unit-stride selection into the result buffer.
#[inline(always)]
fn copy_unit_stride_chunk_overlap(
    chunk_data: &[u8],
    result_buf: &mut [u8],
    layout: UnitStrideCopyLayout<'_>,
) -> Result<()> {
    let destination = FlatBufferPtr {
        len: result_buf.len(),
        ptr: result_buf.as_mut_ptr(),
    };
    // SAFETY: `destination` describes exactly the exclusively borrowed
    // `result_buf` slice, which cannot overlap the shared borrow `chunk_data`;
    // the callee bounds-checks every write against `destination.len`.
    unsafe { copy_unit_stride_chunk_overlap_ptr(chunk_data, destination, layout) }
}
/// Copies the region where a chunk overlaps a unit-stride selection from
/// `chunk_data` into a result buffer addressed through a raw pointer.
///
/// The overlap is computed by `unit_stride_chunk_overlap_plan` and copied
/// one row (a run along the last dimension) at a time. Nothing is copied
/// when the chunk does not overlap the selection. Every source and
/// destination range is bounds-checked before it is copied.
///
/// # Safety
/// `result.ptr` must be valid for writes of `result.len` bytes, and that
/// region must not overlap `chunk_data` (the copies use
/// `std::ptr::copy_nonoverlapping`).
///
/// # Errors
/// `Error::InvalidData` on a rank mismatch or any out-of-bounds range;
/// overflow errors from the checked helpers are propagated.
#[inline(always)]
unsafe fn copy_unit_stride_chunk_overlap_ptr(
chunk_data: &[u8],
result: FlatBufferPtr,
layout: UnitStrideCopyLayout<'_>,
) -> Result<()> {
let ndim = layout.dataset_shape.len();
// All layout tables are indexed by dimension below and in the overlap plan.
if layout.chunk_offsets.len() != ndim
|| layout.chunk_shape.len() != ndim
|| layout.resolved.dims.len() != ndim
|| layout.chunk_strides.len() != ndim
|| layout.result_strides.len() != ndim
{
return Err(Error::InvalidData(format!(
"unit-stride copy layout rank does not match dataset rank {ndim}"
)));
}
// Rank 0: a single element is copied.
if ndim == 0 {
if chunk_data.len() < layout.elem_size || result.len < layout.elem_size {
return Err(Error::InvalidData(format!(
"scalar slice copy requires {} bytes, got source {} and destination {}",
layout.elem_size,
chunk_data.len(),
result.len
)));
}
std::ptr::copy_nonoverlapping(chunk_data.as_ptr(), result.ptr, layout.elem_size);
return Ok(());
}
let (overlap_counts, chunk_local_start, result_start) = unit_stride_chunk_overlap_plan(
layout.chunk_offsets,
layout.chunk_shape,
layout.dataset_shape,
layout.resolved,
)?;
// Empty plan: the chunk does not intersect the selection.
if overlap_counts.is_empty() {
return Ok(());
}
// A row is the overlap extent along the last dimension.
let row_elems = *overlap_counts.last().unwrap_or(&1);
let row_bytes = checked_mul_usize(row_elems, layout.elem_size, "unit-stride slice row bytes")?;
// Element offsets of the overlap's first element in the chunk and in the
// result.
let mut chunk_origin = 0usize;
let mut result_origin = 0usize;
for d in 0..ndim {
let chunk_term = checked_mul_usize(
chunk_local_start[d],
layout.chunk_strides[d],
"chunk overlap origin",
)?;
let result_term = checked_mul_usize(
result_start[d],
layout.result_strides[d],
"slice result origin",
)?;
chunk_origin = checked_add_usize(chunk_origin, chunk_term, "chunk overlap origin")?;
result_origin = checked_add_usize(result_origin, result_term, "slice result origin")?;
}
// Rank 1: the overlap is a single row.
if ndim == 1 {
let src_start = checked_mul_usize(chunk_origin, layout.elem_size, "slice src offset")?;
let dst_start = checked_mul_usize(result_origin, layout.elem_size, "slice dst offset")?;
let src_end = checked_add_usize(src_start, row_bytes, "slice src end")?;
let dst_end = checked_add_usize(dst_start, row_bytes, "slice dst end")?;
if src_end > chunk_data.len() || dst_end > result.len {
return Err(Error::InvalidData(format!(
"unit-stride slice copy out of bounds: source range {}..{} of {} bytes, destination range {}..{} of {} bytes",
src_start,
src_end,
chunk_data.len(),
dst_start,
dst_end,
result.len
)));
}
std::ptr::copy_nonoverlapping(
chunk_data.as_ptr().add(src_start),
result.ptr.add(dst_start),
row_bytes,
);
return Ok(());
}
// General case: one row per combination of the outer (non-last) dimensions.
let outer_counts = &overlap_counts[..ndim - 1];
let total_rows = checked_product_usize(outer_counts, "unit-stride slice row count")?;
let mut outer_idx = vec![0usize; ndim - 1];
for _ in 0..total_rows {
let mut chunk_row = chunk_origin;
let mut result_row = result_origin;
for (d, outer) in outer_idx.iter().copied().enumerate() {
let chunk_term = checked_mul_usize(outer, layout.chunk_strides[d], "slice chunk row")?;
let result_term =
checked_mul_usize(outer, layout.result_strides[d], "slice result row")?;
chunk_row = checked_add_usize(chunk_row, chunk_term, "slice chunk row")?;
result_row = checked_add_usize(result_row, result_term, "slice result row")?;
}
let src_start = checked_mul_usize(chunk_row, layout.elem_size, "slice src offset")?;
let dst_start = checked_mul_usize(result_row, layout.elem_size, "slice dst offset")?;
let src_end = checked_add_usize(src_start, row_bytes, "slice src end")?;
let dst_end = checked_add_usize(dst_start, row_bytes, "slice dst end")?;
// Both ranges must be in bounds before the raw copy below.
if src_end > chunk_data.len() || dst_end > result.len {
return Err(Error::InvalidData(format!(
"unit-stride slice copy out of bounds: source range {}..{} of {} bytes, destination range {}..{} of {} bytes",
src_start,
src_end,
chunk_data.len(),
dst_start,
dst_end,
result.len
)));
}
std::ptr::copy_nonoverlapping(
chunk_data.as_ptr().add(src_start),
result.ptr.add(dst_start),
row_bytes,
);
// Advance the outer index with carry, last outer dimension fastest.
let mut carry = true;
for d in (0..outer_idx.len()).rev() {
if carry {
outer_idx[d] += 1;
if outer_idx[d] < outer_counts[d] {
carry = false;
} else {
outer_idx[d] = 0;
}
}
}
}
Ok(())
}
/// Copies individually selected elements from a chunk into the result
/// buffer.
///
/// `dim_indices[d]` lists, for dimension `d`, pairs of
/// `(index within the chunk, index within the result)`. Every combination of
/// one pair per dimension identifies one element to copy. Nothing is copied
/// when any dimension has no pairs.
///
/// # Errors
/// `Error::InvalidData` on a rank mismatch or an out-of-bounds source or
/// destination range; overflow errors from the checked helpers are
/// propagated.
#[allow(clippy::too_many_arguments)]
#[inline(always)]
fn copy_selected_elements(
    chunk_data: &[u8],
    result_buf: &mut [u8],
    dim_indices: &[Vec<(usize, usize)>],
    chunk_strides: &[usize],
    result_strides: &[usize],
    elem_size: usize,
    ndim: usize,
) -> Result<()> {
    let ranks_agree = [dim_indices.len(), chunk_strides.len(), result_strides.len()]
        .iter()
        .all(|&len| len == ndim);
    if !ranks_agree {
        return Err(Error::InvalidData(format!(
            "selected-element copy layout rank does not match rank {ndim}"
        )));
    }
    if dim_indices.iter().any(Vec::is_empty) {
        return Ok(());
    }

    let extents: Vec<usize> = dim_indices.iter().map(Vec::len).collect();
    let total = checked_product_usize(&extents, "selected-element copy count")?;
    // Position within each dimension's pair list for the current element.
    let mut cursor = vec![0usize; ndim];

    for _ in 0..total {
        let mut src_elem = 0usize;
        let mut dst_elem = 0usize;
        for (axis, &position) in cursor.iter().enumerate() {
            let (chunk_local, result_index) = dim_indices[axis][position];
            let src_term =
                checked_mul_usize(chunk_local, chunk_strides[axis], "selected chunk offset")?;
            let dst_term =
                checked_mul_usize(result_index, result_strides[axis], "selected result offset")?;
            src_elem = checked_add_usize(src_elem, src_term, "selected chunk offset")?;
            dst_elem = checked_add_usize(dst_elem, dst_term, "selected result offset")?;
        }

        let src_start = checked_mul_usize(src_elem, elem_size, "selected source byte offset")?;
        let dst_start =
            checked_mul_usize(dst_elem, elem_size, "selected destination byte offset")?;
        let src_end = checked_add_usize(src_start, elem_size, "selected source byte end")?;
        let dst_end = checked_add_usize(dst_start, elem_size, "selected destination byte end")?;
        if src_end > chunk_data.len() || dst_end > result_buf.len() {
            return Err(Error::InvalidData(format!(
                "selected-element copy out of bounds: source range {}..{} of {} bytes, destination range {}..{} of {} bytes",
                src_start,
                src_end,
                chunk_data.len(),
                dst_start,
                dst_end,
                result_buf.len()
            )));
        }
        result_buf[dst_start..dst_end].copy_from_slice(&chunk_data[src_start..src_end]);

        // Advance the cursor with carry, last dimension fastest.
        for axis in (0..ndim).rev() {
            cursor[axis] += 1;
            if cursor[axis] < extents[axis] {
                break;
            }
            cursor[axis] = 0;
        }
    }
    Ok(())
}
/// Raw-pointer variant of the selected-element copy, compiled only with the
/// `rayon` feature.
///
/// `dim_indices[d]` lists, for dimension `d`, pairs of
/// `(index within the chunk, index within the result)`; every combination of
/// one pair per dimension identifies one element to copy. Nothing is copied
/// when any dimension has no pairs. Every source and destination range is
/// bounds-checked before it is copied.
///
/// # Safety
/// `result_ptr` must be valid for writes of `result_len` bytes, and that
/// region must not overlap `chunk_data` (the copies use
/// `std::ptr::copy_nonoverlapping`).
/// NOTE(review): presumably used so parallel workers can write disjoint
/// elements of one result buffer — confirm against the call sites.
///
/// # Errors
/// `Error::InvalidData` on a rank mismatch or an out-of-bounds range;
/// overflow errors from the checked helpers are propagated.
#[cfg(feature = "rayon")]
#[allow(clippy::too_many_arguments)]
#[inline(always)]
unsafe fn copy_selected_elements_ptr(
chunk_data: &[u8],
result_ptr: *mut u8,
result_len: usize,
dim_indices: &[Vec<(usize, usize)>],
chunk_strides: &[usize],
result_strides: &[usize],
elem_size: usize,
ndim: usize,
) -> Result<()> {
// All per-dimension tables are indexed by `d < ndim` below.
if dim_indices.len() != ndim || chunk_strides.len() != ndim || result_strides.len() != ndim {
return Err(Error::InvalidData(format!(
"selected-element copy layout rank does not match rank {ndim}"
)));
}
// An empty pair list in any dimension means there is nothing to copy.
if dim_indices.iter().any(|v| v.is_empty()) {
return Ok(());
}
let counts: Vec<usize> = dim_indices.iter().map(|v| v.len()).collect();
let total = checked_product_usize(&counts, "selected-element copy count")?;
// Position within each dimension's pair list for the current element.
let mut counters = vec![0usize; ndim];
for _ in 0..total {
let mut chunk_flat = 0;
let mut result_flat = 0;
for d in 0..ndim {
let (cl, ri) = dim_indices[d][counters[d]];
let chunk_term = checked_mul_usize(cl, chunk_strides[d], "selected chunk offset")?;
let result_term = checked_mul_usize(ri, result_strides[d], "selected result offset")?;
chunk_flat = checked_add_usize(chunk_flat, chunk_term, "selected chunk offset")?;
result_flat = checked_add_usize(result_flat, result_term, "selected result offset")?;
}
let src_start = checked_mul_usize(chunk_flat, elem_size, "selected source byte offset")?;
let dst_start =
checked_mul_usize(result_flat, elem_size, "selected destination byte offset")?;
let src_end = checked_add_usize(src_start, elem_size, "selected source byte end")?;
let dst_end = checked_add_usize(dst_start, elem_size, "selected destination byte end")?;
// Both ranges must be in bounds before the raw copy below.
if src_end > chunk_data.len() || dst_end > result_len {
return Err(Error::InvalidData(format!(
"selected-element copy out of bounds: source range {}..{} of {} bytes, destination range {}..{} of {} bytes",
src_start,
src_end,
chunk_data.len(),
dst_start,
dst_end,
result_len
)));
}
std::ptr::copy_nonoverlapping(
chunk_data.as_ptr().add(src_start),
result_ptr.add(dst_start),
elem_size,
);
// Advance the counters with carry, last dimension fastest.
let mut carry = true;
for d in (0..ndim).rev() {
if carry {
counters[d] += 1;
if counters[d] < dim_indices[d].len() {
carry = false;
} else {
counters[d] = 0;
}
}
}
}
Ok(())
}
fn slice_array<T: H5Type + Clone>(
array: &ArrayD<T>,
selection: &SliceInfo,
shape: &[u64],
) -> Result<ArrayD<T>> {
let mut result_shape = Vec::new();
for (i, sel) in selection.selections.iter().enumerate() {
let dim_size = shape[i];
match sel {
SliceInfoElem::Index(idx) => {
if *idx >= dim_size {
return Err(Error::SliceOutOfBounds {
dim: i,
index: *idx,
size: dim_size,
});
}
}
SliceInfoElem::Slice { start, end, step } => {
let dim_size = checked_usize(dim_size, "slice dimension size")?;
let actual_end = if *end == u64::MAX {
dim_size
} else {
checked_usize(*end, "slice end")?.min(dim_size)
};
let actual_start = checked_usize(*start, "slice start")?;
let actual_step = checked_usize(*step, "slice step")?;
if actual_step == 0 {
return Err(Error::InvalidData("slice step cannot be 0".into()));
}
if actual_start > dim_size {
return Err(Error::SliceOutOfBounds {
dim: i,
index: *start,
size: shape[i],
});
}
let n = (actual_end - actual_start).div_ceil(actual_step);
result_shape.push(n);
}
}
}
let ndim = shape.len();
let total = checked_product_usize(&result_shape, "slice result element count")?;
let mut elements = Vec::with_capacity(total);
let mut result_idx = vec![0usize; result_shape.len()];
for _ in 0..total {
let mut src_idx = Vec::with_capacity(ndim);
let mut ri = 0;
for sel in selection.selections.iter() {
match sel {
SliceInfoElem::Index(idx) => {
src_idx.push(checked_usize(*idx, "slice source index")?);
}
SliceInfoElem::Slice { start, step, .. } => {
let start = checked_usize(*start, "slice start")?;
let step = checked_usize(*step, "slice step")?;
let offset =
checked_mul_usize(result_idx[ri], step, "slice source index offset")?;
src_idx.push(checked_add_usize(start, offset, "slice source index")?);
ri += 1;
}
}
}
elements.push(array[IxDyn(&src_idx)].clone());
if !result_shape.is_empty() {
let mut carry = true;
for d in (0..result_shape.len()).rev() {
if carry {
result_idx[d] += 1;
if result_idx[d] < result_shape[d] {
carry = false;
} else {
result_idx[d] = 0;
}
}
}
}
}
ArrayD::from_shape_vec(IxDyn(&result_shape), elements)
.map_err(|e| Error::InvalidData(format!("slice shape error: {e}")))
}
#[cfg(test)]
mod tests {
use super::*;
use crate::storage::BytesStorage;
use crate::superblock::Superblock;
use std::collections::HashMap;
fn test_context(bytes: Vec<u8>) -> Arc<FileContext> {
let storage: DynStorage = Arc::new(BytesStorage::new(bytes));
Arc::new(FileContext {
storage,
superblock: Superblock {
version: 2,
offset_size: 8,
length_size: 8,
group_leaf_node_k: 0,
group_internal_node_k: 0,
indexed_storage_k: 0,
consistency_flags: 0,
base_address: 0,
free_space_address: u64::MAX,
eof_address: 0,
driver_info_address: u64::MAX,
root_symbol_table_entry: None,
root_object_header_address: Some(0),
extension_address: None,
},
chunk_cache: Arc::new(ChunkCache::new(1024, 8)),
header_cache: Arc::new(Mutex::new(HashMap::new())),
dataset_path_cache: Arc::new(Mutex::new(HashMap::new())),
filter_registry: Arc::new(FilterRegistry::default()),
external_file_resolver: None,
external_link_resolver: None,
external_file_cache: Mutex::new(HashMap::new()),
sohm_table: OnceLock::new(),
full_file_cache: OnceLock::new(),
})
}
fn fixed_u16_dataset(layout: DataLayout, storage_bytes: Vec<u8>) -> Dataset {
let context = test_context(storage_bytes);
Dataset {
context: context.clone(),
name: "short".to_string(),
data_address: 0,
dataspace: DataspaceMessage {
rank: 1,
dims: vec![3],
max_dims: None,
dataspace_type: DataspaceType::Simple,
},
datatype: Datatype::FixedPoint {
size: 2,
signed: false,
byte_order: ByteOrder::LittleEndian,
},
layout,
fill_value: None,
filters: None,
external_files: None,
attributes: Vec::new(),
chunk_cache: context.chunk_cache.clone(),
chunk_entry_cache: Arc::new(Mutex::new(LruCache::new(NonZeroUsize::new(32).unwrap()))),
full_chunk_entries: Arc::new(OnceLock::new()),
full_dataset_bytes: Arc::new(OnceLock::new()),
external_slots: Arc::new(OnceLock::new()),
filter_registry: context.filter_registry.clone(),
}
}
#[test]
fn test_slice_info_all() {
let s = SliceInfo::all(3);
assert_eq!(s.selections.len(), 3);
}
#[test]
fn test_raw_element_size_uses_file_vlen_reference_width() {
let dtype = Datatype::VarLen {
base: Box::new(Datatype::FixedPoint {
size: 1,
signed: false,
byte_order: crate::error::ByteOrder::LittleEndian,
}),
kind: VarLenKind::Sequence,
encoding: crate::messages::datatype::StringEncoding::Ascii,
padding: crate::messages::datatype::StringPadding::NullTerminate,
};
assert_eq!(raw_element_size_for_datatype(&dtype, 12), 12);
assert_eq!(
raw_element_size_for_datatype(
&Datatype::Array {
base: Box::new(dtype),
dims: vec![2, 3],
},
12,
),
72
);
}
#[test]
fn test_compact_raw_data_requires_exact_logical_length() {
let dataset = fixed_u16_dataset(
DataLayout::Compact {
data: vec![1, 0, 2, 0, 3],
},
Vec::new(),
);
let err = dataset.read_array::<u16>().unwrap_err();
assert!(
matches!(err, Error::Context { .. })
&& err
.to_string()
.contains("compact raw data has 5 bytes, expected 6 bytes"),
"expected compact raw length error, got: {err}"
);
}
#[test]
fn test_contiguous_raw_data_requires_exact_logical_length() {
let dataset = fixed_u16_dataset(
DataLayout::Contiguous {
address: 0,
size: 5,
},
vec![1, 0, 2, 0, 3],
);
let err = dataset.read_raw_bytes().unwrap_err();
assert!(
matches!(err, Error::Context { .. })
&& err
.to_string()
.contains("contiguous raw data has 5 bytes, expected 6 bytes"),
"expected contiguous raw length error, got: {err}"
);
}
#[test]
fn test_copy_chunk_1d() {
let chunk_data = vec![1u8, 2, 3, 4]; let mut flat = vec![0u8; 8];
let chunk_offsets = vec![2u64]; let chunk_shape = vec![4u64];
let dataset_shape = vec![8u64];
copy_chunk_to_flat(
&chunk_data,
&mut flat,
&chunk_offsets,
&chunk_shape,
&dataset_shape,
1,
)
.unwrap();
assert_eq!(flat, vec![0, 0, 1, 2, 3, 4, 0, 0]);
}
#[test]
fn test_copy_chunk_2d_rowwise() {
let chunk_data = vec![1u8, 2, 3, 4, 5, 6];
let mut flat = vec![0u8; 16];
let chunk_offsets = vec![1u64, 1u64];
let chunk_shape = vec![2u64, 3u64];
let dataset_shape = vec![4u64, 4u64];
copy_chunk_to_flat(
&chunk_data,
&mut flat,
&chunk_offsets,
&chunk_shape,
&dataset_shape,
1,
)
.unwrap();
assert_eq!(flat, vec![0, 0, 0, 0, 0, 1, 2, 3, 0, 4, 5, 6, 0, 0, 0, 0,]);
}
/// Copying the overlap between a 4x4 chunk (bytes 1..=16) and a unit-step
/// selection of rows 1..3 and columns 1..4 yields the matching 2x3 window.
#[test]
fn test_copy_unit_stride_chunk_overlap_2d_partial() {
    // Selection along the first axis: start 1, end 3, two elements.
    let row_selection = ResolvedSelectionDim {
        start: 1,
        end: 3,
        step: 1,
        count: 2,
    };
    // Selection along the second axis: start 1, end 4, three elements.
    let col_selection = ResolvedSelectionDim {
        start: 1,
        end: 4,
        step: 1,
        count: 3,
    };
    let resolved = ResolvedSelection {
        dims: vec![row_selection, col_selection],
        result_shape: vec![2, 3],
        result_elements: 6,
    };

    // A single chunk at the origin whose shape equals the dataset shape.
    let dataset_shape = vec![4u64, 4u64];
    let chunk_shape = vec![4u64, 4u64];
    let chunk_offsets = vec![0u64, 0u64];
    let chunk_strides = vec![4usize, 1usize];
    let result_strides = vec![3usize, 1usize];

    let chunk_bytes = (1..=16).collect::<Vec<u8>>();
    let mut selected = vec![0u8; 6];

    let layout = UnitStrideCopyLayout {
        chunk_offsets: &chunk_offsets,
        chunk_shape: &chunk_shape,
        dataset_shape: &dataset_shape,
        resolved: &resolved,
        chunk_strides: &chunk_strides,
        result_strides: &result_strides,
        elem_size: 1,
    };
    let outcome = copy_unit_stride_chunk_overlap(&chunk_bytes, &mut selected, layout);
    outcome.unwrap();

    let expected: Vec<u8> = vec![6, 7, 8, 10, 11, 12];
    assert_eq!(selected, expected);
}
/// Test helper: builds a `ChunkEntry` with the given offsets and address,
/// leaving `size` and `filter_mask` at zero.
fn chunk_entry(offsets: &[u64], address: u64) -> chunk_index::ChunkEntry {
    let offsets = offsets.to_vec();
    chunk_index::ChunkEntry {
        address,
        size: 0,
        filter_mask: 0,
        offsets,
    }
}
/// Grid validation succeeds but reports incomplete coverage when only three
/// chunk entries are supplied (none at offsets (2, 2)).
#[test]
fn test_chunk_grid_coverage_detects_missing_chunk() {
    let at_origin = chunk_entry(&[0, 0], 0x1000);
    let at_0_2 = chunk_entry(&[0, 2], 0x2000);
    let at_2_0 = chunk_entry(&[2, 0], 0x3000);
    let mut entries = vec![at_origin, at_0_2, at_2_0];

    // NOTE(review): the slice arguments are presumably the dataset shape,
    // the chunk shape, and the first/last chunk grid indices -- confirm
    // against `validate_chunk_grid_coverage`.
    let coverage =
        validate_chunk_grid_coverage(&mut entries, &[4, 4], &[2, 2], &[0, 0], &[1, 1]);
    let complete = coverage.unwrap();

    assert!(!complete);
}
/// Grid validation fails with `Error::InvalidData` when two entries share the
/// same chunk offsets.
#[test]
fn test_chunk_grid_coverage_rejects_duplicate_offsets() {
    // The first two entries both sit at offsets (0, 0) with different addresses.
    let first_at_origin = chunk_entry(&[0, 0], 0x1000);
    let second_at_origin = chunk_entry(&[0, 0], 0x2000);
    let at_0_2 = chunk_entry(&[0, 2], 0x3000);
    let at_2_0 = chunk_entry(&[2, 0], 0x4000);
    let mut entries = vec![first_at_origin, second_at_origin, at_0_2, at_2_0];

    let coverage =
        validate_chunk_grid_coverage(&mut entries, &[4, 4], &[2, 2], &[0, 0], &[1, 1]);
    let err = coverage.unwrap_err();

    assert!(matches!(err, Error::InvalidData(_)));
}
/// A decoded chunk length of 24 is accepted for the given shape and size
/// arguments, while 23 is rejected with `Error::InvalidData`.
#[test]
fn test_decoded_chunk_len_requires_exact_size() {
    let entry = chunk_entry(&[0, 0], 0x1000);

    // NOTE(review): 24 == 2 * 3 * 4, so the arguments are presumably the chunk
    // shape, the element size and the decoded byte length -- confirm.
    let exact = validate_decoded_chunk_len(&entry, &[2, 3], 4, 24);
    exact.unwrap();

    // One byte short of the accepted length.
    let short = validate_decoded_chunk_len(&entry, &[2, 3], 4, 23);
    let err = short.unwrap_err();
    assert!(matches!(err, Error::InvalidData(_)));
}
/// Copying a chunk whose buffer holds fewer bytes than its 2x3 shape implies
/// must fail with `Error::InvalidData`.
#[test]
fn test_copy_chunk_errors_on_short_row() {
    let dataset_shape = vec![4u64, 4u64];
    let chunk_shape = vec![2u64, 3u64];
    let chunk_offsets = vec![1u64, 1u64];

    // Only five bytes, although the chunk shape is 2x3.
    let truncated_chunk = vec![1u8, 2, 3, 4, 5];
    let mut dest = vec![0u8; 16];

    let outcome = copy_chunk_to_flat(
        &truncated_chunk,
        &mut dest,
        &chunk_offsets,
        &chunk_shape,
        &dataset_shape,
        1,
    );
    let err = outcome.unwrap_err();

    assert!(matches!(err, Error::InvalidData(_)));
}
/// Copying a selection overlap out of a chunk buffer that holds only 7 bytes
/// for a 4x4 chunk shape must fail with `Error::InvalidData`.
#[test]
fn test_copy_unit_stride_chunk_overlap_errors_on_short_row() {
    // Selection along the first axis: start 1, end 3, two elements.
    let row_selection = ResolvedSelectionDim {
        start: 1,
        end: 3,
        step: 1,
        count: 2,
    };
    // Selection along the second axis: start 1, end 4, three elements.
    let col_selection = ResolvedSelectionDim {
        start: 1,
        end: 4,
        step: 1,
        count: 3,
    };
    let resolved = ResolvedSelection {
        dims: vec![row_selection, col_selection],
        result_shape: vec![2, 3],
        result_elements: 6,
    };

    let dataset_shape = vec![4u64, 4u64];
    let chunk_shape = vec![4u64, 4u64];
    let chunk_offsets = vec![0u64, 0u64];
    let chunk_strides = vec![4usize, 1usize];
    let result_strides = vec![3usize, 1usize];

    // Seven bytes only, far fewer than the 4x4 chunk shape describes.
    let truncated_chunk = (1..=7).collect::<Vec<u8>>();
    let mut result = vec![0u8; 6];

    let layout = UnitStrideCopyLayout {
        chunk_offsets: &chunk_offsets,
        chunk_shape: &chunk_shape,
        dataset_shape: &dataset_shape,
        resolved: &resolved,
        chunk_strides: &chunk_strides,
        result_strides: &result_strides,
        elem_size: 1,
    };
    let outcome = copy_unit_stride_chunk_overlap(&truncated_chunk, &mut result, layout);
    let err = outcome.unwrap_err();

    assert!(matches!(err, Error::InvalidData(_)));
}
}