pub mod checksum;
pub mod error;
pub mod io;
pub mod superblock;
pub mod btree_v1;
pub mod btree_v2;
pub mod chunk_index;
pub mod extensible_array;
pub mod fixed_array;
pub mod fractal_heap;
pub mod global_heap;
pub mod local_heap;
pub mod symbol_table;
pub mod messages;
pub mod object_header;
pub mod attribute_api;
pub mod dataset;
pub mod datatype_api;
pub mod group;
pub mod reference;
pub mod storage;
pub mod filters;
pub mod cache;
use std::collections::HashMap;
use std::path::Path;
use std::sync::{Arc, OnceLock};
use memmap2::Mmap;
use cache::ChunkCache;
use error::{Error, Result};
use group::Group;
use object_header::ObjectHeader;
use storage::DynStorage;
use superblock::Superblock;
pub use attribute_api::Attribute;
use dataset::DatasetTemplate;
pub use dataset::{Dataset, SliceInfo, SliceInfoElem};
pub use datatype_api::{
dtype_element_size, CompoundField, EnumMember, H5Type, ReferenceType, StringEncoding,
StringPadding, StringSize,
};
pub use error::ByteOrder;
pub use filters::FilterRegistry;
pub use messages::datatype::Datatype;
pub use storage::{BytesStorage, FileStorage, MmapStorage, Storage, StorageBuffer};
/// Tuning knobs applied when opening an HDF5 file.
///
/// Obtain a baseline via [`OpenOptions::default`] and override individual
/// fields before passing to one of the `*_with_options` constructors.
pub struct OpenOptions {
    /// Total byte budget for the chunk cache.
    pub chunk_cache_bytes: usize,
    /// Number of slots in the chunk cache (default is prime: 521).
    pub chunk_cache_slots: usize,
    /// Custom filter registry; `None` falls back to the default registry.
    pub filter_registry: Option<FilterRegistry>,
}
impl Default for OpenOptions {
fn default() -> Self {
OpenOptions {
chunk_cache_bytes: 64 * 1024 * 1024,
chunk_cache_slots: 521,
filter_registry: None,
}
}
}
/// Shared, mutex-guarded map from object-header file address to its parsed header.
pub type HeaderCache = Arc<parking_lot::Mutex<HashMap<u64, Arc<ObjectHeader>>>>;
/// Handle to an open HDF5 file.
///
/// All state (storage, superblock, caches) lives in a shared
/// [`FileContext`], which derived groups and datasets also reference.
pub struct Hdf5File {
    context: Arc<FileContext>,
}
/// State shared by every handle derived from one open file.
///
/// Held behind an `Arc` so the file handle, groups, and datasets can all
/// share the same storage and caches.
pub(crate) struct FileContext {
    /// Backing byte source (file, mmap, or in-memory buffer).
    pub(crate) storage: DynStorage,
    /// Parsed superblock (offset/length sizes, root object-header address).
    pub(crate) superblock: Superblock,
    /// Chunk cache sized from `OpenOptions` (bytes and slot count).
    pub(crate) chunk_cache: Arc<ChunkCache>,
    /// Parsed object headers keyed by file address.
    pub(crate) header_cache: HeaderCache,
    /// Resolved dataset templates keyed by normalized absolute path.
    pub(crate) dataset_path_cache: Arc<parking_lot::Mutex<HashMap<String, Arc<DatasetTemplate>>>>,
    /// Registry of data filters used when decoding stored data.
    pub(crate) filter_registry: Arc<FilterRegistry>,
    /// Lazily-populated buffer holding the entire file; see `full_file_data`.
    full_file_cache: OnceLock<StorageBuffer>,
}
impl FileContext {
    /// Reads `len` bytes starting at absolute file `offset` from the backing storage.
    pub(crate) fn read_range(&self, offset: u64, len: usize) -> Result<StorageBuffer> {
        self.storage.read_range(offset, len)
    }

    /// Returns the entire file as one buffer, reading it from storage at
    /// most once and serving clones of the cached buffer afterwards.
    ///
    /// # Errors
    /// Fails if the file length does not fit in `usize` on this platform,
    /// or if the underlying read fails.
    pub(crate) fn full_file_data(&self) -> Result<StorageBuffer> {
        // Fast path: a previous call already populated the cache.
        if let Some(buffer) = self.full_file_cache.get() {
            return Ok(buffer.clone());
        }
        let len = usize::try_from(self.storage.len()).map_err(|_| {
            Error::InvalidData("file size exceeds platform usize capacity".to_string())
        })?;
        let buffer = self.storage.read_range(0, len)?;
        // Racing initializers may each read the file; `set` keeps whichever
        // value landed first, so return the cached buffer, not our local one.
        let _ = self.full_file_cache.set(buffer);
        Ok(self
            .full_file_cache
            .get()
            .expect("full-file buffer must exist after successful initialization")
            .clone())
    }

    /// Returns the parsed object header at file address `addr`, consulting
    /// the header cache first and inserting the parse result on a miss.
    pub(crate) fn get_or_parse_header(&self, addr: u64) -> Result<Arc<ObjectHeader>> {
        {
            // Scoped so the lock is released before the (possibly slow) parse.
            let cache = self.header_cache.lock();
            if let Some(hdr) = cache.get(&addr) {
                return Ok(Arc::clone(hdr));
            }
        }
        let mut hdr = ObjectHeader::parse_at_storage(
            self.storage.as_ref(),
            addr,
            self.superblock.offset_size,
            self.superblock.length_size,
        )?;
        hdr.resolve_shared_messages_storage(
            self.storage.as_ref(),
            self.superblock.offset_size,
            self.superblock.length_size,
        )?;
        let arc = Arc::new(hdr);
        // Concurrent misses may parse the same header twice; the later
        // insert overwrites with an equivalent value, which is harmless.
        let mut cache = self.header_cache.lock();
        cache.insert(addr, Arc::clone(&arc));
        Ok(arc)
    }
}
impl Hdf5File {
fn from_storage_impl(storage: DynStorage, options: OpenOptions) -> Result<Self> {
let superblock = Superblock::parse_from_storage(storage.as_ref())?;
let cache = Arc::new(ChunkCache::new(
options.chunk_cache_bytes,
options.chunk_cache_slots,
));
let registry = options.filter_registry.unwrap_or_default();
Ok(Hdf5File {
context: Arc::new(FileContext {
storage,
superblock,
chunk_cache: cache,
header_cache: Arc::new(parking_lot::Mutex::new(HashMap::new())),
dataset_path_cache: Arc::new(parking_lot::Mutex::new(HashMap::new())),
filter_registry: Arc::new(registry),
full_file_cache: OnceLock::new(),
}),
})
}
pub fn open(path: impl AsRef<Path>) -> Result<Self> {
Self::open_with_options(path, OpenOptions::default())
}
pub fn open_with_options(path: impl AsRef<Path>, options: OpenOptions) -> Result<Self> {
Self::from_storage_with_options(Arc::new(FileStorage::open(path)?), options)
}
pub fn from_bytes(data: &[u8]) -> Result<Self> {
Self::from_bytes_with_options(data, OpenOptions::default())
}
pub fn from_bytes_with_options(data: &[u8], options: OpenOptions) -> Result<Self> {
Self::from_vec_with_options(data.to_vec(), options)
}
pub fn from_vec(data: Vec<u8>) -> Result<Self> {
Self::from_vec_with_options(data, OpenOptions::default())
}
pub fn from_vec_with_options(data: Vec<u8>, options: OpenOptions) -> Result<Self> {
Self::from_storage_with_options(Arc::new(BytesStorage::new(data)), options)
}
pub fn from_mmap_with_options(mmap: Mmap, options: OpenOptions) -> Result<Self> {
Self::from_storage_with_options(Arc::new(MmapStorage::new(mmap)), options)
}
pub fn from_storage(storage: DynStorage) -> Result<Self> {
Self::from_storage_with_options(storage, OpenOptions::default())
}
pub fn from_storage_with_options(storage: DynStorage, options: OpenOptions) -> Result<Self> {
Self::from_storage_impl(storage, options)
}
pub fn superblock(&self) -> &Superblock {
&self.context.superblock
}
pub fn storage(&self) -> &dyn Storage {
self.context.storage.as_ref()
}
pub fn get_or_parse_header(&self, addr: u64) -> Result<Arc<ObjectHeader>> {
self.context.get_or_parse_header(addr)
}
pub fn root_group(&self) -> Result<Group> {
let addr = self.context.superblock.root_object_header_address()?;
Ok(Group::new(
self.context.clone(),
addr,
"/".to_string(),
addr, ))
}
pub fn dataset(&self, path: &str) -> Result<Dataset> {
let parts: Vec<&str> = path
.trim_start_matches('/')
.split('/')
.filter(|s| !s.is_empty())
.collect();
let normalized_path = format!("/{}", parts.join("/"));
if parts.is_empty() {
return Err(Error::DatasetNotFound(path.to_string()).with_context(path));
}
if let Some(template) = self
.context
.dataset_path_cache
.lock()
.get(&normalized_path)
.cloned()
{
return Ok(Dataset::from_template(self.context.clone(), template));
}
let mut group = self.root_group()?;
for &part in &parts[..parts.len() - 1] {
group = group.group(part).map_err(|e| e.with_context(path))?;
}
let dataset = group
.dataset(parts[parts.len() - 1])
.map_err(|e| e.with_context(path))?;
self.context
.dataset_path_cache
.lock()
.insert(normalized_path, dataset.template());
Ok(dataset)
}
pub fn group(&self, path: &str) -> Result<Group> {
let parts: Vec<&str> = path
.trim_start_matches('/')
.split('/')
.filter(|s| !s.is_empty())
.collect();
let mut group = self.root_group()?;
for &part in &parts {
group = group.group(part)?;
}
Ok(group)
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// The documented defaults: 64 MiB cache budget over 521 slots.
    #[test]
    fn test_open_options_default() {
        let OpenOptions {
            chunk_cache_bytes,
            chunk_cache_slots,
            ..
        } = OpenOptions::default();
        assert_eq!(chunk_cache_bytes, 67_108_864);
        assert_eq!(chunk_cache_slots, 521);
    }

    /// Bytes that are not an HDF5 superblock must be rejected at parse time.
    #[test]
    fn test_invalid_file() {
        let garbage = b"this is not an HDF5 file";
        assert!(Hdf5File::from_bytes(garbage).is_err());
    }
}