use hdf5::{File, H5Type, Result};
use hdf5_metno as hdf5;
use hdf5_metno::types::FixedUnicode;
use ndarray::arr2;
use ndarray::Array1;
use ndarray::Array2;
use rand::distr::{Bernoulli, Distribution};
use rand::rngs::StdRng;
use rand::SeedableRng;
use std::path::PathBuf;
/// Collects every member of `group` whose location type equals `loc_type`.
///
/// Each hit is returned as `(link name, opened location)`. Members of a
/// different type are skipped. If looking up a member's location info or
/// opening a matching member fails, the visitor returns `false`
/// (NOTE(review): presumably this makes `iter_visit_default` stop and report
/// an error — confirm against the hdf5 crate documentation).
pub fn get_all_of_type(
    group: &hdf5::Group,
    loc_type: hdf5::LocationType,
) -> hdf5::Result<Vec<(String, hdf5::Location)>> {
    group.iter_visit_default(vec![], |parent, link_name, _info, found| {
        // Lookup failure: signal the problem to the iteration.
        let info = match parent.loc_info_by_name(link_name) {
            Ok(info) => info,
            Err(_) => return false,
        };

        // Not the kind of object we are collecting: keep going.
        if info.loc_type != loc_type {
            return true;
        }

        match parent.open_by_token(info.token) {
            Ok(loc) => {
                found.push((link_name.to_string(), loc));
                true
            }
            Err(_) => false,
        }
    })
}
/// Returns the members of `group` that are groups, as `(name, group)` pairs.
///
/// # Errors
///
/// Returns an error if enumerating the members fails, or if a location
/// reported as a group cannot be downcast to `hdf5::Group`. The downcast
/// failure is propagated instead of panicking.
pub fn groups(group: &hdf5::Group) -> hdf5::Result<Vec<(String, hdf5::Group)>> {
    get_all_of_type(group, hdf5::LocationType::Group)?
        .into_iter()
        .map(|(name, loc)| loc.as_group().map(|child| (name, child)))
        .collect()
}
/// Returns the members of `group` that are datasets, as `(name, dataset)` pairs.
///
/// # Errors
///
/// Returns an error if enumerating the members fails, or if a location
/// reported as a dataset cannot be downcast to `hdf5::Dataset`. The downcast
/// failure is propagated instead of panicking.
pub fn datasets(group: &hdf5::Group) -> hdf5::Result<Vec<(String, hdf5::Dataset)>> {
    get_all_of_type(group, hdf5::LocationType::Dataset)?
        .into_iter()
        .map(|(name, loc)| loc.as_dataset().map(|dataset| (name, dataset)))
        .collect()
}
/// Renders an HDF5 type descriptor as human-readable text.
///
/// Non-compound descriptors are rendered through their `Display`
/// implementation. Compound descriptors become a multi-line
/// `compound(...)` listing with the size and one `name: type` entry per
/// field; nested compound fields are rendered recursively, and every line
/// after the first of a nested rendering is prefixed with the field indent.
pub fn type_descriptor_to_text(dt: hdf5::types::TypeDescriptor) -> String {
    // Indent placed before each field entry.
    const FIELD_INDENT: &str = " ";
    // Separator between the lines of a nested rendering: newline + field indent.
    const CONTINUATION: &str = "\n ";

    let compound = match dt {
        hdf5::types::TypeDescriptor::Compound(compound) => compound,
        other => return other.to_string(),
    };

    let mut text = String::from("compound(\n size: ");
    text.push_str(&compound.size.to_string());
    text.push_str(",\n fields:\n");

    for field in compound.fields {
        let nested = type_descriptor_to_text(field.ty);
        // The first line follows "name: " directly; later lines get the indent.
        let nested = nested.lines().collect::<Vec<_>>().join(CONTINUATION);

        text.push_str(FIELD_INDENT);
        text.push_str(&field.name);
        text.push_str(": ");
        text.push_str(&nested);
        text.push('\n');
    }

    text.push(')');
    text
}
/// Color tag stored in each [`Pixel`] and in the `colors` attribute written
/// by `generate_dummy_core`.
///
/// The enum is `#[repr(u8)]` with explicit discriminants 1, 2 and 3, and
/// derives `H5Type` so it can be used as an HDF5 element type.
#[derive(H5Type, Clone, PartialEq, Debug)] #[repr(u8)]
pub enum Color {
// Discriminants start at 1; there is no variant with value 0.
R = 1,
G = 2,
B = 3,
}
/// Record type used by `generate_dummy_core` for the `pixels` dataset.
///
/// The struct is `#[repr(C)]` and derives `H5Type`. Besides the two `i64`
/// coordinates and a [`Color`], it carries fifty `i32` fields
/// (`field1` .. `field50`).
/// NOTE(review): the extra fields look like filler to produce a wide
/// compound type for testing — confirm the intent.
#[derive(H5Type, Clone, PartialEq, Debug)] #[repr(C)]
pub struct Pixel {
// Values supplied by the caller of `Pixel::new`.
x: i64,
y: i64,
color: Color,
// `field1` through `field50` are all set to 0 by `Pixel::new`.
field1: i32,
field2: i32,
field3: i32,
field4: i32,
field5: i32,
field6: i32,
field7: i32,
field8: i32,
field9: i32,
field10: i32,
field11: i32,
field12: i32,
field13: i32,
field14: i32,
field15: i32,
field16: i32,
field17: i32,
field18: i32,
field19: i32,
field20: i32,
field21: i32,
field22: i32,
field23: i32,
field24: i32,
field25: i32,
field26: i32,
field27: i32,
field28: i32,
field29: i32,
field30: i32,
field31: i32,
field32: i32,
field33: i32,
field34: i32,
field35: i32,
field36: i32,
field37: i32,
field38: i32,
field39: i32,
field40: i32,
field41: i32,
field42: i32,
field43: i32,
field44: i32,
field45: i32,
field46: i32,
field47: i32,
field48: i32,
field49: i32,
field50: i32,
}
impl Pixel {
pub fn new(x: i64, y: i64, color: Color) -> Self {
Self {
x,
y,
color,
field1: 0,
field2: 0,
field3: 0,
field4: 0,
field5: 0,
field6: 0,
field7: 0,
field8: 0,
field9: 0,
field10: 0,
field11: 0,
field12: 0,
field13: 0,
field14: 0,
field15: 0,
field16: 0,
field17: 0,
field18: 0,
field19: 0,
field20: 0,
field21: 0,
field22: 0,
field23: 0,
field24: 0,
field25: 0,
field26: 0,
field27: 0,
field28: 0,
field29: 0,
field30: 0,
field31: 0,
field32: 0,
field33: 0,
field34: 0,
field35: 0,
field36: 0,
field37: 0,
field38: 0,
field39: 0,
field40: 0,
field41: 0,
field42: 0,
field43: 0,
field44: 0,
field45: 0,
field46: 0,
field47: 0,
field48: 0,
field49: 0,
field50: 0,
}
}
}
/// Creates `dummy.h5` in the current working directory and fills it with the
/// fixture content produced by `generate_dummy_core`.
///
/// # Errors
///
/// Returns any error from creating the file or from writing its content.
#[allow(dead_code)]
pub fn generate_dummy_file() -> Result<()> {
    File::create("dummy.h5").and_then(|dummy| generate_dummy_core(&dummy))
}
/// Creates `dummy_split.h5` with split file-access options (`-m.h5` and
/// `-r.h5` extensions) and fills it with the fixture content produced by
/// `generate_dummy_core`.
///
/// # Errors
///
/// Returns any error from creating the file or from writing its content.
#[allow(dead_code)]
pub fn generate_dummy_split_file() -> Result<()> {
    File::with_options()
        .with_fapl(|fapl| fapl.split_options("-m.h5", "-r.h5"))
        .create("dummy_split.h5")
        .and_then(|dummy| generate_dummy_core(&dummy))
}
/// Fills `file` with the fixture content shared by the dummy-file generators.
///
/// Written, in this order:
/// - `sums_of_bernoulli`: 1000 `f32` values, each the sum of ten Bernoulli(0.5) draws.
/// - `variable`: a chunked, deflate-compressed `f32` dataset resized to
///   `10 x 100 x 100`; only slices 0 and 1 are written.
/// - `group1/bool_ds`, `group1/large_num_ds`, `group1/small_num_ds`.
/// - `group1/large_rand`: 1 000 000 values in `1_000_000_001..i64::MAX`
///   followed by 200 000 values in `0..99_999`.
/// - `group1/string_dataset`: a scalar `FixedUnicode<5>`.
/// - `group1/pixels`: a 3x2 array of [`Pixel`] with a `colors` attribute.
/// - `group1/group2/qweqwe` and `group3/dataset_0` .. `group3/dataset_1999`.
///
/// All random content is drawn from one `StdRng` seeded with 42, so repeated
/// runs draw the same sequence of values.
///
/// # Errors
///
/// Returns the first error reported by any HDF5 create/write/resize/read call.
///
/// # Panics
///
/// Panics if the chunk layout or shape of `variable` read back from the file
/// does not match what was requested.
fn generate_dummy_core(file: &File) -> Result<()> {
    let mut rng = StdRng::seed_from_u64(42);

    // --- sums_of_bernoulli -------------------------------------------------
    let bernoulli = Bernoulli::new(0.5).unwrap();
    let sums_arr: Array1<f32> = Array1::from_vec(
        (0..1000)
            .map(|_| {
                (0..10)
                    .map(|_| bernoulli.sample(&mut rng) as u8 as f32)
                    .sum::<f32>()
            })
            .collect(),
    );
    let sums_ds = file
        .new_dataset::<f32>()
        .shape(1000)
        .create("sums_of_bernoulli")?;
    sums_ds.write(&sums_arr)?;

    // --- variable: resizable along the first axis --------------------------
    let (ny, nx) = (100, 100);
    let arr = Array2::from_shape_fn((ny, nx), |(j, i)| (1000 * j + i) as f32);
    let ds = file
        .new_dataset::<f32>()
        .chunk((1, ny, nx))
        .shape((1.., ny, nx))
        .deflate(3)
        .create("variable")?;
    ds.write_slice(&arr, (0, .., ..))?;
    ds.resize((10, ny, nx))?;
    ds.write_slice(&arr, (1, .., ..))?;
    let chunksize = ds.chunk().unwrap();
    assert_eq!(chunksize, &[1, ny, nx]);
    let shape = ds.shape();
    assert_eq!(shape, &[10, ny, nx]);
    // Read every slice back (including the ones never written) to make sure
    // the whole extent is readable.
    for k in 0..shape[0] {
        let _arr: Array2<f32> = ds.read_slice((k, .., ..))?;
    }

    // --- group1: small typed datasets --------------------------------------
    let group1 = file.create_group("group1")?;
    let group1_d1 = group1
        .new_dataset::<bool>()
        .shape((10,))
        .create("bool_ds")?;
    group1_d1.write(&vec![
        true, false, true, true, false, true, true, false, true, false,
    ])?;
    let group1_d1 = group1
        .new_dataset::<i64>()
        .shape((4,))
        .create("large_num_ds")?;
    group1_d1.write(&vec![
        1029830192830923098i64,
        9283928390909203,
        187309128309182309,
        2832098095820958,
    ])?;
    let group1_d1 = group1
        .new_dataset::<f64>()
        .shape((4,))
        .create("small_num_ds")?;
    group1_d1.write(&vec![
        0.00000000001283798723,
        0.0000000000000023092839,
        0.00000000000083092839,
        0.0000000000000023092839,
    ])?;

    // --- group1/large_rand -------------------------------------------------
    let large_ds_len_1st_half = 1_000_000;
    let large_ds_len_2nd_half = 200_000;
    let large_ds = group1
        .new_dataset::<i64>()
        .shape((large_ds_len_1st_half + large_ds_len_2nd_half,))
        .create("large_rand")?;
    // Sample from the seeded `rng` (not the global thread RNG) so this
    // dataset is as reproducible as `sums_of_bernoulli`. Both ranges are
    // half-open and constant, so constructing them cannot fail.
    let high_values = rand::distr::Uniform::<i64>::new(1_000_000_001, i64::MAX)
        .expect("constant range 1_000_000_001..i64::MAX is non-empty");
    let low_values = rand::distr::Uniform::<i64>::new(0, 99_999)
        .expect("constant range 0..99_999 is non-empty");
    let random_numbers: Vec<i64> = (0..(large_ds_len_1st_half + large_ds_len_2nd_half))
        .map(|x| {
            if x < large_ds_len_1st_half {
                high_values.sample(&mut rng)
            } else {
                low_values.sample(&mut rng)
            }
        })
        .collect();
    large_ds.write(&random_numbers)?;

    // --- group1/string_dataset ---------------------------------------------
    let dataset: hdf5::Dataset = group1
        .new_dataset::<FixedUnicode<5>>()
        .create("string_dataset")?;
    // SAFETY: "asdfg" is five bytes of ASCII, which fits the fixed capacity
    // of 5 that `from_str_unchecked` (presumably) does not verify itself.
    dataset.write_scalar(&unsafe { FixedUnicode::<5>::from_str_unchecked("asdfg") })?;

    // --- group1/pixels with a `colors` attribute ---------------------------
    use Color::*;
    let builder = group1.new_dataset_builder();
    let ds = builder
        .with_data(&arr2(&[
            [Pixel::new(1, 2, R), Pixel::new(2, 3, B)],
            [Pixel::new(3, 4, G), Pixel::new(4, 5, R)],
            [Pixel::new(5, 6, B), Pixel::new(6, 7, G)],
        ]))
        .create("pixels")?;
    let attr = ds.new_attr::<Color>().shape([3]).create("colors")?;
    attr.write(&[R, G, B])?;

    // --- group1/group2 -----------------------------------------------------
    let group2 = group1.create_group("group2")?;
    let group2_d1 = group2
        .new_dataset::<i32>()
        .shape((ny, nx))
        .create("qweqwe")?;
    // NOTE(review): `arr` holds `f32` values but the dataset is declared as
    // `i32`; this relies on the library converting on write — confirm intended.
    group2_d1.write(&arr)?;

    // --- group3: many tiny datasets ----------------------------------------
    let arr = Array1::from_vec(vec![1, 2, 3]);
    let group3 = file.create_group("group3")?;
    for i in 0..2000 {
        let dataset = group3
            .new_dataset::<i32>()
            .shape(3)
            .create(format!("dataset_{}", i).as_str())?;
        dataset.write(&arr)?;
    }
    Ok(())
}
/// Opens an HDF5 file, trying several access strategies in turn.
///
/// 1. Open `file_path` with the `sec2` driver.
/// 2. Open `file_path` with split options (`-m.h5` / `-r.h5`).
/// 3. If the file name ends in `-m.h5` or `-r.h5`, strip that suffix and
///    retry the split open on the resulting base path. When the name has no
///    such suffix this step is skipped, because it would repeat step 2 on
///    the same path.
///
/// # Errors
///
/// If every attempt fails, returns `"File path doesn't exist: ..."` when
/// `file_path` does not exist, and `"Couldn't open file"` otherwise.
pub fn open_file(file_path: &PathBuf) -> Result<hdf5::File> {
    const SPLIT_META_EXT: &str = "-m.h5";
    const SPLIT_RAW_EXT: &str = "-r.h5";

    /// Opens `path` using the split file-access options.
    fn open_split(path: &std::path::Path) -> Result<hdf5::File> {
        hdf5::File::with_options()
            .with_fapl(|p| p.split_options(SPLIT_META_EXT, SPLIT_RAW_EXT))
            .open(path)
    }

    if let Ok(file) = hdf5::File::with_options()
        .with_fapl(|p| p.sec2())
        .open(file_path)
    {
        return Ok(file);
    }

    if let Ok(split_file) = open_split(file_path) {
        return Ok(split_file);
    }

    // The caller may have pointed at one half of a split pair; derive the
    // base name by dropping the split suffix. Non-UTF-8 names are left alone.
    let base_name = file_path
        .file_name()
        .and_then(|name| name.to_str())
        .and_then(|name| {
            name.strip_suffix(SPLIT_META_EXT)
                .or_else(|| name.strip_suffix(SPLIT_RAW_EXT))
        });
    if let Some(base_name) = base_name {
        let mut clean_path = file_path.clone();
        clean_path.set_file_name(base_name);
        if let Ok(split_file) = open_split(&clean_path) {
            return Ok(split_file);
        }
    }

    if !file_path.exists() {
        return Err(format!("File path doesn't exist: {file_path:?}").into());
    }
    Err("Couldn't open file".into())
}
/// Creates `dummy_large.h5` containing two datasets of roughly 8 GB each.
///
/// - `large_float_dataset`: a `32768 x 65536` `f32` array, chunked
///   `1024 x 1024`, deflate level 1, written 1024 rows at a time.
/// - `large_int_dataset`: `1_073_741_824` `i64` values, chunked by
///   `1_048_576`, deflate level 1, written one chunk at a time.
///
/// Progress is printed to stdout while writing.
///
/// # Errors
///
/// Returns the first error reported by file creation, dataset creation or a
/// slice write.
#[allow(dead_code)]
pub fn generate_large_file() -> Result<()> {
    // Single source of truth for the file name so the log messages cannot
    // drift from the file that is actually created.
    const FILE_NAME: &str = "dummy_large.h5";
    // Chunk edge of the 2-D dataset; also the number of rows written per step.
    const FLOAT_CHUNK_ROWS: usize = 1024;
    const FLOAT_CHUNK_COLS: usize = 1024;
    // Chunk length of the 1-D dataset; also the number of elements per write.
    const INT_CHUNK_LEN: usize = 1_048_576;

    let file = File::create(FILE_NAME)?;
    println!("Creating {FILE_NAME} with two 8GB datasets...");

    // --- first dataset: 2-D f32 --------------------------------------------
    let rows1: usize = 32768;
    let cols1: usize = 65536;
    println!(
        "Creating first dataset: {}x{} f32 array (~8GB)",
        rows1, cols1
    );
    let ds1 = file
        .new_dataset::<f32>()
        .chunk((FLOAT_CHUNK_ROWS, FLOAT_CHUNK_COLS))
        .shape((rows1, cols1))
        .deflate(1)
        .create("large_float_dataset")?;
    for start_row in (0..rows1).step_by(FLOAT_CHUNK_ROWS) {
        let end_row = (start_row + FLOAT_CHUNK_ROWS).min(rows1);
        let chunk_height = end_row - start_row;
        let chunk_data = Array2::from_shape_fn((chunk_height, cols1), |(i, j)| {
            ((start_row + i) as f32 * 0.001 + j as f32 * 0.0001).sin()
        });
        ds1.write_slice(&chunk_data, (start_row..end_row, ..))?;
        // Report progress every tenth row block.
        if start_row % (FLOAT_CHUNK_ROWS * 10) == 0 {
            println!("Written {}/{} rows for first dataset", start_row, rows1);
        }
    }

    // --- second dataset: 1-D i64 -------------------------------------------
    let len2: usize = 1_073_741_824;
    println!("Creating second dataset: {} i64 elements (~8GB)", len2);
    let ds2 = file
        .new_dataset::<i64>()
        .chunk(INT_CHUNK_LEN)
        .shape(len2)
        .deflate(1)
        .create("large_int_dataset")?;
    for start in (0..len2).step_by(INT_CHUNK_LEN) {
        let end = (start + INT_CHUNK_LEN).min(len2);
        let chunk_data: Vec<i64> = (start..end)
            .map(|idx| {
                // Every 1000th element is a large multiple of 1e9; the rest
                // follow a simple wrapping linear formula.
                if idx % 1000 == 0 {
                    idx as i64 * 1_000_000_000
                } else {
                    (idx as i64).wrapping_mul(31).wrapping_add(17)
                }
            })
            .collect();
        ds2.write_slice(&chunk_data, start..end)?;
        // Report progress every hundredth chunk.
        if start % (INT_CHUNK_LEN * 100) == 0 {
            println!("Written {}/{} elements for second dataset", start, len2);
        }
    }

    println!("Successfully created {FILE_NAME} with two ~8GB datasets");
    Ok(())
}