use crate::h5_utils;
use crate::num_utils::{IsNan, MyToPrimitive, Summable};
use core::f64;
use dtoa;
#[allow(unused_imports)]
use hdf5::{File, H5Type};
use hdf5_metno::{self as hdf5, Dataset};
use ndarray::{self, Array1};
use num_traits::{self, ToPrimitive, Zero};
use serde::{Deserialize, Serialize};
use std::error::Error;
use std::fmt::Display;
use std::path::PathBuf;
use std::sync::Arc;
use std::vec;
/// Histogram represented as a list of `(bin_center, count)` pairs, one entry
/// per bin, in the order the bins are emitted by `compute_histogram`.
pub type HistogramData = Vec<(f32, u32)>;
/// Outcome of analysing a single HDF5 dataset.
#[derive(Debug, Serialize, Deserialize)]
pub enum AnalysisResult {
/// Summary statistics as `(label, formatted value)` pairs, plus a histogram
/// when one could be computed (`None` when histogram generation failed).
Stats(Vec<(String, String)>, Option<HistogramData>),
/// No analysis was produced; in this file that happens when the dataset is
/// not one-dimensional, is empty, or has an unsupported element type.
NotAvailable,
/// Analysis failed; carries a message.
/// NOTE(review): not constructed anywhere in this file — presumably built by
/// callers that convert an `Err` into a serialisable value; confirm.
Failed(String),
}
/// Builds a fixed-size (30 bin) equal-width histogram of `d`.
///
/// NaN entries are ignored, both when locating the data range and when
/// counting. The bins partition `[min, max]` exactly: every bin is half-open
/// `[lo, hi)` except the last one, which is closed so that the maximum value
/// is counted in it. Each returned pair is `(bin_center, count)`.
///
/// # Errors
///
/// Returns an error when no usable range exists: the array holds no non-NaN
/// values, contains an infinity, every value is identical (zero width), or
/// `max - min` overflows `f64`.
fn compute_histogram(d: &Array1<f64>) -> Result<HistogramData, Box<dyn Error>> {
    const N_BINS: usize = 30;

    // `f64::min` / `f64::max` return the non-NaN operand, so NaNs never
    // influence the detected range.
    let (min, max) = d
        .iter()
        .fold((f64::INFINITY, f64::NEG_INFINITY), |(lo, hi), &x| {
            (f64::min(lo, x), f64::max(hi, x))
        });

    // Divide the range into exactly N_BINS bins. Dividing by N_BINS - 1 would
    // leave a trailing bin starting at `max` that only ever holds the maximum
    // and whose center lies outside the data range.
    let bin_width = (max - min) / N_BINS as f64;
    // `bin_width` can be non-finite even for finite extremes if `max - min`
    // overflows; such a width would put every value in bin 0.
    if !min.is_finite() || !max.is_finite() || !bin_width.is_finite() || bin_width <= 0.0 {
        return Err(format!(
            "Problem with histogram gen: min = {}, max = {}, bin_width = {}",
            min, max, bin_width
        )
        .into());
    }

    let mut counts = vec![0u32; N_BINS];
    for &value in d.iter().filter(|v| !v.is_nan()) {
        // `value >= min`, so the quotient is non-negative and the cast
        // truncates like `floor`. `max` maps to N_BINS and is clamped into
        // the (closed) last bin.
        let bin_index = (((value - min) / bin_width) as usize).min(N_BINS - 1);
        counts[bin_index] += 1;
    }

    Ok(counts
        .into_iter()
        .enumerate()
        .map(|(i, count)| {
            let bin_center = min + (bin_width * (i as f64)) + (bin_width / 2.0);
            (bin_center as f32, count)
        })
        .collect())
}
/// Reads the dataset `d` as a one-dimensional array of `T` and summarises it.
///
/// The resulting [`AnalysisResult::Stats`] lists, in order: the mean, the
/// number of NaN entries, the standard deviation (computed on the `f64`
/// conversion with one delta degree of freedom) and a textual preview of the
/// data. A histogram is attached when `compute_histogram` succeeds and is
/// `None` otherwise.
///
/// # Errors
///
/// Returns an error if the dataset cannot be read as a 1-D array of `T`.
fn analysis_1d<T>(d: Arc<Dataset>) -> Result<AnalysisResult, Box<dyn Error>>
where
    T: H5Type + Summable + IsNan + Clone + Display + MyToPrimitive,
{
    let values: Array1<T> = d.read_1d()?;

    // Accumulate in the wider accumulator type associated with `T`.
    let total: T::AccumulatorType = values
        .iter()
        .fold(T::AccumulatorType::zero(), |acc, x| {
            acc + x.to_owned().into()
        });
    let mean: f64 = (total.to_f64().unwrap_or(f64::NAN)) / (values.len() as f64);

    let nan_count: u32 = values.mapv(|x| x.my_is_nan() as u32).sum();

    // Elements that cannot be represented as f64 become NaN.
    let as_f64: Array1<f64> = values.mapv(|x| x.my_to_f64().unwrap_or(f64::NAN));
    let std_dev: f64 = as_f64.std(1.);

    let info: Vec<(String, String)> = vec![
        (
            "Mean".to_owned(),
            dtoa::Buffer::new().format(mean).to_string(),
        ),
        ("NaN count".to_owned(), nan_count.to_string()),
        (
            "Std".to_owned(),
            dtoa::Buffer::new().format(std_dev).to_string(),
        ),
        ("Data preview".to_owned(), values.to_string()),
    ];

    // A histogram failure is not fatal: the statistics are still returned.
    let histogram = compute_histogram(&as_f64).ok();
    Ok(AnalysisResult::Stats(info, histogram))
}
pub fn hdf5_dataset_analysis_from_path(
file_path: &PathBuf,
dataset_path: &str,
) -> Result<AnalysisResult, Box<dyn Error>> {
let file = h5_utils::open_file(file_path)?;
let dataset = file.dataset(dataset_path)?;
let d = Arc::new(dataset);
let dtype = d.dtype()?;
if d.ndim() != 1 || d.size() == 0 {
log::info!(
"Dataset is not 1D or is empty: ndim: {}, size: {}",
d.ndim(),
d.size()
);
return Ok(AnalysisResult::NotAvailable);
}
log::info!("Dataset dtype: {:?}", dtype.to_descriptor());
let result = if dtype.is::<f32>() {
analysis_1d::<f32>(d)
} else if dtype.is::<f64>() {
analysis_1d::<f64>(d)
} else if dtype.is::<i8>() {
analysis_1d::<i8>(d)
} else if dtype.is::<u8>() {
analysis_1d::<u8>(d)
} else if dtype.is::<i16>() {
analysis_1d::<i16>(d)
} else if dtype.is::<u16>() {
analysis_1d::<u16>(d)
} else if dtype.is::<i32>() {
analysis_1d::<i32>(d)
} else if dtype.is::<u32>() {
analysis_1d::<u32>(d)
} else if dtype.is::<i64>() {
analysis_1d::<i64>(d)
} else if dtype.is::<u64>() {
analysis_1d::<u64>(d)
} else if dtype.is::<bool>() {
analysis_1d::<bool>(d)
} else {
Ok(AnalysisResult::NotAvailable)
};
result
}