use crate::{FloatElement, Tensor, TensorElement};
use scirs2_core::ndarray::{Array1, Array2};
use scirs2_core::numeric::ToPrimitive;
use std::collections::HashMap;
use torsh_core::error::{Result, TorshError};
/// Statistical analysis engine operating on `Tensor`s via SciRS2 `ndarray`
/// conversions (descriptive stats, correlation, t-tests, OLS regression,
/// distribution fitting).
pub struct SciRS2StatsProcessor {
/// Configuration shared by all analysis methods (confidence level, etc.).
config: StatsConfig,
}
/// Tuning knobs for [`SciRS2StatsProcessor`] analyses.
#[derive(Debug, Clone)]
pub struct StatsConfig {
/// Confidence level for interval estimates, e.g. `0.95` for 95% CIs.
pub confidence_level: f64,
/// How missing observations are handled before analysis.
/// NOTE(review): not consumed by the methods in this file — confirm usage upstream.
pub missing_value_strategy: MissingValueStrategy,
/// Number of bootstrap resamples to draw when bootstrapping is used.
pub bootstrap_samples: usize,
/// Optional RNG seed for reproducible resampling; `None` means unseeded.
pub random_seed: Option<u64>,
/// Absolute tolerance intended for numerical comparisons.
pub numerical_precision: f64,
}
/// Strategy for handling missing values prior to analysis.
///
/// NOTE(review): no method in this file applies the strategy yet — confirm
/// where it takes effect before relying on these semantics.
#[derive(Debug, Clone, Copy)]
pub enum MissingValueStrategy {
/// Drop every observation (row) containing any missing value.
ListwiseDeletion,
/// Replace missing values with the variable's mean.
MeanImputation,
/// Replace missing values with the variable's median.
MedianImputation,
/// Replace missing values with the most frequent value.
ModeImputation,
/// Carry the last observed value forward.
ForwardFill,
/// Carry the next observed value backward.
BackwardFill,
}
impl Default for StatsConfig {
/// Defaults: 95% confidence, listwise deletion, 1000 bootstrap resamples,
/// unseeded RNG, and a 1e-10 numerical tolerance.
fn default() -> Self {
Self {
confidence_level: 0.95,
missing_value_strategy: MissingValueStrategy::ListwiseDeletion,
bootstrap_samples: 1000,
random_seed: None,
numerical_precision: 1e-10,
}
}
}
impl SciRS2StatsProcessor {
/// Create a processor with an explicit configuration.
pub fn new(config: StatsConfig) -> Self {
Self { config }
}
/// Create a processor with [`StatsConfig::default`] settings.
///
/// NOTE(review): an inherent `default` shadows the `Default` trait idiom
/// (clippy `should_implement_trait`); consider `impl Default` instead.
pub fn default() -> Self {
Self::new(StatsConfig::default())
}
/// Borrow the active configuration.
pub fn config(&self) -> &StatsConfig {
&self.config
}
pub fn describe<T: FloatElement>(&self, tensor: &Tensor<T>) -> Result<DescriptiveStats<T>> {
let data = self.tensor_to_array1(tensor)?;
let n = data.len() as f64;
let mean = data
.iter()
.map(|&x| ToPrimitive::to_f64(&x).expect("f64 conversion should succeed"))
.sum::<f64>()
/ n;
let variance = data
.iter()
.map(|&x| {
let diff = ToPrimitive::to_f64(&x).expect("f64 conversion should succeed") - mean;
diff * diff
})
.sum::<f64>()
/ (n - 1.0);
let std_dev = variance.sqrt();
let skewness = if std_dev > 0.0 {
data.iter()
.map(|&x| {
let z = (ToPrimitive::to_f64(&x).expect("f64 conversion should succeed")
- mean)
/ std_dev;
z * z * z
})
.sum::<f64>()
/ n
} else {
0.0
};
let kurtosis = if std_dev > 0.0 {
data.iter()
.map(|&x| {
let z = (ToPrimitive::to_f64(&x).expect("f64 conversion should succeed")
- mean)
/ std_dev;
z * z * z * z
})
.sum::<f64>()
/ n
- 3.0 } else {
0.0
};
let mut sorted_data: Vec<f64> = data
.iter()
.map(|&x| ToPrimitive::to_f64(&x).expect("f64 conversion should succeed"))
.collect();
sorted_data.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
let q25 = self.percentile(&sorted_data, 25.0);
let median = self.percentile(&sorted_data, 50.0);
let q75 = self.percentile(&sorted_data, 75.0);
Ok(DescriptiveStats {
count: n as usize,
mean: T::from_f64(mean).expect("f64 conversion should succeed"),
std_dev: T::from_f64(std_dev).expect("f64 conversion should succeed"),
variance: T::from_f64(variance).expect("f64 conversion should succeed"),
skewness: T::from_f64(skewness).expect("f64 conversion should succeed"),
kurtosis: T::from_f64(kurtosis).expect("f64 conversion should succeed"),
min: T::from_f64(sorted_data[0]).expect("f64 conversion should succeed"),
max: T::from_f64(sorted_data[sorted_data.len() - 1])
.expect("f64 conversion should succeed"),
q25: T::from_f64(q25).expect("f64 conversion should succeed"),
median: T::from_f64(median).expect("f64 conversion should succeed"),
q75: T::from_f64(q75).expect("f64 conversion should succeed"),
iqr: T::from_f64(q75 - q25).expect("f64 conversion should succeed"),
})
}
/// Compute the Pearson correlation matrix between the columns of a 2-D tensor.
///
/// # Errors
/// Returns an error if the tensor is not 2-D or the result cannot be
/// converted back into a tensor.
pub fn correlation_matrix<T: FloatElement>(
    &self,
    tensor: &Tensor<T>,
) -> Result<CorrelationResult<T>> {
    let data = self.tensor_to_array2(tensor)?;
    let (n_rows, n_cols) = data.dim();
    // Convert each column to f64 exactly once. The previous version rebuilt
    // both columns inside the doubly-nested loop, redoing O(n_cols) element
    // conversions for every matrix entry.
    let columns: Vec<Vec<f64>> = (0..n_cols)
        .map(|col| {
            (0..n_rows)
                .map(|row| {
                    ToPrimitive::to_f64(&data[[row, col]])
                        .expect("f64 conversion should succeed")
                })
                .collect()
        })
        .collect();
    let mut correlation_matrix = Array2::zeros((n_cols, n_cols));
    // Pearson correlation is symmetric, so compute the upper triangle
    // (including the diagonal) and mirror it into the lower triangle.
    for i in 0..n_cols {
        for j in i..n_cols {
            let correlation = self.pearson_correlation(&columns[i], &columns[j]);
            correlation_matrix[[i, j]] = correlation;
            correlation_matrix[[j, i]] = correlation;
        }
    }
    let corr_tensor = self.array2_to_tensor(&correlation_matrix)?;
    Ok(CorrelationResult {
        correlation_matrix: corr_tensor,
        method: CorrelationMethod::Pearson,
        // Significance testing is not implemented yet.
        significant_pairs: Vec::new(),
    })
}
pub fn one_sample_ttest<T: FloatElement>(
&self,
data: &Tensor<T>,
expected_mean: T,
) -> Result<TTestResult<T>> {
let values = self.tensor_to_array1(data)?;
let n = values.len() as f64;
let expected = ToPrimitive::to_f64(&expected_mean).expect("f64 conversion should succeed");
let sample_mean = values
.iter()
.map(|&x| ToPrimitive::to_f64(&x).expect("f64 conversion should succeed"))
.sum::<f64>()
/ n;
let sample_var = values
.iter()
.map(|&x| {
let diff =
ToPrimitive::to_f64(&x).expect("f64 conversion should succeed") - sample_mean;
diff * diff
})
.sum::<f64>()
/ (n - 1.0);
let standard_error = (sample_var / n).sqrt();
let t_statistic = (sample_mean - expected) / standard_error;
let degrees_of_freedom = n - 1.0;
let p_value = 2.0 * (1.0 - self.t_cdf(t_statistic.abs(), degrees_of_freedom));
Ok(TTestResult {
t_statistic: T::from_f64(t_statistic).expect("f64 conversion should succeed"),
p_value: T::from_f64(p_value).expect("f64 conversion should succeed"),
degrees_of_freedom: T::from_f64(degrees_of_freedom)
.expect("f64 conversion should succeed"),
confidence_interval: self.compute_confidence_interval(
sample_mean,
standard_error,
degrees_of_freedom,
),
effect_size: T::from_f64((sample_mean - expected) / sample_var.sqrt())
.expect("f64 conversion should succeed"),
})
}
pub fn two_sample_ttest<T: FloatElement>(
&self,
group1: &Tensor<T>,
group2: &Tensor<T>,
equal_variance: bool,
) -> Result<TTestResult<T>> {
let data1 = self.tensor_to_array1(group1)?;
let data2 = self.tensor_to_array1(group2)?;
let n1 = data1.len() as f64;
let n2 = data2.len() as f64;
let mean1 = data1
.iter()
.map(|&x| ToPrimitive::to_f64(&x).expect("f64 conversion should succeed"))
.sum::<f64>()
/ n1;
let mean2 = data2
.iter()
.map(|&x| ToPrimitive::to_f64(&x).expect("f64 conversion should succeed"))
.sum::<f64>()
/ n2;
let var1 = data1
.iter()
.map(|&x| {
let diff = ToPrimitive::to_f64(&x).expect("f64 conversion should succeed") - mean1;
diff * diff
})
.sum::<f64>()
/ (n1 - 1.0);
let var2 = data2
.iter()
.map(|&x| {
let diff = ToPrimitive::to_f64(&x).expect("f64 conversion should succeed") - mean2;
diff * diff
})
.sum::<f64>()
/ (n2 - 1.0);
let (t_statistic, degrees_of_freedom, standard_error) = if equal_variance {
let pooled_var = ((n1 - 1.0) * var1 + (n2 - 1.0) * var2) / (n1 + n2 - 2.0);
let se = (pooled_var * (1.0 / n1 + 1.0 / n2)).sqrt();
let t = (mean1 - mean2) / se;
let df = n1 + n2 - 2.0;
(t, df, se)
} else {
let se = (var1 / n1 + var2 / n2).sqrt();
let t = (mean1 - mean2) / se;
let df = (var1 / n1 + var2 / n2).powi(2)
/ ((var1 / n1).powi(2) / (n1 - 1.0) + (var2 / n2).powi(2) / (n2 - 1.0));
(t, df, se)
};
let p_value = 2.0 * (1.0 - self.t_cdf(t_statistic.abs(), degrees_of_freedom));
Ok(TTestResult {
t_statistic: T::from_f64(t_statistic).expect("f64 conversion should succeed"),
p_value: T::from_f64(p_value).expect("f64 conversion should succeed"),
degrees_of_freedom: T::from_f64(degrees_of_freedom)
.expect("f64 conversion should succeed"),
confidence_interval: self.compute_confidence_interval(
mean1 - mean2,
standard_error,
degrees_of_freedom,
),
effect_size: T::from_f64((mean1 - mean2) / ((var1 + var2) / 2.0).sqrt())
.expect("f64 conversion should succeed"), })
}
/// Ordinary least squares simple linear regression of `y` on `x`.
///
/// `coefficients` and `standard_errors` are ordered `[intercept, slope]`.
///
/// # Errors
/// Returns an error if either tensor is not 1-D, the lengths differ, there
/// are fewer than 2 points, or `x` is constant (the slope is undefined —
/// previously a silent NaN via division by zero).
pub fn linear_regression<T: FloatElement>(
    &self,
    x: &Tensor<T>,
    y: &Tensor<T>,
) -> Result<RegressionResult<T>> {
    let x_data = self.tensor_to_array1(x)?;
    let y_data = self.tensor_to_array1(y)?;
    if x_data.len() != y_data.len() {
        return Err(TorshError::RuntimeError(
            "X and Y must have same length".to_string(),
        ));
    }
    if x_data.len() < 2 {
        return Err(TorshError::RuntimeError(
            "linear regression requires at least 2 points".to_string(),
        ));
    }
    let n = x_data.len() as f64;
    let x_vals: Vec<f64> = x_data
        .iter()
        .map(|&v| ToPrimitive::to_f64(&v).expect("f64 conversion should succeed"))
        .collect();
    let y_vals: Vec<f64> = y_data
        .iter()
        .map(|&v| ToPrimitive::to_f64(&v).expect("f64 conversion should succeed"))
        .collect();
    let x_mean = x_vals.iter().sum::<f64>() / n;
    let y_mean = y_vals.iter().sum::<f64>() / n;
    // Closed-form OLS: slope = Sxy / Sxx, intercept = ȳ - slope·x̄.
    let numerator: f64 = x_vals
        .iter()
        .zip(y_vals.iter())
        .map(|(&x, &y)| (x - x_mean) * (y - y_mean))
        .sum();
    let denominator: f64 = x_vals.iter().map(|&x| (x - x_mean) * (x - x_mean)).sum();
    if denominator == 0.0 {
        return Err(TorshError::RuntimeError(
            "X values are constant; regression slope is undefined".to_string(),
        ));
    }
    let slope = numerator / denominator;
    let intercept = y_mean - slope * x_mean;
    let y_pred: Vec<f64> = x_vals.iter().map(|&x| intercept + slope * x).collect();
    let ss_res: f64 = y_vals
        .iter()
        .zip(y_pred.iter())
        .map(|(&y, &pred)| (y - pred) * (y - pred))
        .sum();
    let ss_tot: f64 = y_vals.iter().map(|&y| (y - y_mean) * (y - y_mean)).sum();
    // Constant y is fitted exactly by the zero-slope line, so define R² = 1
    // instead of the 0/0 NaN the naive formula yields.
    let r_squared = if ss_tot > 0.0 { 1.0 - ss_res / ss_tot } else { 1.0 };
    // Residual mean square with n - 2 degrees of freedom; with exactly 2
    // points the fit is exact and the standard errors are infinite.
    let mse = ss_res / (n - 2.0);
    let slope_se = (mse / denominator).sqrt();
    let intercept_se = (mse * (1.0 / n + x_mean * x_mean / denominator)).sqrt();
    Ok(RegressionResult {
        coefficients: vec![
            T::from_f64(intercept).expect("f64 conversion should succeed"),
            T::from_f64(slope).expect("f64 conversion should succeed"),
        ],
        standard_errors: vec![
            T::from_f64(intercept_se).expect("f64 conversion should succeed"),
            T::from_f64(slope_se).expect("f64 conversion should succeed"),
        ],
        r_squared: T::from_f64(r_squared).expect("f64 conversion should succeed"),
        adjusted_r_squared: T::from_f64(1.0 - (1.0 - r_squared) * (n - 1.0) / (n - 2.0))
            .expect("f64 conversion should succeed"),
        f_statistic: T::from_f64(r_squared * (n - 2.0) / (1.0 - r_squared))
            .expect("f64 conversion should succeed"),
        residuals: {
            let residuals_data: Vec<T> = y_vals
                .iter()
                .zip(y_pred.iter())
                .map(|(&y, &pred)| {
                    T::from_f64(y - pred).expect("f64 conversion should succeed")
                })
                .collect();
            let len = residuals_data.len();
            Tensor::from_vec(residuals_data, &[len])?
        },
    })
}
pub fn fit_normal_distribution<T: FloatElement>(
&self,
data: &Tensor<T>,
) -> Result<DistributionFit<T>> {
let values = self.tensor_to_array1(data)?;
let n = values.len() as f64;
let mean = values
.iter()
.map(|&x| ToPrimitive::to_f64(&x).expect("f64 conversion should succeed"))
.sum::<f64>()
/ n;
let variance = values
.iter()
.map(|&x| {
let diff = ToPrimitive::to_f64(&x).expect("f64 conversion should succeed") - mean;
diff * diff
})
.sum::<f64>()
/ (n - 1.0);
let std_dev = variance.sqrt();
let log_likelihood = values
.iter()
.map(|&x| {
let z = (ToPrimitive::to_f64(&x).expect("f64 conversion should succeed") - mean)
/ std_dev;
-0.5 * (2.0 * std::f64::consts::PI).ln() - std_dev.ln() - 0.5 * z * z
})
.sum::<f64>();
Ok(DistributionFit {
distribution_type: DistributionType::Normal,
parameters: HashMap::from([
(
"mean".to_string(),
T::from_f64(mean).expect("f64 conversion should succeed"),
),
(
"std_dev".to_string(),
T::from_f64(std_dev).expect("f64 conversion should succeed"),
),
]),
log_likelihood: T::from_f64(log_likelihood).expect("f64 conversion should succeed"),
aic: T::from_f64(-2.0 * log_likelihood + 2.0 * 2.0)
.expect("f64 conversion should succeed"), bic: T::from_f64(-2.0 * log_likelihood + 2.0 * n.ln())
.expect("f64 conversion should succeed"),
goodness_of_fit: {
let values_f64: Vec<f64> = values
.iter()
.map(|&x| ToPrimitive::to_f64(&x).expect("f64 conversion should succeed"))
.collect();
self.kolmogorov_smirnov_test(&values_f64, mean, std_dev)
},
})
}
/// Materialize a 1-D tensor as an `ndarray::Array1`.
///
/// Fails with a runtime error when the tensor's rank is not exactly 1 or
/// when the ndarray reshape rejects the buffer.
fn tensor_to_array1<T: TensorElement>(&self, tensor: &Tensor<T>) -> Result<Array1<T>> {
    let data = tensor.to_vec()?;
    let shape = tensor.shape();
    let rank = shape.dims().len();
    if rank != 1 {
        return Err(TorshError::RuntimeError("Expected 1D tensor".to_string()));
    }
    let len = shape.dims()[0];
    Array1::from_vec(data)
        .into_shape_with_order((len,))
        .map_err(|err| TorshError::RuntimeError(format!("Array conversion failed: {}", err)))
}
/// Materialize a 2-D tensor as an `ndarray::Array2`.
///
/// Fails with a runtime error when the tensor's rank is not exactly 2 or
/// when the buffer length does not match the declared shape.
fn tensor_to_array2<T: TensorElement>(&self, tensor: &Tensor<T>) -> Result<Array2<T>> {
    let data = tensor.to_vec()?;
    let shape = tensor.shape();
    let rank = shape.dims().len();
    if rank != 2 {
        return Err(TorshError::RuntimeError("Expected 2D tensor".to_string()));
    }
    let (rows, cols) = (shape.dims()[0], shape.dims()[1]);
    Array2::from_shape_vec((rows, cols), data)
        .map_err(|err| TorshError::RuntimeError(format!("Array conversion failed: {}", err)))
}
/// Convert an `Array2<f64>` back into a row-major 2-D tensor of element
/// type `T`, panicking (via `expect`) if any value is unrepresentable in `T`.
fn array2_to_tensor<T: TensorElement>(&self, array: &Array2<f64>) -> Result<Tensor<T>> {
    let (rows, cols) = (array.nrows(), array.ncols());
    let mut data: Vec<T> = Vec::with_capacity(rows * cols);
    for &value in array.iter() {
        data.push(T::from_f64(value).expect("f64 conversion should succeed"));
    }
    Tensor::from_vec(data, &[rows, cols])
}
/// Linear-interpolation percentile of pre-sorted ascending data (the R-7 /
/// NumPy-default method). `percentile` is expected in `[0, 100]`.
///
/// Returns NaN for an empty slice — the previous version computed `n - 1`
/// on a `usize` and underflowed (panic in debug builds, huge index and
/// panic on indexing in release builds).
fn percentile(&self, sorted_data: &[f64], percentile: f64) -> f64 {
    let n = sorted_data.len();
    if n == 0 {
        return f64::NAN;
    }
    // Fractional rank into the sorted sample.
    let index = (percentile / 100.0) * ((n - 1) as f64);
    let lower = index.floor() as usize;
    let upper = index.ceil() as usize;
    if lower == upper {
        sorted_data[lower]
    } else {
        // Interpolate between the two neighbouring order statistics.
        let weight = index - (lower as f64);
        sorted_data[lower] * (1.0 - weight) + sorted_data[upper] * weight
    }
}
/// Pearson product-moment correlation between two equal-length samples.
///
/// Returns NaN when either sample has zero variance (0/0 in the formula),
/// mirroring the direct computation.
fn pearson_correlation(&self, x: &[f64], y: &[f64]) -> f64 {
    let n = x.len() as f64;
    let mean_x = x.iter().sum::<f64>() / n;
    let mean_y = y.iter().sum::<f64>() / n;
    // Accumulate covariance and both sums of squares in a single pass.
    let mut covariance = 0.0;
    let mut ss_x = 0.0;
    let mut ss_y = 0.0;
    for (&xi, &yi) in x.iter().zip(y.iter()) {
        let dx = xi - mean_x;
        let dy = yi - mean_y;
        covariance += dx * dy;
        ss_x += dx * dx;
        ss_y += dy * dy;
    }
    covariance / (ss_x * ss_y).sqrt()
}
/// Rough closed-form stand-in for the Student's t CDF.
///
/// NOTE(review): this is a tanh-based sigmoid heuristic, not an
/// incomplete-beta evaluation — p-values derived from it are only
/// approximate, particularly in the tails and for small `df`. Consider
/// replacing with a proper t-distribution CDF if accuracy matters.
fn t_cdf(&self, t: f64, df: f64) -> f64 {
0.5 + 0.5 * (t / (1.0 + t * t / df).sqrt()).tanh()
}
/// Two-sided confidence interval `estimate ± t* · SE` at the configured
/// confidence level.
///
/// NOTE(review): the critical value is a coarse table lookup (normal
/// quantiles ≈1.96 / ≈2.576 for df > 30, a heuristically inflated base
/// value for small df), not an exact t quantile.
fn compute_confidence_interval<T: FloatElement>(
    &self,
    estimate: f64,
    standard_error: f64,
    degrees_of_freedom: f64,
) -> (T, T) {
    let alpha = 1.0 - self.config.confidence_level;
    let t_critical = if degrees_of_freedom > 30.0 {
        // Large-sample regime: use standard normal critical values.
        if alpha < 0.02 {
            2.576
        } else {
            1.96
        }
    } else {
        // Small-sample regime: inflate a base value as df shrinks.
        let base_t = if alpha < 0.02 { 2.8 } else { 2.1 };
        base_t * (1.0 + 5.0 / degrees_of_freedom).sqrt()
    };
    // (The previous version had a leftover `let _ = (...)` no-op here even
    // though all three values are genuinely used — removed.)
    let margin_of_error = t_critical * standard_error;
    (
        T::from_f64(estimate - margin_of_error).expect("f64 conversion should succeed"),
        T::from_f64(estimate + margin_of_error).expect("f64 conversion should succeed"),
    )
}
/// One-sample Kolmogorov–Smirnov comparison of `data` against a normal
/// distribution with the given `mean` and `std_dev`.
///
/// Returns `1 - p` where `p` is a one-term asymptotic approximation
/// `exp(-2 n D²)` of the KS p-value, so larger values indicate a worse fit.
/// Degenerate inputs (empty data, non-positive std dev) return 1.0.
///
/// Fix: the KS statistic is max(D⁺, D⁻). The empirical CDF steps from
/// `i/n` to `(i+1)/n` at each sorted point, and the previous version only
/// measured the gap against `(i+1)/n`, underestimating D whenever the
/// theoretical CDF overshoots the bottom of the step.
fn kolmogorov_smirnov_test(&self, data: &[f64], mean: f64, std_dev: f64) -> f64 {
    if data.is_empty() || std_dev <= 0.0 {
        return 1.0;
    }
    let n = data.len() as f64;
    let mut sorted_data = data.to_vec();
    sorted_data.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
    let mut max_deviation = 0.0f64;
    for (i, &x) in sorted_data.iter().enumerate() {
        let z = (x - mean) / std_dev;
        // Normal CDF via the error function: Φ(z) = (1 + erf(z/√2)) / 2.
        let theoretical_cdf = 0.5 * (1.0 + libm::erf(z / std::f64::consts::SQRT_2));
        let d_plus = ((i + 1) as f64 / n - theoretical_cdf).abs();
        let d_minus = (theoretical_cdf - i as f64 / n).abs();
        max_deviation = max_deviation.max(d_plus.max(d_minus));
    }
    let ks_statistic = max_deviation * n.sqrt();
    // Crude leading-term approximation of the asymptotic KS p-value.
    let p_value = (-2.0 * ks_statistic * ks_statistic).exp();
    1.0 - p_value
}
}
/// Summary statistics produced by [`SciRS2StatsProcessor::describe`].
#[derive(Debug, Clone)]
pub struct DescriptiveStats<T: TensorElement> {
/// Number of observations.
pub count: usize,
/// Arithmetic mean.
pub mean: T,
/// Sample standard deviation (n - 1 denominator).
pub std_dev: T,
/// Sample variance (n - 1 denominator).
pub variance: T,
/// Standardized third moment.
pub skewness: T,
/// Excess kurtosis (standardized fourth moment minus 3).
pub kurtosis: T,
/// Smallest observation.
pub min: T,
/// Largest observation.
pub max: T,
/// 25th percentile (first quartile).
pub q25: T,
/// 50th percentile.
pub median: T,
/// 75th percentile (third quartile).
pub q75: T,
/// Interquartile range, `q75 - q25`.
pub iqr: T,
}
/// Result of [`SciRS2StatsProcessor::correlation_matrix`].
#[derive(Debug, Clone)]
pub struct CorrelationResult<T: TensorElement> {
/// Square column-by-column correlation matrix.
pub correlation_matrix: Tensor<T>,
/// Which correlation coefficient was computed.
pub method: CorrelationMethod,
/// `(i, j, coefficient)` pairs deemed significant; currently always empty.
pub significant_pairs: Vec<(usize, usize, T)>, }
/// Correlation coefficient variants.
///
/// NOTE(review): only `Pearson` is produced by this file; the rank-based
/// variants have no visible implementation here.
#[derive(Debug, Clone, Copy)]
pub enum CorrelationMethod {
/// Pearson product-moment correlation.
Pearson,
/// Spearman rank correlation.
Spearman,
/// Kendall tau rank correlation.
Kendall,
}
/// Result of a one- or two-sample t-test.
#[derive(Debug, Clone)]
pub struct TTestResult<T: TensorElement> {
/// The computed t statistic.
pub t_statistic: T,
/// Two-sided p-value (derived from an approximate t CDF).
pub p_value: T,
/// Degrees of freedom of the test.
pub degrees_of_freedom: T,
/// Approximate `(lower, upper)` confidence interval at the configured level.
pub confidence_interval: (T, T),
/// Cohen's d effect size.
pub effect_size: T,
}
/// Result of [`SciRS2StatsProcessor::linear_regression`].
#[derive(Debug, Clone)]
pub struct RegressionResult<T: TensorElement> {
/// Fitted coefficients, ordered `[intercept, slope]`.
pub coefficients: Vec<T>,
/// Standard errors matching `coefficients` element-for-element.
pub standard_errors: Vec<T>,
/// Coefficient of determination.
pub r_squared: T,
/// R² adjusted for the number of predictors.
pub adjusted_r_squared: T,
/// F statistic of the overall regression.
pub f_statistic: T,
/// Per-observation residuals `y - ŷ` as a 1-D tensor.
pub residuals: Tensor<T>,
}
/// Result of fitting a parametric distribution to data.
#[derive(Debug, Clone)]
pub struct DistributionFit<T: TensorElement> {
/// Which distribution family was fitted.
pub distribution_type: DistributionType,
/// Fitted parameters keyed by name (e.g. "mean", "std_dev" for a normal).
pub parameters: HashMap<String, T>,
/// Log-likelihood of the data under the fitted parameters.
pub log_likelihood: T,
/// Akaike information criterion (`2k - 2 ln L`).
pub aic: T,
/// Bayesian information criterion (`k ln n - 2 ln L`).
pub bic: T,
/// Kolmogorov–Smirnov based score `1 - p`; larger values mean a worse fit.
pub goodness_of_fit: f64,
}
/// Distribution families recognised for fitting.
///
/// NOTE(review): only `Normal` has a fitter in this file
/// (`fit_normal_distribution`); the rest appear to be placeholders.
#[derive(Debug, Clone, Copy)]
pub enum DistributionType {
/// Gaussian distribution.
Normal,
/// Exponential distribution.
Exponential,
/// Gamma distribution.
Gamma,
/// Beta distribution.
Beta,
/// Poisson distribution.
Poisson,
/// Binomial distribution.
Binomial,
/// Continuous uniform distribution.
Uniform,
}
pub use crate::stats::{HistogramConfig, StatMode};