use crate::{FloatElement, Tensor, TensorElement};
use torsh_core::error::{Result, TorshError};
/// Selects the normalization used by variance-style statistics.
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum StatMode {
/// Divide by N (biased / population estimator).
Population,
/// Divide by N - 1 (unbiased / sample estimator, Bessel's correction).
Sample,
}
/// Configuration for `Tensor::histogram`.
#[derive(Debug, Clone)]
pub struct HistogramConfig {
/// Number of equal-width bins.
pub bins: usize,
/// Lower edge of the histogram range; derived from the data minimum when `None`.
pub min_val: Option<f64>,
/// Upper edge of the histogram range; derived from the data maximum when `None`.
pub max_val: Option<f64>,
/// When true, out-of-range values are counted in the first/last bin;
/// when false they are dropped from the counts.
pub include_outliers: bool,
}
impl Default for HistogramConfig {
fn default() -> Self {
Self {
bins: 50,
min_val: None,
max_val: None,
include_outliers: true,
}
}
}
/// Result of `Tensor::histogram`: bin occupancy plus bin edges.
#[derive(Debug, Clone)]
pub struct Histogram {
/// Number of values that fell into each bin (`bins` entries).
pub counts: Vec<usize>,
/// Bin boundaries; always one more entry than `counts`.
pub edges: Vec<f64>,
/// Total number of input elements (set to the full data length, even when
/// outliers are excluded from `counts`).
pub total_count: usize,
}
/// Correlation estimator used by `Tensor::correlation`.
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum CorrelationMethod {
/// Linear (product-moment) correlation.
Pearson,
/// Rank correlation: Pearson applied to average ranks.
Spearman,
/// Pairwise concordance-based rank correlation.
Kendall,
}
/// Descriptive statistics produced by `Tensor::describe`.
#[derive(Debug, Clone)]
pub struct StatSummary {
/// Number of elements summarized.
pub count: usize,
/// Arithmetic mean.
pub mean: f64,
/// Sample standard deviation (N - 1 normalization; see `describe`).
pub std: f64,
/// Smallest element.
pub min: f64,
/// Largest element.
pub max: f64,
// Quartiles: 25th, 50th (median) and 75th percentiles.
pub q25: f64, pub q50: f64, pub q75: f64, }
impl<
T: TensorElement
+ FloatElement
+ Copy
+ Default
+ std::ops::Add<Output = T>
+ std::ops::AddAssign
+ std::ops::Sub<Output = T>
+ std::ops::Mul<Output = T>
+ std::ops::MulAssign
+ std::ops::Div<Output = T>
+ PartialOrd
+ num_traits::FromPrimitive
+ std::iter::Sum,
> Tensor<T>
{
/// Computes the mean by reducing to a sum and dividing by the number of
/// reduced elements.
///
/// With `dims = Some(..)` the reduction runs over those dimensions
/// (`keepdim` preserves them with size 1); with `None` it reduces everything.
pub fn mean_stats(&self, dims: Option<&[usize]>, keepdim: bool) -> Result<Self> {
    let (total, element_count) = match dims {
        Some(ds) => {
            let axes: Vec<i32> = ds.iter().map(|&d| d as i32).collect();
            // Elements removed by the reduction = product of the reduced sizes.
            let n = ds.iter().map(|&d| self.shape().dims()[d]).product::<usize>();
            (self.sum_dim(&axes, keepdim)?, n as f64)
        }
        None => (self.sum()?, self.numel() as f64),
    };
    // Fall back to 1 if the count is not representable in T (avoids a panic).
    let divisor = <T as num_traits::FromPrimitive>::from_f64(element_count)
        .unwrap_or_else(|| <T as num_traits::One>::one());
    total.div_scalar(divisor)
}
/// Computes the variance of the tensor elements.
///
/// `mode` selects the normalization: N for `Population`, N - 1 for `Sample`.
///
/// NOTE(review): the centering step extracts a single scalar via
/// `mean.item()` even when `dims` is `Some(..)`. For a partial reduction the
/// mean tensor is not scalar, so `item()` presumably fails (or the data are
/// centered by the wrong mean) — confirm `item()`'s contract and consider
/// broadcasting the per-dim means instead.
///
/// NOTE(review): if `count` is 0 (a zero-sized dimension), `count - 1` in
/// `Sample` mode underflows `usize` before the `divisor == 0` guard runs.
///
/// # Errors
/// Returns `InvalidArgument` when the degrees of freedom are zero
/// (`Sample` mode over a single element).
pub fn var(&self, dims: Option<&[usize]>, keepdim: bool, mode: StatMode) -> Result<Self> {
// Center by the (scalar) mean, then square the deviations.
let mean = self.mean(dims, false)?; let mean_value = mean.item()?; let diff = self.sub_scalar(mean_value)?;
let squared_diff = diff.mul_op(&diff)?;
// Sum of squared deviations over the requested dims (or everything).
let sum_sq = if let Some(dims) = dims {
squared_diff.sum_dim(&dims.iter().map(|&d| d as i32).collect::<Vec<_>>(), keepdim)?
} else {
squared_diff.sum()?
};
// Number of elements that were reduced away.
let count = if let Some(dims) = dims {
dims.iter()
.map(|&d| self.shape().dims()[d])
.product::<usize>()
} else {
self.numel()
};
let divisor = match mode {
StatMode::Population => count,
StatMode::Sample => count - 1,
};
if divisor == 0 {
return Err(TorshError::InvalidArgument(
"Cannot compute variance with zero degrees of freedom".to_string(),
));
}
sum_sq.div_scalar(
<T as num_traits::FromPrimitive>::from_usize(divisor)
.unwrap_or_else(|| <T as num_traits::One>::one()),
)
}
/// Standard deviation: the square root of `var` with the same `dims`,
/// `keepdim` and normalization `mode`.
pub fn std(&self, dims: Option<&[usize]>, keepdim: bool, mode: StatMode) -> Result<Self> {
    self.var(dims, keepdim, mode)?.sqrt()
}
/// Computes the `q`-th percentile (0..=100) along `dim` using linear
/// interpolation between the two nearest order statistics.
///
/// `dim` defaults to the last dimension. `_keepdim` is currently ignored.
///
/// # Errors
/// `InvalidArgument` if `q` is outside `[0, 100]` or the reduced dimension
/// is empty; a dimension error for a 0-d tensor or an out-of-bounds `dim`.
pub fn percentile(&self, q: f64, dim: Option<usize>, _keepdim: bool) -> Result<Self> {
    if !(0.0..=100.0).contains(&q) {
        return Err(TorshError::InvalidArgument(format!(
            "Percentile must be between 0 and 100, got {q}"
        )));
    }
    let ndim = self.shape().ndim();
    // Guard before `ndim - 1`: a 0-d tensor would underflow `usize`.
    if ndim == 0 {
        return Err(TorshError::dimension_error(
            "Cannot compute percentile of a 0-dimensional tensor",
            "tensor statistics operation",
        ));
    }
    let dim = dim.unwrap_or(ndim - 1);
    if dim >= ndim {
        return Err(TorshError::dimension_error(
            &format!(
                "Dimension {} out of bounds for tensor with {} dimensions",
                dim, ndim
            ),
            "tensor statistics operation",
        ));
    }
    let size = self.shape().dims()[dim];
    // Guard before `size - 1`: an empty dimension would underflow `usize`.
    if size == 0 {
        return Err(TorshError::InvalidArgument(
            "Cannot compute percentile over an empty dimension".to_string(),
        ));
    }
    let (sorted, _indices) = self.sort(Some(dim as i32), false)?;
    let pos = q / 100.0 * (size - 1) as f64;
    let lower_idx = pos.floor() as usize;
    let upper_idx = (pos.ceil() as usize).min(size - 1);
    let weight = pos - pos.floor();
    if lower_idx == upper_idx {
        // Exact hit on an order statistic: no interpolation needed.
        sorted.select(dim as i32, lower_idx as i64)
    } else {
        // result = lower + weight * (upper - lower)
        let lower = sorted.select(dim as i32, lower_idx as i64)?;
        let upper = sorted.select(dim as i32, upper_idx as i64)?;
        let diff = upper.sub(&lower)?;
        let weight_scalar = <T as TensorElement>::from_f64(weight)
            .unwrap_or_else(|| <T as TensorElement>::from_f64(0.0).unwrap_or_default());
        let weighted_diff = diff.mul_scalar(weight_scalar)?;
        lower.add_op(&weighted_diff)
    }
}
/// Computes the median (50th percentile) along `dim`, interpolating
/// linearly when the dimension has an even number of elements.
pub fn median(&self, dim: Option<usize>, keepdim: bool) -> Result<Self> {
self.percentile(50.0, dim, keepdim)
}
/// Computes several quantiles at once; each entry of `q` is a fraction in
/// `[0, 1]` and is forwarded to `percentile` as a percentage.
///
/// Stops at the first failing quantile and propagates its error.
pub fn quantile(&self, q: &[f64], dim: Option<usize>, keepdim: bool) -> Result<Vec<Self>> {
    q.iter()
        .map(|&fraction| self.percentile(fraction * 100.0, dim, keepdim))
        .collect()
}
/// Builds a fixed-width histogram of all tensor elements (flattened).
///
/// Bin edges are `bins + 1` evenly spaced values over `[min_val, max_val]`
/// (taken from `config` or derived from the data). Values exactly on the
/// range boundaries always count toward the first/last bin; values strictly
/// outside the range are clamped into the edge bins when
/// `config.include_outliers` is set and dropped otherwise.
///
/// Note: `total_count` is the number of tensor elements, which can exceed
/// the sum of `counts` when outliers are dropped.
///
/// # Errors
/// `InvalidArgument` when `bins == 0` or when the resolved range is empty
/// (`min_val >= max_val`).
pub fn histogram(&self, config: &HistogramConfig) -> Result<Histogram> {
    // Guard: zero bins would divide by zero below and underflow `bins - 1`.
    if config.bins == 0 {
        return Err(TorshError::InvalidArgument(
            "Histogram requires at least one bin".to_string(),
        ));
    }
    let data = self.to_vec()?;
    if data.is_empty() {
        // Degenerate histogram with unit-spaced placeholder edges.
        return Ok(Histogram {
            counts: vec![0; config.bins],
            edges: (0..=config.bins).map(|i| i as f64).collect(),
            total_count: 0,
        });
    }
    let min_val = config.min_val.unwrap_or_else(|| {
        data.iter()
            .map(|&x| TensorElement::to_f64(&x).expect("f64 conversion should succeed"))
            .fold(f64::INFINITY, f64::min)
    });
    let max_val = config.max_val.unwrap_or_else(|| {
        data.iter()
            .map(|&x| TensorElement::to_f64(&x).expect("f64 conversion should succeed"))
            .fold(f64::NEG_INFINITY, f64::max)
    });
    if min_val >= max_val {
        return Err(TorshError::InvalidArgument(
            "Minimum value must be less than maximum value".to_string(),
        ));
    }
    let bin_width = (max_val - min_val) / config.bins as f64;
    let edges: Vec<f64> = (0..=config.bins)
        .map(|i| min_val + i as f64 * bin_width)
        .collect();
    let mut counts = vec![0; config.bins];
    for &value in data.iter() {
        let val = TensorElement::to_f64(&value).expect("f64 conversion should succeed");
        // Boundary values (val == min_val / max_val) belong to the histogram
        // range, so only strictly-outside values are treated as outliers.
        let bin_idx = if val < min_val {
            if config.include_outliers {
                0
            } else {
                continue;
            }
        } else if val > max_val {
            if config.include_outliers {
                config.bins - 1
            } else {
                continue;
            }
        } else {
            // `val == max_val` maps to `bins` here; the clamp below fixes it.
            ((val - min_val) / bin_width).floor() as usize
        };
        counts[bin_idx.min(config.bins - 1)] += 1;
    }
    Ok(Histogram {
        counts,
        edges,
        total_count: data.len(),
    })
}
pub fn correlation(&self, other: &Self, method: CorrelationMethod) -> Result<T> {
if self.shape() != other.shape() {
return Err(TorshError::ShapeMismatch {
expected: self.shape().dims().to_vec(),
got: other.shape().dims().to_vec(),
});
}
match method {
CorrelationMethod::Pearson => self.pearson_correlation(other),
CorrelationMethod::Spearman => self.spearman_correlation(other),
CorrelationMethod::Kendall => self.kendall_correlation(other),
}
}
/// Pearson product-moment correlation of the flattened elements.
///
/// # Errors
/// `InvalidArgument` with fewer than 2 elements or when either variable has
/// (numerically) zero variance.
fn pearson_correlation(&self, other: &Self) -> Result<T> {
    let n = self.numel() as f64;
    if n < 2.0 {
        return Err(TorshError::InvalidArgument(
            "Need at least 2 values for correlation".to_string(),
        ));
    }
    // Means come from the tensor reduction so they match `mean`'s semantics.
    let mean_x = TensorElement::to_f64(&self.mean(None, false)?.to_vec()?[0])
        .expect("f64 conversion should succeed");
    let mean_y = TensorElement::to_f64(&other.mean(None, false)?.to_vec()?[0])
        .expect("f64 conversion should succeed");
    let xs = self.to_vec()?;
    let ys = other.to_vec()?;
    // Accumulate centered cross- and self-products in one pass.
    let (mut sum_xy, mut sum_xx, mut sum_yy) = (0.0_f64, 0.0_f64, 0.0_f64);
    for (x, y) in xs.iter().zip(ys.iter()) {
        let dx = TensorElement::to_f64(x).expect("f64 conversion should succeed") - mean_x;
        let dy = TensorElement::to_f64(y).expect("f64 conversion should succeed") - mean_y;
        sum_xy += dx * dy;
        sum_xx += dx * dx;
        sum_yy += dy * dy;
    }
    let denominator = (sum_xx * sum_yy).sqrt();
    if denominator.abs() < f64::EPSILON {
        return Err(TorshError::InvalidArgument(
            "Cannot compute correlation: one variable has zero variance".to_string(),
        ));
    }
    Ok(<T as TensorElement>::from_f64(sum_xy / denominator)
        .expect("f64 conversion should succeed"))
}
/// Spearman rank correlation: Pearson correlation applied to the
/// average-rank transform of both tensors (ties get averaged ranks
/// via `rank`).
fn spearman_correlation(&self, other: &Self) -> Result<T> {
let self_ranks = self.rank()?;
let other_ranks = other.rank()?;
self_ranks.pearson_correlation(&other_ranks)
}
/// Kendall-style rank correlation over all element pairs (O(n^2)).
///
/// Counts concordant and discordant pairs; pairs tied in x, in y, or in
/// both (within `f64::EPSILON`) are excluded from the denominator.
///
/// NOTE(review): because tied pairs are removed from the denominator, the
/// result is (C - D) / (C + D), i.e. the Goodman-Kruskal gamma statistic
/// rather than Kendall's tau-b — confirm which definition callers expect.
fn kendall_correlation(&self, other: &Self) -> Result<T> {
let n = self.numel();
if n < 2 {
return Err(TorshError::InvalidArgument(
"Need at least 2 values for Kendall correlation".to_string(),
));
}
let self_data = self.to_vec()?;
let other_data = other.to_vec()?;
let mut concordant = 0;
let mut discordant = 0;
let mut tied_x = 0;
let mut tied_y = 0;
let mut tied_xy = 0;
// Examine every unordered pair (i, j), i < j.
for i in 0..n {
for j in i + 1..n {
let x1 =
TensorElement::to_f64(&self_data[i]).expect("f64 conversion should succeed");
let x2 =
TensorElement::to_f64(&self_data[j]).expect("f64 conversion should succeed");
let y1 =
TensorElement::to_f64(&other_data[i]).expect("f64 conversion should succeed");
let y2 =
TensorElement::to_f64(&other_data[j]).expect("f64 conversion should succeed");
let dx = x2 - x1;
let dy = y2 - y1;
// Classify: ties first, then concordance by the sign of dx * dy.
if dx.abs() < f64::EPSILON && dy.abs() < f64::EPSILON {
tied_xy += 1;
} else if dx.abs() < f64::EPSILON {
tied_x += 1;
} else if dy.abs() < f64::EPSILON {
tied_y += 1;
} else if dx * dy > 0.0 {
concordant += 1;
} else {
discordant += 1;
}
}
}
let total_pairs = n * (n - 1) / 2;
let effective_pairs = total_pairs - tied_x - tied_y - tied_xy;
// Every informative pair was tied: define the correlation as 0.
if effective_pairs == 0 {
return Ok(<T as TensorElement>::from_f64(0.0).expect("f64 conversion should succeed"));
}
let tau = (concordant as f64 - discordant as f64) / effective_pairs as f64;
Ok(<T as TensorElement>::from_f64(tau).expect("f64 conversion should succeed"))
}
/// Returns 1-based average ranks of the flattened elements, as a tensor of
/// the same shape. Tied values all receive the mean of the rank positions
/// they span (used by `spearman_correlation`).
///
/// NOTE(review): the sort comparator maps incomparable values (NaN) to
/// `Equal`, so NaN placement — and therefore NaN ranks — is arbitrary.
fn rank(&self) -> Result<Self> {
let data = self.to_vec()?;
let n = data.len();
// Pair each value with its original index so ranks can be scattered back.
let mut indexed_values: Vec<(usize, T)> =
data.iter().enumerate().map(|(i, &val)| (i, val)).collect();
indexed_values.sort_by(|a, b| {
TensorElement::to_f64(&a.1)
.expect("f64 conversion should succeed")
.partial_cmp(&TensorElement::to_f64(&b.1).expect("f64 conversion should succeed"))
.unwrap_or(std::cmp::Ordering::Equal)
});
let mut ranks = vec![T::default(); n];
let mut i = 0;
while i < n {
// Advance j to the end of the run of values equal to indexed_values[i].
let mut j = i;
while j < n
&& TensorElement::to_f64(&indexed_values[j].1)
.expect("f64 conversion should succeed")
== TensorElement::to_f64(&indexed_values[i].1)
.expect("f64 conversion should succeed")
{
j += 1;
}
// Average of the 1-based ranks i+1 ..= j is (i + j + 1) / 2.
let avg_rank = (i + j + 1) as f64 / 2.0;
for k in i..j {
ranks[indexed_values[k].0] = <T as TensorElement>::from_f64(avg_rank)
.expect("f64 conversion should succeed");
}
i = j;
}
Self::from_data(ranks, self.shape().dims().to_vec(), self.device)
}
/// Summarizes the flattened tensor: count, mean, sample standard deviation,
/// min/max and the three quartiles.
///
/// # Errors
/// `InvalidArgument` for an empty tensor.
pub fn describe(&self) -> Result<StatSummary> {
    let raw = self.to_vec()?;
    if raw.is_empty() {
        return Err(TorshError::InvalidArgument(
            "Cannot compute statistics on empty tensor".to_string(),
        ));
    }
    let count = raw.len();
    let values: Vec<f64> = raw
        .iter()
        .map(|&x| TensorElement::to_f64(&x).expect("f64 conversion should succeed"))
        .collect();
    let mean = values.iter().sum::<f64>() / count as f64;
    // Sample variance (N - 1); `.max(1)` avoids dividing by zero when count == 1.
    let variance =
        values.iter().map(|&x| (x - mean).powi(2)).sum::<f64>() / (count - 1).max(1) as f64;
    // Quartiles need sorted data; NaNs compare as Equal and sort arbitrarily.
    let mut sorted_values = values.clone();
    sorted_values.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
    Ok(StatSummary {
        count,
        mean,
        std: variance.sqrt(),
        min: values.iter().fold(f64::INFINITY, |a, &b| a.min(b)),
        max: values.iter().fold(f64::NEG_INFINITY, |a, &b| a.max(b)),
        q25: percentile_sorted(&sorted_values, 25.0),
        q50: percentile_sorted(&sorted_values, 50.0),
        q75: percentile_sorted(&sorted_values, 75.0),
    })
}
pub fn cov(&self, mode: StatMode) -> Result<Self> {
let shape = self.shape();
if shape.ndim() != 2 {
return Err(TorshError::dimension_error(
"Covariance matrix requires 2D tensor",
"covariance computation",
));
}
let (n_samples, _n_features) = (shape.dims()[0], shape.dims()[1]);
if n_samples < 2 {
return Err(TorshError::InvalidArgument(
"Need at least 2 samples for covariance".to_string(),
));
}
let global_mean = self.mean(None, false)?;
let mean_value = global_mean.item()?;
let centered = self.sub_scalar(mean_value)?;
let centered_t = centered.transpose(1, 0)?;
let cov_unnormalized = centered_t.matmul(¢ered)?;
let divisor = match mode {
StatMode::Population => n_samples,
StatMode::Sample => n_samples - 1,
};
cov_unnormalized.div_scalar(
<T as num_traits::FromPrimitive>::from_usize(divisor)
.unwrap_or_else(|| <T as num_traits::One>::one()),
)
}
/// Correlation matrix: the sample covariance matrix rescaled by the
/// per-feature standard deviations. Entries where either standard
/// deviation is (near-)zero are set to 0.0.
pub fn corrcoef(&self) -> Result<Self> {
    let cov_matrix = self.cov(StatMode::Sample)?;
    let cov_data = cov_matrix.to_vec()?;
    let n = cov_matrix.shape().dims()[0];
    // Standard deviations come from the covariance-matrix diagonal.
    let std_devs: Vec<f64> = (0..n)
        .map(|i| {
            TensorElement::to_f64(&cov_data[i * n + i])
                .expect("f64 conversion should succeed")
                .sqrt()
        })
        .collect();
    let mut corr_data = Vec::with_capacity(cov_data.len());
    for i in 0..n {
        for j in 0..n {
            let cov_val = TensorElement::to_f64(&cov_data[i * n + j])
                .expect("f64 conversion should succeed");
            let corr_val = if std_devs[i] > f64::EPSILON && std_devs[j] > f64::EPSILON {
                cov_val / (std_devs[i] * std_devs[j])
            } else {
                0.0
            };
            corr_data.push(
                <T as TensorElement>::from_f64(corr_val)
                    .expect("f64 conversion should succeed"),
            );
        }
    }
    Self::from_data(corr_data, vec![n, n], self.device)
}
}
/// Linear-interpolation percentile of an already-sorted slice.
///
/// `q` is in percent (0-100). Returns 0.0 for an empty slice.
fn percentile_sorted(sorted_values: &[f64], q: f64) -> f64 {
    let len = sorted_values.len();
    if len == 0 {
        return 0.0;
    }
    let last = len - 1;
    let pos = q / 100.0 * last as f64;
    let lo = pos.floor() as usize;
    let hi = (pos.ceil() as usize).min(last);
    if lo == hi {
        sorted_values[lo]
    } else {
        // Interpolate between the two surrounding order statistics.
        let w = pos - pos.floor();
        sorted_values[lo] * (1.0 - w) + sorted_values[hi] * w
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use torsh_core::device::DeviceType;

    #[test]
    fn test_basic_statistics() {
        let tensor = Tensor::from_data(vec![1.0, 2.0, 3.0, 4.0, 5.0], vec![5], DeviceType::Cpu)
            .expect("tensor creation should succeed");

        // mean(1..=5) == 3
        let mean = tensor.mean(None, false).expect("mean should succeed");
        assert!(
            (mean.to_vec().expect("to_vec conversion should succeed")[0] - 3.0_f32).abs()
                < 1e-6_f32
        );

        // Sample variance of 1..=5 is 2.5.
        let var_sample = tensor
            .var(None, false, StatMode::Sample)
            .expect("variance should succeed");
        assert!(
            (var_sample.to_vec().expect("to_vec conversion should succeed")[0] - 2.5_f32).abs()
                < 1e-6_f32
        );

        // Sample std is sqrt(2.5).
        let std_sample = tensor
            .std(None, false, StatMode::Sample)
            .expect("std should succeed");
        assert!(
            (std_sample.to_vec().expect("to_vec conversion should succeed")[0]
                - 2.5_f32.sqrt())
            .abs()
                < 1e-6
        );
    }

    #[test]
    fn test_percentiles() {
        let tensor = Tensor::from_data(
            vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0],
            vec![10],
            DeviceType::Cpu,
        )
        .expect("tensor creation should succeed");

        // Even element count: median interpolates between 5 and 6.
        let median = tensor.median(None, false).expect("median should succeed");
        assert!(
            (median.to_vec().expect("to_vec conversion should succeed")[0] - 5.5_f32).abs()
                < 1e-6_f32
        );

        let q25 = tensor
            .percentile(25.0, None, false)
            .expect("percentile should succeed");
        assert!(
            (q25.to_vec().expect("to_vec conversion should succeed")[0] - 3.25_f32).abs()
                < 1e-6_f32
        );

        let q75 = tensor
            .percentile(75.0, None, false)
            .expect("percentile should succeed");
        assert!(
            (q75.to_vec().expect("to_vec conversion should succeed")[0] - 7.75_f32).abs()
                < 1e-6_f32
        );
    }

    #[test]
    fn test_histogram() {
        let tensor = Tensor::from_data(
            vec![1.0, 2.0, 2.0, 3.0, 3.0, 3.0, 4.0, 4.0, 5.0],
            vec![9],
            DeviceType::Cpu,
        )
        .expect("tensor creation should succeed");
        let config = HistogramConfig {
            bins: 5,
            min_val: Some(1.0),
            max_val: Some(5.0),
            include_outliers: true,
        };

        let hist = tensor.histogram(&config).expect("histogram should succeed");
        assert_eq!(hist.total_count, 9);
        assert_eq!(hist.counts.len(), 5);
        // Edges always have one more entry than bins.
        assert_eq!(hist.edges.len(), 6);
    }

    #[test]
    fn test_correlation() {
        // y = 2x is perfectly linearly correlated with x.
        let x = Tensor::from_data(vec![1.0, 2.0, 3.0, 4.0, 5.0], vec![5], DeviceType::Cpu)
            .expect("tensor creation should succeed");
        let y = Tensor::from_data(vec![2.0, 4.0, 6.0, 8.0, 10.0], vec![5], DeviceType::Cpu)
            .expect("tensor creation should succeed");

        let corr = x
            .correlation(&y, CorrelationMethod::Pearson)
            .expect("correlation should succeed");
        assert!((corr - 1.0_f32).abs() < 1e-6_f32);
    }

    #[test]
    fn test_statistical_summary() {
        let tensor = Tensor::from_data(
            vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0],
            vec![10],
            DeviceType::Cpu,
        )
        .expect("tensor creation should succeed");

        let summary = tensor.describe().expect("describe should succeed");
        assert_eq!(summary.count, 10);
        assert!((summary.mean - 5.5).abs() < 1e-6);
        assert!((summary.q50 - 5.5).abs() < 1e-6);
        assert_eq!(summary.min, 1.0);
        assert_eq!(summary.max, 10.0);
    }

    #[test]
    fn test_covariance_matrix() {
        // Four samples, two feature columns.
        let tensor = Tensor::from_data(
            vec![1.0, 2.0, 2.0, 4.0, 3.0, 6.0, 4.0, 8.0],
            vec![4, 2],
            DeviceType::Cpu,
        )
        .expect("tensor creation should succeed");

        let cov_matrix = tensor
            .cov(StatMode::Sample)
            .expect("covariance should succeed");
        assert_eq!(cov_matrix.shape().dims(), &[2, 2]);
        // A covariance matrix is symmetric.
        let cov_data = cov_matrix
            .to_vec()
            .expect("to_vec conversion should succeed");
        assert!((cov_data[1] as f64 - cov_data[2] as f64).abs() < 1e-6);
    }

    #[test]
    fn test_ranks() {
        let tensor = Tensor::from_data(vec![3.0, 1.0, 4.0, 2.0, 2.0], vec![5], DeviceType::Cpu)
            .expect("tensor creation should succeed");

        let ranks = tensor.rank().expect("rank should be available");
        let rank_data = ranks.to_vec().expect("to_vec conversion should succeed");
        // Average ranks are 1-based and bounded by the element count.
        for &rank in rank_data.iter() {
            assert!((1.0_f32..=5.0_f32).contains(&rank));
        }
    }
}