1use crate::{FloatElement, Tensor, TensorElement};
8use torsh_core::error::{Result, TorshError};
9
/// Selects the normalization used for variance-like statistics.
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum StatMode {
    /// Divide by `n` (population statistics).
    Population,
    /// Divide by `n - 1` (Bessel-corrected sample statistics).
    Sample,
}
18
/// Configuration for `Tensor::histogram`.
#[derive(Debug, Clone)]
pub struct HistogramConfig {
    /// Number of equal-width bins.
    pub bins: usize,
    /// Lower edge of the histogram range; derived from the data when `None`.
    pub min_val: Option<f64>,
    /// Upper edge of the histogram range; derived from the data when `None`.
    pub max_val: Option<f64>,
    /// When `true`, values outside the range are clamped into the first/last
    /// bin; when `false`, they are skipped (but still counted in `total_count`).
    pub include_outliers: bool,
}
31
impl Default for HistogramConfig {
    /// 50 bins, data-derived range, outliers clamped into the edge bins.
    fn default() -> Self {
        Self {
            bins: 50,
            min_val: None,
            max_val: None,
            include_outliers: true,
        }
    }
}
42
/// Result of `Tensor::histogram`: per-bin counts plus bin boundaries.
#[derive(Debug, Clone)]
pub struct Histogram {
    /// Number of values that fell into each bin (`bins` entries).
    pub counts: Vec<usize>,
    /// Bin boundaries: `counts.len() + 1` monotonically increasing values.
    pub edges: Vec<f64>,
    /// Total number of input elements (includes values skipped as outliers).
    pub total_count: usize,
}
53
/// Correlation estimator used by `Tensor::correlation`.
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum CorrelationMethod {
    /// Linear (product-moment) correlation.
    Pearson,
    /// Rank-based correlation (Pearson applied to average ranks).
    Spearman,
    /// Pairwise concordance-based rank correlation.
    Kendall,
}
64
/// Descriptive statistics produced by `Tensor::describe`.
#[derive(Debug, Clone)]
pub struct StatSummary {
    /// Number of elements summarized.
    pub count: usize,
    /// Arithmetic mean.
    pub mean: f64,
    /// Sample standard deviation (n - 1 denominator).
    pub std: f64,
    /// Smallest element.
    pub min: f64,
    /// Largest element.
    pub max: f64,
    /// 25th percentile.
    pub q25: f64,
    /// 50th percentile (median).
    pub q50: f64,
    /// 75th percentile.
    pub q75: f64,
}
77
78impl<
80 T: TensorElement
81 + FloatElement
82 + Copy
83 + Default
84 + std::ops::Add<Output = T>
85 + std::ops::AddAssign
86 + std::ops::Sub<Output = T>
87 + std::ops::Mul<Output = T>
88 + std::ops::MulAssign
89 + std::ops::Div<Output = T>
90 + PartialOrd
91 + num_traits::FromPrimitive
92 + std::iter::Sum,
93 > Tensor<T>
94{
95 pub fn mean_stats(&self, dims: Option<&[usize]>, keepdim: bool) -> Result<Self> {
97 let sum = if let Some(dims) = dims {
98 self.sum_dim(&dims.iter().map(|&d| d as i32).collect::<Vec<_>>(), keepdim)?
99 } else {
100 self.sum()?
101 };
102 let count = if let Some(dims) = dims {
103 dims.iter()
104 .map(|&d| self.shape().dims()[d])
105 .product::<usize>() as f64
106 } else {
107 self.numel() as f64
108 };
109
110 sum.div_scalar(
111 <T as num_traits::FromPrimitive>::from_f64(count)
112 .unwrap_or_else(|| <T as num_traits::One>::one()),
113 )
114 }
115
116 pub fn var(&self, dims: Option<&[usize]>, keepdim: bool, mode: StatMode) -> Result<Self> {
118 let mean = self.mean(dims, false)?; let mean_value = mean.item()?; let diff = self.sub_scalar(mean_value)?;
121 let squared_diff = diff.mul_op(&diff)?;
122 let sum_sq = if let Some(dims) = dims {
123 squared_diff.sum_dim(&dims.iter().map(|&d| d as i32).collect::<Vec<_>>(), keepdim)?
124 } else {
125 squared_diff.sum()?
126 };
127
128 let count = if let Some(dims) = dims {
129 dims.iter()
130 .map(|&d| self.shape().dims()[d])
131 .product::<usize>()
132 } else {
133 self.numel()
134 };
135
136 let divisor = match mode {
137 StatMode::Population => count,
138 StatMode::Sample => count - 1,
139 };
140
141 if divisor == 0 {
142 return Err(TorshError::InvalidArgument(
143 "Cannot compute variance with zero degrees of freedom".to_string(),
144 ));
145 }
146
147 sum_sq.div_scalar(
148 <T as num_traits::FromPrimitive>::from_usize(divisor)
149 .unwrap_or_else(|| <T as num_traits::One>::one()),
150 )
151 }
152
    /// Standard deviation: square root of [`Self::var`] with the same
    /// `dims` / `keepdim` / `mode` arguments.
    pub fn std(&self, dims: Option<&[usize]>, keepdim: bool, mode: StatMode) -> Result<Self> {
        let variance = self.var(dims, keepdim, mode)?;
        variance.sqrt()
    }
158
159 pub fn percentile(&self, q: f64, dim: Option<usize>, _keepdim: bool) -> Result<Self> {
161 if !(0.0..=100.0).contains(&q) {
162 return Err(TorshError::InvalidArgument(format!(
163 "Percentile must be between 0 and 100, got {q}"
164 )));
165 }
166
167 let dim = dim.unwrap_or(self.shape().ndim() - 1);
168 if dim >= self.shape().ndim() {
169 return Err(TorshError::dimension_error(
170 &format!(
171 "Dimension {} out of bounds for tensor with {} dimensions",
172 dim,
173 self.shape().ndim()
174 ),
175 "tensor statistics operation",
176 ));
177 }
178
179 let (sorted, _indices) = self.sort(Some(dim as i32), false)?; let size = self.shape().dims()[dim];
183
184 let pos = q / 100.0 * (size - 1) as f64;
186 let lower_idx = pos.floor() as usize;
187 let upper_idx = (pos.ceil() as usize).min(size - 1);
188 let weight = pos - pos.floor();
189
190 if lower_idx == upper_idx {
191 sorted.select(dim as i32, lower_idx as i64)
193 } else {
194 let lower = sorted.select(dim as i32, lower_idx as i64)?;
196 let upper = sorted.select(dim as i32, upper_idx as i64)?;
197 let diff = upper.sub(&lower)?;
198 let weight_scalar = <T as TensorElement>::from_f64(weight)
199 .unwrap_or_else(|| <T as TensorElement>::from_f64(0.0).unwrap_or_default());
200 let weighted_diff = diff.mul_scalar(weight_scalar)?;
201 lower.add_op(&weighted_diff)
202 }
203 }
204
    /// Median along `dim`: the 50th percentile.
    ///
    /// NOTE(review): `keepdim` is forwarded to `percentile`, which currently
    /// ignores it (`_keepdim`) — confirm whether keepdim semantics are needed.
    pub fn median(&self, dim: Option<usize>, keepdim: bool) -> Result<Self> {
        self.percentile(50.0, dim, keepdim)
    }
209
210 pub fn quantile(&self, q: &[f64], dim: Option<usize>, keepdim: bool) -> Result<Vec<Self>> {
212 let mut results = Vec::new();
213 for &quantile in q {
214 results.push(self.percentile(quantile * 100.0, dim, keepdim)?);
215 }
216 Ok(results)
217 }
218
219 pub fn histogram(&self, config: &HistogramConfig) -> Result<Histogram> {
221 let data = self.to_vec()?;
222
223 if data.is_empty() {
224 return Ok(Histogram {
225 counts: vec![0; config.bins],
226 edges: (0..=config.bins).map(|i| i as f64).collect(),
227 total_count: 0,
228 });
229 }
230
231 let min_val = config.min_val.unwrap_or_else(|| {
233 data.iter()
234 .map(|&x| TensorElement::to_f64(&x).expect("f64 conversion should succeed"))
235 .fold(f64::INFINITY, f64::min)
236 });
237 let max_val = config.max_val.unwrap_or_else(|| {
238 data.iter()
239 .map(|&x| TensorElement::to_f64(&x).expect("f64 conversion should succeed"))
240 .fold(f64::NEG_INFINITY, f64::max)
241 });
242
243 if min_val >= max_val {
244 return Err(TorshError::InvalidArgument(
245 "Minimum value must be less than maximum value".to_string(),
246 ));
247 }
248
249 let bin_width = (max_val - min_val) / config.bins as f64;
251 let edges: Vec<f64> = (0..=config.bins)
252 .map(|i| min_val + i as f64 * bin_width)
253 .collect();
254
255 let mut counts = vec![0; config.bins];
257 for &value in data.iter() {
258 let val = TensorElement::to_f64(&value).expect("f64 conversion should succeed");
259
260 let bin_idx = if val <= min_val {
261 if config.include_outliers {
262 0
263 } else {
264 continue;
265 }
266 } else if val >= max_val {
267 if config.include_outliers {
268 config.bins - 1
269 } else {
270 continue;
271 }
272 } else {
273 ((val - min_val) / bin_width).floor() as usize
274 };
275
276 let bin_idx = bin_idx.min(config.bins - 1);
277 counts[bin_idx] += 1;
278 }
279
280 Ok(Histogram {
281 counts,
282 edges,
283 total_count: data.len(),
284 })
285 }
286
    /// Correlation between two same-shaped tensors, treated as flat sequences.
    ///
    /// # Errors
    /// Returns `ShapeMismatch` when the shapes differ; the individual
    /// estimators may also fail (e.g. fewer than 2 elements, zero variance).
    pub fn correlation(&self, other: &Self, method: CorrelationMethod) -> Result<T> {
        if self.shape() != other.shape() {
            return Err(TorshError::ShapeMismatch {
                expected: self.shape().dims().to_vec(),
                got: other.shape().dims().to_vec(),
            });
        }

        // Dispatch to the requested estimator.
        match method {
            CorrelationMethod::Pearson => self.pearson_correlation(other),
            CorrelationMethod::Spearman => self.spearman_correlation(other),
            CorrelationMethod::Kendall => self.kendall_correlation(other),
        }
    }
302
    /// Pearson product-moment correlation of the flattened element pairs:
    /// `sum(dx*dy) / sqrt(sum(dx^2) * sum(dy^2))`, with deviations taken
    /// from each tensor's global mean.
    ///
    /// # Errors
    /// - fewer than 2 elements
    /// - either variable has (near-)zero variance
    fn pearson_correlation(&self, other: &Self) -> Result<T> {
        let n = self.numel() as f64;
        if n < 2.0 {
            return Err(TorshError::InvalidArgument(
                "Need at least 2 values for correlation".to_string(),
            ));
        }

        // Global means of both tensors (single-element tensors after reduction).
        let mean_x = self.mean(None, false)?;
        let mean_y = other.mean(None, false)?;

        let mean_x_data = mean_x.to_vec()?;
        let mean_y_data = mean_y.to_vec()?;
        let mean_x_val = mean_x_data[0];
        let mean_y_val = mean_y_data[0];

        let self_data = self.to_vec()?;
        let other_data = other.to_vec()?;

        // Accumulate cross- and self-products of the deviations in f64.
        let mut sum_xy = 0.0;
        let mut sum_xx = 0.0;
        let mut sum_yy = 0.0;

        for (&x, &y) in self_data.iter().zip(other_data.iter()) {
            let dx = TensorElement::to_f64(&x).expect("f64 conversion should succeed")
                - TensorElement::to_f64(&mean_x_val).expect("f64 conversion should succeed");
            let dy = TensorElement::to_f64(&y).expect("f64 conversion should succeed")
                - TensorElement::to_f64(&mean_y_val).expect("f64 conversion should succeed");

            sum_xy += dx * dy;
            sum_xx += dx * dx;
            sum_yy += dy * dy;
        }

        // Reject degenerate (constant) inputs rather than dividing by ~0.
        let denominator = (sum_xx * sum_yy).sqrt();
        if denominator.abs() < f64::EPSILON {
            return Err(TorshError::InvalidArgument(
                "Cannot compute correlation: one variable has zero variance".to_string(),
            ));
        }

        let correlation = sum_xy / denominator;
        Ok(<T as TensorElement>::from_f64(correlation).expect("f64 conversion should succeed"))
    }
350
    /// Spearman rank correlation: Pearson correlation applied to the two
    /// tensors' average ranks (ties share their mean rank, see `rank`).
    fn spearman_correlation(&self, other: &Self) -> Result<T> {
        let self_ranks = self.rank()?;
        let other_ranks = other.rank()?;
        self_ranks.pearson_correlation(&other_ranks)
    }
358
    /// Kendall rank correlation computed over all element pairs (O(n^2)).
    ///
    /// Counts concordant and discordant pairs; pairs tied in x, y, or both
    /// are excluded from the denominator.
    ///
    /// NOTE(review): this tie handling (ties simply dropped from the
    /// denominator) matches neither tau-a (all pairs) nor tau-b (geometric
    /// tie correction) exactly — confirm which definition is intended.
    fn kendall_correlation(&self, other: &Self) -> Result<T> {
        let n = self.numel();
        if n < 2 {
            return Err(TorshError::InvalidArgument(
                "Need at least 2 values for Kendall correlation".to_string(),
            ));
        }

        let self_data = self.to_vec()?;
        let other_data = other.to_vec()?;

        // Pair classification tallies.
        let mut concordant = 0;
        let mut discordant = 0;
        let mut tied_x = 0;
        let mut tied_y = 0;
        let mut tied_xy = 0;

        for i in 0..n {
            for j in i + 1..n {
                let x1 =
                    TensorElement::to_f64(&self_data[i]).expect("f64 conversion should succeed");
                let x2 =
                    TensorElement::to_f64(&self_data[j]).expect("f64 conversion should succeed");
                let y1 =
                    TensorElement::to_f64(&other_data[i]).expect("f64 conversion should succeed");
                let y2 =
                    TensorElement::to_f64(&other_data[j]).expect("f64 conversion should succeed");

                let dx = x2 - x1;
                let dy = y2 - y1;

                // Classify the pair: tied (within EPSILON) in x/y/both,
                // concordant (dx and dy same sign), or discordant.
                if dx.abs() < f64::EPSILON && dy.abs() < f64::EPSILON {
                    tied_xy += 1;
                } else if dx.abs() < f64::EPSILON {
                    tied_x += 1;
                } else if dy.abs() < f64::EPSILON {
                    tied_y += 1;
                } else if dx * dy > 0.0 {
                    concordant += 1;
                } else {
                    discordant += 1;
                }
            }
        }

        // Denominator: all pairs minus every kind of tie.
        let total_pairs = n * (n - 1) / 2;
        let effective_pairs = total_pairs - tied_x - tied_y - tied_xy;

        // All pairs tied: define tau as 0 rather than dividing by zero.
        if effective_pairs == 0 {
            return Ok(<T as TensorElement>::from_f64(0.0).expect("f64 conversion should succeed"));
        }

        let tau = (concordant as f64 - discordant as f64) / effective_pairs as f64;
        Ok(<T as TensorElement>::from_f64(tau).expect("f64 conversion should succeed"))
    }
415
    /// Returns the 1-based average ranks of the flattened elements, in the
    /// original shape. Tied values (exact equality after f64 conversion)
    /// receive the mean of the ranks they span.
    fn rank(&self) -> Result<Self> {
        let data = self.to_vec()?;
        let n = data.len();

        // Pair each value with its original position so ranks can be
        // scattered back after sorting.
        let mut indexed_values: Vec<(usize, T)> =
            data.iter().enumerate().map(|(i, &val)| (i, val)).collect();

        // Sort ascending by value; incomparable values (e.g. NaN) compare Equal.
        indexed_values.sort_by(|a, b| {
            TensorElement::to_f64(&a.1)
                .expect("f64 conversion should succeed")
                .partial_cmp(&TensorElement::to_f64(&b.1).expect("f64 conversion should succeed"))
                .unwrap_or(std::cmp::Ordering::Equal)
        });

        let mut ranks = vec![T::default(); n];
        let mut i = 0;

        while i < n {
            // Find the run [i, j) of values equal to the value at i.
            let mut j = i;
            while j < n
                && TensorElement::to_f64(&indexed_values[j].1)
                    .expect("f64 conversion should succeed")
                    == TensorElement::to_f64(&indexed_values[i].1)
                        .expect("f64 conversion should succeed")
            {
                j += 1;
            }

            // The 1-based ranks i+1..=j average to (i + j + 1) / 2.
            let avg_rank = (i + j + 1) as f64 / 2.0;
            for k in i..j {
                ranks[indexed_values[k].0] = <T as TensorElement>::from_f64(avg_rank)
                    .expect("f64 conversion should succeed");
            }
            i = j;
        }

        Self::from_data(ranks, self.shape().dims().to_vec(), self.device)
    }
458
459 pub fn describe(&self) -> Result<StatSummary> {
461 let data = self.to_vec()?;
462 if data.is_empty() {
463 return Err(TorshError::InvalidArgument(
464 "Cannot compute statistics on empty tensor".to_string(),
465 ));
466 }
467
468 let count = data.len();
469 let values: Vec<f64> = data
470 .iter()
471 .map(|&x| TensorElement::to_f64(&x).expect("f64 conversion should succeed"))
472 .collect();
473
474 let sum: f64 = values.iter().sum();
476 let mean = sum / count as f64;
477
478 let variance =
479 values.iter().map(|&x| (x - mean).powi(2)).sum::<f64>() / (count - 1).max(1) as f64;
480 let std = variance.sqrt();
481
482 let min = values.iter().fold(f64::INFINITY, |a, &b| a.min(b));
483 let max = values.iter().fold(f64::NEG_INFINITY, |a, &b| a.max(b));
484
485 let mut sorted_values = values.clone();
487 sorted_values.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
488
489 let q25 = percentile_sorted(&sorted_values, 25.0);
490 let q50 = percentile_sorted(&sorted_values, 50.0);
491 let q75 = percentile_sorted(&sorted_values, 75.0);
492
493 Ok(StatSummary {
494 count,
495 mean,
496 std,
497 min,
498 max,
499 q25,
500 q50,
501 q75,
502 })
503 }
504
    /// Covariance matrix of a 2-D tensor laid out as `(n_samples,
    /// n_features)`; returns an `(n_features, n_features)` matrix computed as
    /// `centered^T . centered / divisor`.
    ///
    /// NOTE(review): the data are centered with the single global mean
    /// (`mean(None).item()`), not per-column feature means; the textbook
    /// covariance definition subtracts each feature's own mean — confirm
    /// whether this is intentional.
    ///
    /// # Errors
    /// - non-2-D input
    /// - fewer than 2 samples
    pub fn cov(&self, mode: StatMode) -> Result<Self> {
        let shape = self.shape();
        if shape.ndim() != 2 {
            return Err(TorshError::dimension_error(
                "Covariance matrix requires 2D tensor",
                "covariance computation",
            ));
        }

        let (n_samples, _n_features) = (shape.dims()[0], shape.dims()[1]);
        if n_samples < 2 {
            return Err(TorshError::InvalidArgument(
                "Need at least 2 samples for covariance".to_string(),
            ));
        }

        // Center all entries by the scalar global mean (see NOTE above).
        let global_mean = self.mean(None, false)?;
        let mean_value = global_mean.item()?;
        let centered = self.sub_scalar(mean_value)?;

        // Gram matrix of the centered data: X^T X.
        let centered_t = centered.transpose(1, 0)?;
        let cov_unnormalized = centered_t.matmul(&centered)?;

        // `Sample` applies Bessel's correction; n_samples >= 2 is guaranteed
        // by the check above, so the subtraction cannot underflow.
        let divisor = match mode {
            StatMode::Population => n_samples,
            StatMode::Sample => n_samples - 1,
        };

        cov_unnormalized.div_scalar(
            <T as num_traits::FromPrimitive>::from_usize(divisor)
                .unwrap_or_else(|| <T as num_traits::One>::one()),
        )
    }
543
    /// Correlation-coefficient matrix: the sample covariance matrix
    /// normalized by the per-feature standard deviations. Entries whose
    /// standard deviation is (near-)zero are set to 0.0 instead of dividing.
    pub fn corrcoef(&self) -> Result<Self> {
        let cov_matrix = self.cov(StatMode::Sample)?;
        let cov_data = cov_matrix.to_vec()?;
        let n_features = cov_matrix.shape().dims()[0];

        // Standard deviations from the diagonal of the covariance matrix.
        let mut std_devs = Vec::with_capacity(n_features);
        for i in 0..n_features {
            let variance = TensorElement::to_f64(&cov_data[i * n_features + i])
                .expect("f64 conversion should succeed");
            std_devs.push(variance.sqrt());
        }

        // corr[i][j] = cov[i][j] / (std[i] * std[j]), guarding degenerate rows.
        let mut corr_data = Vec::with_capacity(cov_data.len());
        for i in 0..n_features {
            for j in 0..n_features {
                let cov_val = TensorElement::to_f64(&cov_data[i * n_features + j])
                    .expect("f64 conversion should succeed");
                let corr_val = if std_devs[i] > f64::EPSILON && std_devs[j] > f64::EPSILON {
                    cov_val / (std_devs[i] * std_devs[j])
                } else {
                    0.0
                };
                corr_data.push(
                    <T as TensorElement>::from_f64(corr_val)
                        .expect("f64 conversion should succeed"),
                );
            }
        }

        Self::from_data(corr_data, vec![n_features, n_features], self.device)
    }
578}
579
/// Linear-interpolation percentile of an ascending-sorted slice.
///
/// `q` is given in percent (0–100). Returns 0.0 for an empty slice.
fn percentile_sorted(sorted_values: &[f64], q: f64) -> f64 {
    let n = sorted_values.len();
    if n == 0 {
        return 0.0;
    }

    // Fractional index of the requested percentile.
    let pos = q / 100.0 * (n - 1) as f64;
    let lo = pos.floor() as usize;
    let hi = (pos.ceil() as usize).min(n - 1);

    if lo == hi {
        return sorted_values[lo];
    }

    // Interpolate between the two neighbouring order statistics.
    let w = pos - pos.floor();
    sorted_values[lo] * (1.0 - w) + sorted_values[hi] * w
}
597
598#[cfg(test)]
599mod tests {
600 use super::*;
601 use torsh_core::device::DeviceType;
602
    #[test]
    fn test_basic_statistics() {
        // For 1..=5: mean = 3, sample variance = 2.5, std = sqrt(2.5).
        let data = vec![1.0, 2.0, 3.0, 4.0, 5.0];
        let tensor = Tensor::from_data(data, vec![5], DeviceType::Cpu)
            .expect("tensor creation should succeed");

        let mean = tensor.mean(None, false).expect("mean should succeed");
        assert!(
            (mean.to_vec().expect("to_vec conversion should succeed")[0] - 3.0_f32).abs()
                < 1e-6_f32
        );

        let var_sample = tensor
            .var(None, false, StatMode::Sample)
            .expect("variance should succeed");
        assert!(
            (var_sample
                .to_vec()
                .expect("to_vec conversion should succeed")[0]
                - 2.5_f32)
                .abs()
                < 1e-6_f32
        );

        let std_sample = tensor
            .std(None, false, StatMode::Sample)
            .expect("std should succeed");
        assert!(
            (std_sample
                .to_vec()
                .expect("to_vec conversion should succeed")[0]
                - 2.5_f32.sqrt())
            .abs()
                < 1e-6
        );
    }
639
    #[test]
    fn test_percentiles() {
        // For 1..=10 with linear interpolation: median 5.5, q25 3.25, q75 7.75.
        let data = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0];
        let tensor = Tensor::from_data(data, vec![10], DeviceType::Cpu)
            .expect("tensor creation should succeed");

        let median = tensor.median(None, false).expect("median should succeed");
        assert!(
            (median.to_vec().expect("to_vec conversion should succeed")[0] - 5.5_f32).abs()
                < 1e-6_f32
        );

        let q25 = tensor
            .percentile(25.0, None, false)
            .expect("percentile should succeed");
        assert!(
            (q25.to_vec().expect("to_vec conversion should succeed")[0] - 3.25_f32).abs()
                < 1e-6_f32
        );

        let q75 = tensor
            .percentile(75.0, None, false)
            .expect("percentile should succeed");
        assert!(
            (q75.to_vec().expect("to_vec conversion should succeed")[0] - 7.75_f32).abs()
                < 1e-6_f32
        );
    }
668
    #[test]
    fn test_histogram() {
        // 9 values over a fixed [1, 5] range split into 5 bins.
        let data = vec![1.0, 2.0, 2.0, 3.0, 3.0, 3.0, 4.0, 4.0, 5.0];
        let tensor = Tensor::from_data(data, vec![9], DeviceType::Cpu)
            .expect("tensor creation should succeed");

        let config = HistogramConfig {
            bins: 5,
            min_val: Some(1.0),
            max_val: Some(5.0),
            include_outliers: true,
        };

        // Structural invariants: all values counted, edges = bins + 1.
        let hist = tensor.histogram(&config).expect("histogram should succeed");
        assert_eq!(hist.total_count, 9);
        assert_eq!(hist.counts.len(), 5);
        assert_eq!(hist.edges.len(), 6);
    }
687
    #[test]
    fn test_correlation() {
        // y = 2x is perfectly linearly correlated with x (Pearson r = 1).
        let x_data = vec![1.0, 2.0, 3.0, 4.0, 5.0];
        let y_data = vec![2.0, 4.0, 6.0, 8.0, 10.0];
        let x = Tensor::from_data(x_data, vec![5], DeviceType::Cpu)
            .expect("tensor creation should succeed");
        let y = Tensor::from_data(y_data, vec![5], DeviceType::Cpu)
            .expect("tensor creation should succeed");

        let corr = x
            .correlation(&y, CorrelationMethod::Pearson)
            .expect("correlation should succeed");
        assert!((corr - 1.0_f32).abs() < 1e-6_f32);
    }
703
    #[test]
    fn test_statistical_summary() {
        // For 1..=10: mean and median are both 5.5, extremes 1 and 10.
        let data = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0];
        let tensor = Tensor::from_data(data, vec![10], DeviceType::Cpu)
            .expect("tensor creation should succeed");

        let summary = tensor.describe().expect("describe should succeed");
        assert_eq!(summary.count, 10);
        assert!((summary.mean - 5.5).abs() < 1e-6);
        assert!((summary.q50 - 5.5).abs() < 1e-6);
        assert_eq!(summary.min, 1.0);
        assert_eq!(summary.max, 10.0);
    }
717
    #[test]
    fn test_covariance_matrix() {
        // 4 samples x 2 features; the second feature is twice the first.
        let data = vec![1.0, 2.0, 2.0, 4.0, 3.0, 6.0, 4.0, 8.0];
        let tensor = Tensor::from_data(data, vec![4, 2], DeviceType::Cpu)
            .expect("tensor creation should succeed");

        let cov_matrix = tensor
            .cov(StatMode::Sample)
            .expect("covariance should succeed");
        assert_eq!(cov_matrix.shape().dims(), &[2, 2]);

        // A covariance matrix must be symmetric: cov[0][1] == cov[1][0].
        let cov_data = cov_matrix
            .to_vec()
            .expect("to_vec conversion should succeed");
        assert!((cov_data[1] as f64 - cov_data[2] as f64).abs() < 1e-6);
    }
738
    #[test]
    fn test_ranks() {
        // Includes a tie (two 2.0s) to exercise average ranking.
        let data = vec![3.0, 1.0, 4.0, 2.0, 2.0];
        let tensor = Tensor::from_data(data, vec![5], DeviceType::Cpu)
            .expect("tensor creation should succeed");

        let ranks = tensor.rank().expect("rank should be available");
        let rank_data = ranks.to_vec().expect("to_vec conversion should succeed");

        // Every rank must lie within [1, n].
        for &rank in rank_data.iter() {
            assert!((1.0_f32..=5.0_f32).contains(&rank));
        }
    }
753}