Crate scirs2_stats

Source
Expand description

Statistical functions module

This module provides implementations of various statistical algorithms, modeled after SciPy’s stats module.

§Overview

  • Descriptive statistics

    • Basic statistics (mean, median, variance, etc.)
    • Advanced statistics (skewness, kurtosis, moments)
    • Correlation measures (Pearson, Spearman, Kendall tau, partial correlation)
    • Dispersion measures (mean absolute deviation, median absolute deviation, IQR, range, coefficient of variation)
  • Statistical distributions

    • Normal distribution
    • Uniform distribution
    • Student’s t distribution
    • Chi-square distribution
    • F distribution
    • Poisson distribution
    • Gamma distribution
    • Beta distribution
    • Exponential distribution
    • Hypergeometric distribution
    • Laplace distribution
    • Logistic distribution
    • Cauchy distribution
    • Pareto distribution
    • Weibull distribution
    • Multivariate distributions (multivariate normal, multivariate t, Dirichlet, Wishart, etc.)
  • Statistical tests

    • Parametric tests (t-tests, ANOVA)
    • Non-parametric tests (Mann-Whitney U, Wilcoxon signed-rank, Kruskal-Wallis, Friedman)
    • Normality tests (Shapiro-Wilk, Anderson-Darling, D’Agostino’s K²)
    • Goodness-of-fit tests (Chi-square)
  • Random number generation

  • Regression models (linear, regularized, robust)

  • Contingency table functions

  • Masked array statistics

  • Quasi-Monte Carlo

  • Statistical sampling

§Examples

§Descriptive Statistics

use ndarray::array;
use scirs2_stats::{mean, median, std, var, skew, kurtosis};

let data = array![1.0, 2.0, 3.0, 4.0, 5.0];

// Calculate basic statistics
let mean_val = mean(&data.view()).unwrap();
let median_val = median(&data.view()).unwrap();
let var_val = var(&data.view(), 0).unwrap();  // ddof = 0 for population variance
let std_val = std(&data.view(), 0).unwrap();  // ddof = 0 for population standard deviation

// Advanced statistics
let skewness = skew(&data.view(), false).unwrap();  // bias = false
let kurt = kurtosis(&data.view(), true, false).unwrap();  // fisher = true, bias = false

§Correlation Measures

use ndarray::{array, Array2};
use scirs2_stats::{pearson_r, pearsonr, spearman_r, kendall_tau, corrcoef};

let x = array![1.0, 2.0, 3.0, 4.0, 5.0];
let y = array![5.0, 4.0, 3.0, 2.0, 1.0];

// Calculate Pearson correlation coefficient (linear correlation)
let r = pearson_r(&x.view(), &y.view()).unwrap();
println!("Pearson correlation: {}", r);  // Should be -1.0 (perfect negative correlation)

// Calculate Pearson correlation with p-value
let (r, p) = pearsonr(&x.view(), &y.view(), "two-sided").unwrap();
println!("Pearson correlation: {}, p-value: {}", r, p);

// Spearman rank correlation (monotonic relationship)
let rho = spearman_r(&x.view(), &y.view()).unwrap();
println!("Spearman correlation: {}", rho);

// Kendall tau rank correlation
let tau = kendall_tau(&x.view(), &y.view(), "b").unwrap();
println!("Kendall tau correlation: {}", tau);

// Correlation matrix for multiple variables
let data = array![
    [1.0, 5.0, 10.0],
    [2.0, 4.0, 9.0],
    [3.0, 3.0, 8.0],
    [4.0, 2.0, 7.0],
    [5.0, 1.0, 6.0]
];

let corr_matrix = corrcoef(&data.view(), "pearson").unwrap();
println!("Correlation matrix:\n{:?}", corr_matrix);

§Dispersion Measures

use ndarray::array;
use scirs2_stats::{
    mean_abs_deviation, median_abs_deviation, iqr, data_range, coef_variation
};

let data = array![1.0, 2.0, 3.0, 4.0, 5.0, 100.0];  // Note the outlier

// Mean absolute deviation (from mean)
let mad = mean_abs_deviation(&data.view(), None).unwrap();
println!("Mean absolute deviation: {}", mad);

// Median absolute deviation (robust to outliers)
let median_ad = median_abs_deviation(&data.view(), None, None).unwrap();
println!("Median absolute deviation: {}", median_ad);

// Scaled median absolute deviation (consistent with std dev for normal distributions)
let median_ad_scaled = median_abs_deviation(&data.view(), None, Some(1.4826)).unwrap();
println!("Scaled median absolute deviation: {}", median_ad_scaled);

// Interquartile range (Q3 - Q1)
let iqr_val = iqr(&data.view(), None).unwrap();
println!("Interquartile range: {}", iqr_val);

// Range (max - min)
let range_val = data_range(&data.view()).unwrap();
println!("Range: {}", range_val);

// Coefficient of variation (std/mean, unitless measure)
let cv = coef_variation(&data.view(), 1).unwrap();
println!("Coefficient of variation: {}", cv);

§Statistical Distributions

use scirs2_stats::distributions;

// Normal distribution
let normal = distributions::norm(0.0f64, 1.0).unwrap();
let pdf = normal.pdf(0.0);
let cdf = normal.cdf(1.96);
let samples = normal.rvs(100).unwrap();

// Poisson distribution
let poisson = distributions::poisson(3.0f64, 0.0).unwrap();
let pmf = poisson.pmf(2.0);
let cdf = poisson.cdf(4.0);
let samples = poisson.rvs(100).unwrap();

// Gamma distribution
let gamma = distributions::gamma(2.0f64, 1.0, 0.0).unwrap();
let pdf = gamma.pdf(1.0);
let cdf = gamma.cdf(2.0);
let samples = gamma.rvs(100).unwrap();

// Beta distribution
let beta = distributions::beta(2.0f64, 3.0, 0.0, 1.0).unwrap();
let pdf = beta.pdf(0.5);
let samples = beta.rvs(100).unwrap();

// Exponential distribution
let exp = distributions::expon(1.0f64, 0.0).unwrap();
let pdf = exp.pdf(1.0);
let mean = exp.mean(); // Should be 1.0

// Multivariate normal distribution
use ndarray::array;
let mvn_mean = array![0.0, 0.0];
let mvn_cov = array![[1.0, 0.5], [0.5, 2.0]];
let mvn = distributions::multivariate::multivariate_normal(mvn_mean, mvn_cov).unwrap();
let pdf = mvn.pdf(&array![0.0, 0.0]);
let samples = mvn.rvs(100).unwrap();

§Statistical Tests

use ndarray::{array, Array2};
use scirs2_stats::{
    ttest_1samp, ttest_ind, ttest_rel, kstest, shapiro, mann_whitney,
    shapiro_wilk, anderson_darling, dagostino_k2, wilcoxon, kruskal_wallis, friedman,
    ks_2samp, distributions, Alternative
};

// One-sample t-test (we'll use a larger sample for normality tests)
let data = array![
    5.1, 4.9, 6.2, 5.7, 5.5, 5.1, 5.2, 5.0, 5.3, 5.4,
    5.6, 5.8, 5.9, 6.0, 5.2, 5.4, 5.3, 5.1, 5.2, 5.0
];
let result = ttest_1samp(&data.view(), 5.0, Alternative::TwoSided, "propagate").unwrap();
let t_stat = result.statistic;
let p_value = result.pvalue;
println!("One-sample t-test: t={}, p={}", t_stat, p_value);

// Two-sample t-test
let group1 = array![5.1, 4.9, 6.2, 5.7, 5.5];
let group2 = array![4.8, 5.2, 5.1, 4.7, 4.9];
let result = ttest_ind(&group1.view(), &group2.view(), true, Alternative::TwoSided, "propagate").unwrap();
let t_stat = result.statistic;
let p_value = result.pvalue;
println!("Two-sample t-test: t={}, p={}", t_stat, p_value);

// Normality tests
let (w_stat, p_value) = shapiro(&data.view()).unwrap();
println!("Shapiro-Wilk test: W={}, p={}", w_stat, p_value);

// More accurate Shapiro-Wilk test implementation
let (w_stat, p_value) = shapiro_wilk(&data.view()).unwrap();
println!("Improved Shapiro-Wilk test: W={}, p={}", w_stat, p_value);

// Anderson-Darling test for normality
let (a2_stat, p_value) = anderson_darling(&data.view()).unwrap();
println!("Anderson-Darling test: A²={}, p={}", a2_stat, p_value);

// D'Agostino's K² test combining skewness and kurtosis
let (k2_stat, p_value) = dagostino_k2(&data.view()).unwrap();
println!("D'Agostino K² test: K²={}, p={}", k2_stat, p_value);

// Non-parametric tests

// Wilcoxon signed-rank test (paired samples)
let before = array![125.0, 115.0, 130.0, 140.0, 140.0];
let after = array![110.0, 122.0, 125.0, 120.0, 140.0];
let (w, p_value) = wilcoxon(&before.view(), &after.view(), "wilcox", true).unwrap();
println!("Wilcoxon signed-rank test: W={}, p={}", w, p_value);

// Mann-Whitney U test (independent samples)
let males = array![19.0, 22.0, 16.0, 29.0, 24.0];
let females = array![20.0, 11.0, 17.0, 12.0];
let (u, p_value) = mann_whitney(&males.view(), &females.view(), "two-sided", true).unwrap();
println!("Mann-Whitney U test: U={}, p={}", u, p_value);

// Kruskal-Wallis test (unpaired samples)
let group1 = array![2.9, 3.0, 2.5, 2.6, 3.2];
let group2 = array![3.8, 3.7, 3.9, 4.0, 4.2];
let group3 = array![2.8, 3.4, 3.7, 2.2, 2.0];
let samples = vec![group1.view(), group2.view(), group3.view()];
let (h, p_value) = kruskal_wallis(&samples).unwrap();
println!("Kruskal-Wallis test: H={}, p={}", h, p_value);

// Friedman test (repeated measures)
let data = array![
    [7.0, 9.0, 8.0],
    [6.0, 5.0, 7.0],
    [9.0, 7.0, 6.0],
    [8.0, 5.0, 6.0]
];
let (chi2, p_value) = friedman(&data.view()).unwrap();
println!("Friedman test: Chi²={}, p={}", chi2, p_value);

// One-sample distribution fit test
let normal = distributions::norm(0.0f64, 1.0).unwrap();
let standardized_data = array![0.1, -0.2, 0.3, -0.1, 0.2];
let (ks_stat, p_value) = kstest(&standardized_data.view(), |x| normal.cdf(x)).unwrap();
println!("Kolmogorov-Smirnov one-sample test: D={}, p={}", ks_stat, p_value);

// Two-sample KS test
let sample1 = array![0.1, 0.2, 0.3, 0.4, 0.5];
let sample2 = array![0.6, 0.7, 0.8, 0.9, 1.0];
let (ks_stat, p_value) = ks_2samp(&sample1.view(), &sample2.view(), "two-sided").unwrap();
println!("Kolmogorov-Smirnov two-sample test: D={}, p={}", ks_stat, p_value);

§Random Number Generation

use scirs2_stats::random::{uniform, randn, randint, choice};
use ndarray::array;

// Generate uniform random numbers between 0 and 1
let uniform_samples = uniform(0.0, 1.0, 10, Some(42)).unwrap();

// Generate standard normal random numbers
let normal_samples = randn(10, Some(123)).unwrap();

// Generate random integers between 1 and 100
let int_samples = randint(1, 101, 5, Some(456)).unwrap();

// Randomly choose elements from an array
let options = array!["apple", "banana", "cherry", "date", "elderberry"];
let choices = choice(&options.view(), 3, false, None, Some(789)).unwrap();

§Statistical Sampling

use scirs2_stats::sampling;
use ndarray::array;

// Create an array
let data = array![1.0, 2.0, 3.0, 4.0, 5.0];

// Generate bootstrap samples
let bootstrap_samples = sampling::bootstrap(&data.view(), 10, Some(42)).unwrap();

// Generate a random permutation
let permutation = sampling::permutation(&data.view(), Some(123)).unwrap();

Re-exports§

pub use error::StatsError;
pub use error::StatsResult;
pub use tests::anova::one_way_anova;
pub use tests::anova::tukey_hsd;
pub use tests::chi2_test::chi2_gof;
pub use tests::chi2_test::chi2_independence;
pub use tests::chi2_test::chi2_yates;
pub use tests::nonparametric::mann_whitney as mannwhitneyu;
pub use tests::nonparametric::friedman;
pub use tests::nonparametric::kruskal_wallis;
pub use tests::nonparametric::mann_whitney;
pub use tests::nonparametric::wilcoxon;
pub use tests::normality::anderson_darling;
pub use tests::normality::dagostino_k2;
pub use tests::normality::ks_2samp;
pub use tests::normality::shapiro_wilk;
pub use tests::ttest::ttest_1samp;
pub use tests::ttest::ttest_ind;
pub use tests::ttest::ttest_ind_from_stats;
pub use tests::ttest::ttest_rel;
pub use tests::ttest::Alternative;
pub use tests::ttest::TTestResult;
pub use distribution_characteristics::cross_entropy;
pub use distribution_characteristics::entropy;
pub use distribution_characteristics::kl_divergence;
pub use distribution_characteristics::kurtosis_ci;
pub use distribution_characteristics::mode;
pub use distribution_characteristics::skewness_ci;
pub use distribution_characteristics::ConfidenceInterval;
pub use distribution_characteristics::Mode;
pub use distribution_characteristics::ModeMethod;
pub use regression::elastic_net;
pub use regression::group_lasso;
pub use regression::huber_regression;
pub use regression::lasso_regression;
pub use regression::linear_regression;
pub use regression::linregress;
pub use regression::multilinear_regression;
pub use regression::odr;
pub use regression::polyfit;
pub use regression::ransac;
pub use regression::ridge_regression;
pub use regression::stepwise_regression;
pub use regression::theilslopes;
pub use regression::HuberT;
pub use regression::RegressionResults;
pub use regression::StepwiseCriterion;
pub use regression::StepwiseDirection;
pub use regression::StepwiseResults;
pub use regression::TheilSlopesResult;
pub use tests::*;
pub use random::*;

Modules§

contingency
Contingency table functions
distribution_characteristics
Distribution characteristic statistics
distributions
Statistical distributions
error
Error types for the SciRS2 statistics module
mstats
Masked array statistics
qmc
Quasi-Monte Carlo
random
Random number generation
regression
Regression analysis module
sampling
Statistical sampling
tests
Statistical tests module
traits
Trait definitions for distributions and statistical objects

Enums§

QuantileInterpolation
Methods for interpolating quantiles

Functions§

boxplot_stats
Compute boxplot statistics for a dataset.
coef_variation
Compute the coefficient of variation (CV) of a dataset.
corrcoef
Compute a correlation matrix for a set of variables.
data_range
Compute the range of a dataset.
deciles
Compute the deciles of a dataset.
gini_coefficient
Compute the Gini coefficient of a dataset.
icc
Calculates the intraclass correlation coefficient (ICC) with confidence intervals.
iqr
Compute the interquartile range (IQR) of a dataset.
kendall_tau
Compute the Kendall tau correlation coefficient between two arrays.
kendallr
Calculates the Kendall tau rank correlation coefficient and p-value.
kurtosis
Compute the kurtosis of a data set.
mean
Compute the arithmetic mean of a data set.
mean_abs_deviation
Compute the mean absolute deviation (MAD) of a dataset.
median
Compute the median of a data set.
median_abs_deviation
Compute the median absolute deviation (MAD) of a dataset.
moment
Compute the moment of a distribution.
partial_corr
Compute the partial correlation coefficient between two variables, controlling for one or more additional variables.
partial_corrr
Calculates the partial correlation coefficient and p-value.
pearson_r
Compute the Pearson correlation coefficient between two arrays.
pearsonr
Calculates the Pearson correlation coefficient and p-value for testing non-correlation.
percentile
Compute the percentile of a dataset.
point_biserial
Compute the point-biserial correlation coefficient between a binary variable and a continuous variable.
point_biserialr
Calculates the point-biserial correlation coefficient and p-value.
quantile
Compute the quantile of a dataset.
quartiles
Compute the quartiles of a dataset.
quintiles
Compute the quintiles of a dataset.
skew
Compute the skewness of a data set.
spearman_r
Compute the Spearman rank correlation coefficient between two arrays.
spearmanr
Calculates the Spearman rank correlation coefficient and p-value.
std
Compute the standard deviation of a data set.
var
Compute the variance of a data set.
weighted_mean
Compute the weighted average of a data set.
winsorized_mean
Compute the winsorized mean of a dataset.
winsorized_variance
Compute the winsorized variance of a dataset.