pub mod categorical;
pub mod descriptive;
pub mod inference;
pub mod regression;
pub mod sampling;
pub mod distributions;
pub mod hypothesis;
pub mod nonparametric;
#[cfg(cuda_available)]
pub mod gpu;
use crate::dataframe::DataFrame;
use crate::error::{Error, PandRSError, Result};
use std::collections::HashMap;
use std::fmt::Debug;
use std::hash::Hash;
/// Eight-number descriptive summary of a numeric sample.
///
/// Produced by [`describe`]; values are computed by
/// `advanced_descriptive::describe` (see that module for exact conventions,
/// e.g. quartile interpolation and the std denominator).
#[derive(Debug, Clone)]
pub struct DescriptiveStats {
    /// Number of observations.
    pub count: usize,
    /// Arithmetic mean.
    pub mean: f64,
    /// Standard deviation (sample vs. population convention is defined by
    /// the `descriptive` module — confirm there).
    pub std: f64,
    /// Minimum value.
    pub min: f64,
    /// First quartile (25th percentile).
    pub q1: f64,
    /// Median (50th percentile).
    pub median: f64,
    /// Third quartile (75th percentile).
    pub q3: f64,
    /// Maximum value.
    pub max: f64,
}
/// Result of a two-sample t-test (see [`ttest`]).
#[derive(Debug, Clone)]
pub struct TTestResult {
    /// The t statistic.
    pub statistic: f64,
    /// P-value of the test (sidedness is determined by
    /// `inference::ttest_impl` — confirm there).
    pub pvalue: f64,
    /// Whether the null hypothesis is rejected at the caller-supplied
    /// significance level (presumably `pvalue < alpha`; set by the impl).
    pub significant: bool,
    /// Degrees of freedom used by the test.
    pub df: usize,
}
/// Output of [`linear_regression`].
#[derive(Debug, Clone)]
pub struct LinearRegressionResult {
    /// Estimated intercept term.
    pub intercept: f64,
    /// One slope coefficient per explanatory column (presumably in the
    /// order the `x_columns` were supplied — confirm in `regression`).
    pub coefficients: Vec<f64>,
    /// Coefficient of determination (R²).
    pub r_squared: f64,
    /// R² adjusted for the number of predictors.
    pub adj_r_squared: f64,
    /// Per-coefficient p-values (ordering relative to the intercept is
    /// defined by `regression::linear_regression_impl` — confirm there).
    pub p_values: Vec<f64>,
    /// Model prediction for each observation.
    pub fitted_values: Vec<f64>,
    /// Observed minus fitted value for each observation.
    pub residuals: Vec<f64>,
}
/// Output of a one-way ANOVA (see [`anova`] and
/// [`categorical_anova_from_df`]).
#[derive(Debug, Clone)]
pub struct AnovaResult {
    /// The F statistic.
    pub f_statistic: f64,
    /// P-value of the F test.
    pub p_value: f64,
    /// Sum of squares between groups.
    pub ss_between: f64,
    /// Sum of squares within groups.
    pub ss_within: f64,
    /// Total sum of squares.
    pub ss_total: f64,
    /// Degrees of freedom between groups.
    pub df_between: usize,
    /// Degrees of freedom within groups.
    pub df_within: usize,
    /// Total degrees of freedom.
    pub df_total: usize,
    /// Mean square between groups.
    pub ms_between: f64,
    /// Mean square within groups.
    pub ms_within: f64,
    /// Whether the null hypothesis is rejected at the caller-supplied
    /// significance level (presumably `p_value < alpha`; set by the impl).
    pub significant: bool,
}
/// Result of a Mann-Whitney U rank-sum test (see [`mann_whitney_u`]).
#[derive(Debug, Clone)]
pub struct MannWhitneyResult {
    /// The U statistic.
    pub u_statistic: f64,
    /// P-value of the test.
    pub p_value: f64,
    /// Whether the null hypothesis is rejected at the caller-supplied
    /// significance level (presumably `p_value < alpha`; set by the impl).
    pub significant: bool,
}
/// Result of a chi-square test (see [`chi_square_test`] and
/// [`chi_square_independence`]).
#[derive(Debug, Clone)]
pub struct ChiSquareResult {
    /// The chi-square statistic.
    pub chi2_statistic: f64,
    /// P-value of the test.
    pub p_value: f64,
    /// Degrees of freedom.
    pub df: usize,
    /// Whether the null hypothesis is rejected at the caller-supplied
    /// significance level (presumably `p_value < alpha`; set by the impl).
    pub significant: bool,
    /// Expected cell frequencies under the null hypothesis — presumably the
    /// same row/column shape as the observed table; confirm in `inference`.
    pub expected_freq: Vec<Vec<f64>>,
}
/// Computes an eight-number descriptive summary of `data`.
///
/// Delegates the computation to `advanced_descriptive::describe` and
/// repackages the richer summary into the flat [`DescriptiveStats`] struct.
///
/// # Errors
/// Propagates any error from `advanced_descriptive::describe`
/// (e.g. on empty input).
pub fn describe<T: AsRef<[f64]>>(data: T) -> Result<DescriptiveStats> {
    let stats = advanced_descriptive::describe(data.as_ref())?;
    let quartiles = &stats.quartiles;
    Ok(DescriptiveStats {
        count: stats.count,
        mean: stats.mean,
        std: stats.std,
        min: stats.min,
        q1: quartiles.q1,
        median: stats.median,
        q3: quartiles.q3,
        max: stats.max,
    })
}
/// Pearson product-moment correlation between two samples.
///
/// Thin wrapper over `advanced_descriptive::pearson_correlation`; see that
/// function for length/emptiness requirements and error conditions.
pub fn correlation<T: AsRef<[f64]>, U: AsRef<[f64]>>(x: T, y: U) -> Result<f64> {
    let (xs, ys) = (x.as_ref(), y.as_ref());
    advanced_descriptive::pearson_correlation(xs, ys)
}
pub fn covariance<T: AsRef<[f64]>, U: AsRef<[f64]>>(x: T, y: U) -> Result<f64> {
let data1 = x.as_ref();
let data2 = y.as_ref();
if data1.len() != data2.len() {
return Err(Error::DimensionMismatch(
"Arrays must have the same length".into(),
));
}
if data1.is_empty() {
return Err(Error::EmptyData(
"Cannot calculate covariance of empty arrays".into(),
));
}
let n = data1.len() as f64;
let mean1 = data1.iter().sum::<f64>() / n;
let mean2 = data2.iter().sum::<f64>() / n;
let cov = data1
.iter()
.zip(data2.iter())
.map(|(&x, &y)| (x - mean1) * (y - mean2))
.sum::<f64>()
/ (n - 1.0);
Ok(cov)
}
/// Two-sample t-test between `sample1` and `sample2` at significance level
/// `alpha`.
///
/// With `equal_var = true` a pooled-variance (Student's) test is requested;
/// with `false`, the unequal-variance (Welch) form. The actual computation
/// lives in `inference::ttest_impl`.
pub fn ttest<T: AsRef<[f64]>, U: AsRef<[f64]>>(
    sample1: T,
    sample2: U,
    alpha: f64,
    equal_var: bool,
) -> Result<TTestResult> {
    let a = sample1.as_ref();
    let b = sample2.as_ref();
    inference::ttest_impl(a, b, alpha, equal_var)
}
/// Fits a linear regression of `y_column` on the columns named in
/// `x_columns` (presumably ordinary least squares — see
/// `regression::linear_regression_impl`).
///
/// # Errors
/// Propagates any error from the underlying implementation (e.g. missing
/// columns).
pub fn linear_regression(
    df: &DataFrame,
    y_column: &str,
    x_columns: &[&str],
) -> Result<LinearRegressionResult> {
    regression::linear_regression_impl(df, y_column, x_columns)
}
/// Randomly samples a `fraction` of the DataFrame's rows, with (`replace =
/// true`) or without replacement. Thin wrapper over
/// `sampling::sample_impl`; see it for the valid `fraction` range and RNG
/// behavior.
pub fn sample(df: &DataFrame, fraction: f64, replace: bool) -> Result<DataFrame> {
    sampling::sample_impl(df, fraction, replace)
}
/// Draws `n_samples` bootstrap resamples from `data`. Each inner `Vec` is
/// one resample (presumably the same length as `data` — see
/// `sampling::bootstrap_impl`).
pub fn bootstrap<T: AsRef<[f64]>>(data: T, n_samples: usize) -> Result<Vec<Vec<f64>>> {
    sampling::bootstrap_impl(data.as_ref(), n_samples)
}
/// One-way ANOVA across the named `groups` at significance level `alpha`.
///
/// # Errors
/// Returns `Error::InsufficientData` when fewer than two groups are given;
/// otherwise propagates errors from `inference::anova_impl`.
pub fn anova<T: AsRef<[f64]>>(groups: &HashMap<&str, T>, alpha: f64) -> Result<AnovaResult> {
    if groups.len() < 2 {
        return Err(Error::InsufficientData(
            "At least 2 groups are needed for ANOVA".into(),
        ));
    }
    // Borrow every group's data as a plain slice before delegating.
    let as_slices: HashMap<&str, &[f64]> = groups
        .iter()
        .map(|(&name, values)| (name, values.as_ref()))
        .collect();
    inference::anova_impl(&as_slices, alpha)
}
/// Mann-Whitney U rank-sum test between two independent samples at
/// significance level `alpha`. The computation lives in
/// `inference::mann_whitney_u_impl`.
pub fn mann_whitney_u<T: AsRef<[f64]>, U: AsRef<[f64]>>(
    sample1: T,
    sample2: U,
    alpha: f64,
) -> Result<MannWhitneyResult> {
    let (a, b) = (sample1.as_ref(), sample2.as_ref());
    inference::mann_whitney_u_impl(a, b, alpha)
}
/// Chi-square test on an observed frequency table (`observed[row][col]`) at
/// significance level `alpha`. Thin wrapper over
/// `inference::chi_square_test_impl`.
pub fn chi_square_test(observed: &[Vec<f64>], alpha: f64) -> Result<ChiSquareResult> {
    inference::chi_square_test_impl(observed, alpha)
}
pub use categorical::ContingencyTable;
/// Builds a two-way contingency table (cross-tabulation) from two DataFrame
/// columns. Thin wrapper over `categorical::dataframe_contingency_table`.
pub fn contingency_table_from_df(
    df: &DataFrame,
    col1: &str,
    col2: &str,
) -> Result<ContingencyTable> {
    categorical::dataframe_contingency_table(df, col1, col2)
}
/// Chi-square test of independence between two categorical columns at
/// significance level `alpha`. Thin wrapper over
/// `categorical::dataframe_chi_square_test`.
pub fn chi_square_independence(
    df: &DataFrame,
    col1: &str,
    col2: &str,
    alpha: f64,
) -> Result<ChiSquareResult> {
    categorical::dataframe_chi_square_test(df, col1, col2, alpha)
}
/// Cramér's V measure of association between two categorical columns.
/// Thin wrapper over `categorical::dataframe_cramers_v`.
pub fn cramers_v_from_df(df: &DataFrame, col1: &str, col2: &str) -> Result<f64> {
    categorical::dataframe_cramers_v(df, col1, col2)
}
/// One-way ANOVA of `numeric_col` grouped by the categories of `cat_col`,
/// at significance level `alpha`. Thin wrapper over
/// `categorical::dataframe_categorical_anova`.
pub fn categorical_anova_from_df(
    df: &DataFrame,
    cat_col: &str,
    numeric_col: &str,
    alpha: f64,
) -> Result<AnovaResult> {
    categorical::dataframe_categorical_anova(df, cat_col, numeric_col, alpha)
}
/// Normalized mutual information between two categorical columns (the
/// normalization convention is defined by
/// `categorical::dataframe_normalized_mutual_information` — confirm there).
pub fn normalized_mutual_info(df: &DataFrame, col1: &str, col2: &str) -> Result<f64> {
    categorical::dataframe_normalized_mutual_information(df, col1, col2)
}
pub use advanced_descriptive::correlation_matrix;
pub use regression::linear_regression as simple_linear_regression;
pub use sampling::stratified_sample_impl as stratified_sample;
pub use categorical::entropy;
pub use categorical::frequency_distribution;
pub use categorical::mode;
#[cfg(cuda_available)]
pub use gpu::{
correlation_matrix as gpu_correlation_matrix, covariance_matrix as gpu_covariance_matrix,
describe_gpu, feature_importance, kmeans, linear_regression as gpu_linear_regression, pca,
};
pub use distributions::{
Binomial, ChiSquared, Distribution, FDistribution, Normal, Poisson, StandardNormal,
TDistribution,
};
pub use hypothesis::{
adjust_p_values, chi_square_independence as chi_square_test_independence, correlation_test,
independent_ttest, one_sample_ttest, one_way_anova, paired_ttest, shapiro_wilk_test,
AlternativeHypothesis, EffectSize, MultipleComparisonCorrection,
TestResult as HypothesisTestResult,
};
pub use nonparametric::{
bootstrap_confidence_interval, friedman_test, kruskal_wallis_test, ks_two_sample_test,
mann_whitney_u_test as mann_whitney_u_advanced, permutation_test, runs_test,
wilcoxon_signed_rank_test,
};
pub use descriptive as advanced_descriptive;
/// Stateless facade exposing column-oriented statistical analyses over a
/// [`DataFrame`]: summaries, correlations, hypothesis tests, correlation
/// matrices, and outlier detection.
pub struct StatisticalAnalyzer;

// A no-argument `new()` should be accompanied by `Default`
// (clippy::new_without_default).
impl Default for StatisticalAnalyzer {
    fn default() -> Self {
        Self::new()
    }
}

impl StatisticalAnalyzer {
    /// Creates a new analyzer. The type is stateless, so this is free.
    pub fn new() -> Self {
        StatisticalAnalyzer
    }

    /// Computes a full statistical summary for one numeric column.
    ///
    /// # Errors
    /// Fails if the column cannot be fetched or converted to `f64`, or if
    /// the underlying `describe` rejects the data (e.g. an empty column).
    pub fn analyze_column(
        &self,
        df: &DataFrame,
        column_name: &str,
    ) -> Result<advanced_descriptive::StatisticalSummary> {
        let column = df.get_column::<f64>(column_name)?;
        let values = column.as_f64()?;
        advanced_descriptive::describe(&values)
    }

    /// Correlates two numeric columns using the requested estimator.
    pub fn correlate_columns(
        &self,
        df: &DataFrame,
        col1: &str,
        col2: &str,
        method: CorrelationMethod,
    ) -> Result<f64> {
        let series1 = df.get_column::<f64>(col1)?;
        let series2 = df.get_column::<f64>(col2)?;
        let values1 = series1.as_f64()?;
        let values2 = series2.as_f64()?;
        match method {
            CorrelationMethod::Pearson => {
                advanced_descriptive::pearson_correlation(&values1, &values2)
            }
            CorrelationMethod::Spearman => {
                advanced_descriptive::spearman_correlation(&values1, &values2)
            }
        }
    }

    /// Runs a two-sample hypothesis test between two numeric columns.
    ///
    /// `TTest` / `WelchTTest` both call `independent_ttest`, with the
    /// equal-variance flag set to `true` / `false` respectively.
    pub fn test_columns(
        &self,
        df: &DataFrame,
        col1: &str,
        col2: &str,
        test_type: HypothesisTestType,
        alternative: AlternativeHypothesis,
    ) -> Result<HypothesisTestResult> {
        let series1 = df.get_column::<f64>(col1)?;
        let series2 = df.get_column::<f64>(col2)?;
        let values1 = series1.as_f64()?;
        let values2 = series2.as_f64()?;
        match test_type {
            HypothesisTestType::TTest => independent_ttest(&values1, &values2, alternative, true),
            HypothesisTestType::WelchTTest => {
                independent_ttest(&values1, &values2, alternative, false)
            }
            HypothesisTestType::MannWhitneyU => {
                mann_whitney_u_advanced(&values1, &values2, alternative)
            }
            HypothesisTestType::KolmogorovSmirnov => {
                ks_two_sample_test(&values1, &values2, alternative)
            }
        }
    }

    /// Grouped ANOVA over a value column split by a group column.
    ///
    /// Currently unimplemented: always returns `Error::NotImplemented`
    /// (disabled pending Series API changes).
    pub fn anova_by_group(
        &self,
        _df: &DataFrame,
        _value_column: &str,
        _group_column: &str,
        _parametric: bool,
    ) -> Result<HypothesisTestResult> {
        Err(Error::NotImplemented(
            "ANOVA by group temporarily disabled due to Series API changes".into(),
        ))
    }

    /// Builds the pairwise correlation matrix for the named columns.
    ///
    /// For Spearman, the correlation is symmetric, so each off-diagonal
    /// pair is computed once and mirrored (the previous version computed
    /// every pair twice).
    pub fn correlation_matrix(
        &self,
        df: &DataFrame,
        columns: &[String],
        method: CorrelationMethod,
    ) -> Result<Vec<Vec<f64>>> {
        let mut data = Vec::with_capacity(columns.len());
        for column in columns {
            let series = df.get_column::<f64>(column)?;
            let values = series.as_f64()?;
            data.push(values.to_vec());
        }
        match method {
            CorrelationMethod::Pearson => advanced_descriptive::correlation_matrix(&data),
            CorrelationMethod::Spearman => {
                let n_vars = data.len();
                let mut corr_matrix = vec![vec![0.0; n_vars]; n_vars];
                for i in 0..n_vars {
                    corr_matrix[i][i] = 1.0;
                    for j in (i + 1)..n_vars {
                        let rho =
                            advanced_descriptive::spearman_correlation(&data[i], &data[j])?;
                        corr_matrix[i][j] = rho;
                        corr_matrix[j][i] = rho;
                    }
                }
                Ok(corr_matrix)
            }
        }
    }

    /// Returns the indices of outlying values in a numeric column.
    ///
    /// Detection rules per [`OutlierMethod`]:
    /// * `IQR` — Tukey's fences at `q1 - 1.5*IQR` / `q3 + 1.5*IQR`.
    /// * `ZScore` — `|z| > 3.0` using mean and std. Note: a zero std makes
    ///   every z-score NaN, so no outliers are reported in that case.
    /// * `ModifiedZScore` — `|0.6745 * (x - median) / MAD| > 3.5`; reports
    ///   none when the MAD is zero.
    pub fn detect_outliers(
        &self,
        df: &DataFrame,
        column_name: &str,
        method: OutlierMethod,
    ) -> Result<Vec<usize>> {
        let column = df.get_column::<f64>(column_name)?;
        let values = column.as_f64()?;
        let summary = advanced_descriptive::describe(&values)?;
        let mut outlier_indices = Vec::new();
        match method {
            OutlierMethod::IQR => {
                let iqr_lower = summary.quartiles.q1 - 1.5 * summary.iqr;
                let iqr_upper = summary.quartiles.q3 + 1.5 * summary.iqr;
                for (i, &value) in values.iter().enumerate() {
                    if value < iqr_lower || value > iqr_upper {
                        outlier_indices.push(i);
                    }
                }
            }
            OutlierMethod::ZScore => {
                for (i, &value) in values.iter().enumerate() {
                    let z_score = (value - summary.mean) / summary.std;
                    if z_score.abs() > 3.0 {
                        outlier_indices.push(i);
                    }
                }
            }
            OutlierMethod::ModifiedZScore => {
                let median = summary.median;
                // Median absolute deviation: the 50th percentile of the
                // sorted absolute deviations from the median.
                let mad = {
                    let mut deviations: Vec<f64> =
                        values.iter().map(|&x| (x - median).abs()).collect();
                    deviations
                        .sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
                    advanced_descriptive::percentile(&deviations, 50.0)?
                };
                if mad > 0.0 {
                    for (i, &value) in values.iter().enumerate() {
                        // 0.6745 ≈ Φ⁻¹(0.75): scales the MAD to be
                        // consistent with the std under normality.
                        let modified_z = 0.6745 * (value - median) / mad;
                        if modified_z.abs() > 3.5 {
                            outlier_indices.push(i);
                        }
                    }
                }
            }
        }
        Ok(outlier_indices)
    }
}
/// Correlation estimator selection for [`StatisticalAnalyzer`] methods.
#[derive(Debug, Clone)]
pub enum CorrelationMethod {
    /// Pearson product-moment correlation (linear association).
    Pearson,
    /// Spearman rank correlation (monotonic association).
    Spearman,
}
/// Two-sample hypothesis test selection for
/// `StatisticalAnalyzer::test_columns`.
#[derive(Debug, Clone)]
pub enum HypothesisTestType {
    /// Student's t-test (calls `independent_ttest` with `equal_var = true`).
    TTest,
    /// Welch's t-test (calls `independent_ttest` with `equal_var = false`).
    WelchTTest,
    /// Mann-Whitney U rank-sum test.
    MannWhitneyU,
    /// Two-sample Kolmogorov-Smirnov test.
    KolmogorovSmirnov,
}
/// Outlier-detection rule used by `StatisticalAnalyzer::detect_outliers`.
#[derive(Debug, Clone)]
pub enum OutlierMethod {
    /// Tukey's fences: flag values outside `[q1 - 1.5*IQR, q3 + 1.5*IQR]`.
    IQR,
    /// Flag values with `|(x - mean) / std| > 3.0`.
    ZScore,
    /// Flag values with `|0.6745 * (x - median) / MAD| > 3.5`.
    ModifiedZScore,
}