Skip to main content

single_statistics/testing/
mod.rs

1//! Statistical testing framework for single-cell data analysis.
2//!
3//! This module provides a comprehensive suite of statistical tests and methods specifically designed
4//! for single-cell RNA-seq data analysis. It includes parametric and non-parametric tests, multiple
5//! testing correction methods, and effect size calculations.
6//!
7//! ## Key Components
8//!
9//! - **Core Data Structures**: [`TestResult`] and [`MultipleTestResults`] for storing test outcomes
10//! - **Test Methods**: [`TestMethod`] enum defining available statistical tests
11//! - **Matrix Operations**: [`MatrixStatTests`] trait for running tests on sparse matrices
12//!
13//! ## Submodules
14//!
15//! - [`correction`]: Multiple testing correction methods (FDR, Bonferroni, etc.)
16//! - [`effect`]: Effect size calculations (Cohen's d, etc.)
17//! - [`inference`]: Core statistical inference implementations
18//! - [`utils`]: Utility functions for data preparation and validation
19//!
20//! ## Usage
21//!
22//! Use the `MatrixStatTests` trait on sparse matrices to perform differential expression
23//! analysis with various statistical methods and automatic multiple testing correction.
24
25use single_utilities::traits::FloatOps;
26use std::collections::HashMap;
27
28pub mod correction;
29pub mod effect;
30pub mod inference;
31
32pub mod utils;
33
34/// Statistical test methods available for differential expression analysis.
35///
36/// This enum defines the different statistical tests that can be applied to single-cell data.
37/// Each method has specific assumptions and use cases.
38#[derive(Debug, Clone, Copy)]
39pub enum TestMethod {
40    /// Student's or Welch's t-test for comparing means between two groups.
41    /// 
42    /// **Use when**: Data is approximately normal, comparing continuous expression values.
43    /// **Best for**: Most differential expression analyses in single-cell data.
44    TTest(TTestType),
45    
46    /// Mann-Whitney U test (Wilcoxon rank-sum test) for non-parametric comparison.
47    /// 
48    /// **Use when**: Data is not normally distributed, or you want a robust rank-based test.
49    /// **Best for**: Highly skewed expression data or small sample sizes.
50    MannWhitney,
51
52    /// Fisher's Exact test for comparing frequencies of expression.
53    /// 
54    /// **Use when**: Comparing proportions (e.g., cell type proportions or expression frequency).
55    /// **Best for**: Categorical data and binary (expressed/not expressed) comparisons.
56    FisherExact,
57    
58    /// Negative binomial test for count data with overdispersion.
59    /// 
60    /// **Use when**: Working with raw UMI counts and modeling overdispersion.
61    /// **Best for**: Count-based differential expression (like DESeq2/edgeR approach).
62    NegativeBinomial,
63    
64    /// Zero-inflated models for data with excess zeros.
65    /// 
66    /// **Use when**: High proportion of zero values (dropout) needs explicit modeling.
67    /// **Best for**: Single-cell data with significant technical dropout.
68    ZeroInflated,
69}
70
71/// Type of t-test to perform, differing in variance assumptions.
72#[derive(Debug, Clone, Copy)]
73pub enum TTestType {
74    /// Student's t-test assuming equal variances between groups.
75    /// 
76    /// **Use when**: Groups have similar variance (homoscedasticity).
77    /// **Faster** but less robust than Welch's t-test.
78    Student,
79    
80    /// Welch's t-test allowing unequal variances between groups.
81    /// 
82    /// **Use when**: Groups may have different variances (heteroscedasticity).
83    /// **Recommended** for most single-cell analyses as the default choice.
84    Welch,
85}
86
87/// Alternative hypothesis for statistical tests.
88#[derive(Debug, Clone, Copy)]
89pub enum Alternative {
90    /// Two-sided test: group means are not equal (μ₁ ≠ μ₂).
91    /// 
92    /// **Most common** choice for differential expression analysis.
93    TwoSided,
94    
95    /// One-sided test: group 1 mean is less than group 2 (μ₁ < μ₂).
96    /// 
97    /// **Use when**: You specifically want to test for downregulation.
98    Less,
99    
100    /// One-sided test: group 1 mean is greater than group 2 (μ₁ > μ₂).
101    /// 
102    /// **Use when**: You specifically want to test for upregulation.
103    Greater,
104}
105
106/// Result of a single statistical test.
107///
108/// Contains the test statistic, p-value, and optional additional information like effect sizes
109/// and confidence intervals. This structure is used for individual gene/feature tests.
110///
111
112#[derive(Debug, Clone)]
113pub struct TestResult<T> {
114    /// The test statistic value (e.g., t-statistic, U statistic)
115    pub statistic: T,
116    /// The p-value of the test
117    pub p_value: T,
118    /// Confidence interval for the effect size/difference (if available)
119    pub confidence_interval: Option<(T, T)>,
120    /// Degrees of freedom (for parametric tests)
121    pub degrees_of_freedom: Option<T>,
122    /// Effect size measurement (Cohen's d, etc.)
123    pub effect_size: Option<T>,
124    /// Standard error of the effect size or test statistic
125    pub standard_error: Option<T>,
126    /// Additional test-specific information
127    pub metadata: HashMap<String, T>,
128}
129
130impl<T> TestResult<T>
131where
132    T: FloatOps,
133{
134    /// Create a new test result with minimal information
135    pub fn new(statistic: T, p_value: T) -> Self {
136        TestResult {
137            statistic,
138            p_value,
139            confidence_interval: None,
140            degrees_of_freedom: None,
141            effect_size: None,
142            standard_error: None,
143            metadata: HashMap::new(),
144        }
145    }
146
147    /// Create a new test result with effect size
148    pub fn with_effect_size(statistic: T, p_value: T, effect_size: T) -> Self {
149        TestResult {
150            statistic,
151            p_value,
152            confidence_interval: None,
153            degrees_of_freedom: None,
154            effect_size: Some(effect_size),
155            standard_error: None,
156            metadata: HashMap::new(),
157        }
158    }
159
160    /// Add confidence interval to the result
161    pub fn with_confidence_interval(mut self, lower: T, upper: T) -> Self {
162        self.confidence_interval = Some((lower, upper));
163        self
164    }
165
166    /// Add degrees of freedom to the result
167    pub fn with_degrees_of_freedom(mut self, df: T) -> Self {
168        self.degrees_of_freedom = Some(df);
169        self
170    }
171
172    /// Add standard error to the result
173    pub fn with_standard_error(mut self, se: T) -> Self {
174        self.standard_error = Some(se);
175        self
176    }
177
178    /// Add additional metadata
179    pub fn with_metadata(mut self, key: &str, value: T) -> Self {
180        self.metadata.insert(key.to_string(), value);
181        self
182    }
183
184    /// Check if the result is statistically significant at the given threshold
185    pub fn is_significant(&self, alpha: T) -> bool {
186        self.p_value < alpha
187    }
188}
189
190/// Results from multiple statistical tests, typically for differential expression analysis.
191///
192/// This structure contains results from testing multiple features (genes) simultaneously,
193/// including multiple testing correction. It's the primary output of differential expression
194/// analysis workflows.
195///
196
197#[derive(Debug, Clone)]
198pub struct MultipleTestResults<T> {
199    /// Test statistics for each feature/gene
200    pub statistics: Vec<T>,
201    /// Raw (unadjusted) p-values
202    pub p_values: Vec<T>,
203    /// Adjusted p-values (after multiple testing correction)
204    pub adjusted_p_values: Option<Vec<T>>,
205    /// Effect sizes (if calculated)
206    pub effect_sizes: Option<Vec<T>>,
207    /// Confidence intervals (if calculated)
208    pub confidence_intervals: Option<Vec<(T, T)>>,
209    /// Feature-specific metadata
210    pub feature_metadata: Option<Vec<HashMap<String, T>>>,
211    /// Global metadata about the test
212    pub global_metadata: HashMap<String, String>,
213}
214
215impl<T> MultipleTestResults<T>
216where
217    T: FloatOps,
218{
219    /// Create a new results object from p-values
220    pub fn new(statistics: Vec<T>, p_values: Vec<T>) -> Self {
221        MultipleTestResults {
222            statistics,
223            p_values,
224            adjusted_p_values: None,
225            effect_sizes: None,
226            confidence_intervals: None,
227            feature_metadata: None,
228            global_metadata: HashMap::new(),
229        }
230    }
231
232    /// Add adjusted p-values to the results
233    pub fn with_adjusted_p_values(mut self, adjusted_p_values: Vec<T>) -> Self {
234        self.adjusted_p_values = Some(adjusted_p_values);
235        self
236    }
237
238    /// Add effect sizes to the results
239    pub fn with_effect_sizes(mut self, effect_sizes: Vec<T>) -> Self {
240        self.effect_sizes = Some(effect_sizes);
241        self
242    }
243
244    /// Add confidence intervals to the results
245    pub fn with_confidence_intervals(mut self, confidence_intervals: Vec<(T, T)>) -> Self {
246        self.confidence_intervals = Some(confidence_intervals);
247        self
248    }
249
250    /// Add global metadata about the test
251    pub fn with_global_metadata(mut self, key: &str, value: &str) -> Self {
252        self.global_metadata
253            .insert(key.to_string(), value.to_string());
254        self
255    }
256
257    /// Get indices of significant features at the given threshold
258    pub fn significant_indices(&self, alpha: T) -> Vec<usize> {
259        match &self.adjusted_p_values {
260            Some(adj_p_values) => adj_p_values
261                .iter()
262                .enumerate()
263                .filter_map(|(i, &p)| if p < alpha { Some(i) } else { None })
264                .collect(),
265            None => self
266                .p_values
267                .iter()
268                .enumerate()
269                .filter_map(|(i, &p)| if p < alpha { Some(i) } else { None })
270                .collect(),
271        }
272    }
273
274    /// Get the number of significant features at the given threshold
275    pub fn num_significant(&self, alpha: T) -> usize {
276        self.significant_indices(alpha).len()
277    }
278
279    /// Get top n features by p-value
280    pub fn top_features(&self, n: usize) -> Vec<usize> {
281        let p_values = match &self.adjusted_p_values {
282            Some(adj_p) => adj_p,
283            None => &self.p_values,
284        };
285
286        let mut indices: Vec<usize> = (0..p_values.len()).collect();
287        indices.sort_by(|&a, &b| {
288            p_values[a]
289                .partial_cmp(&p_values[b])
290                .unwrap_or(std::cmp::Ordering::Equal)
291        });
292        indices.truncate(n);
293        indices
294    }
295}