// single_statistics/testing/mod.rs
//! Statistical testing framework for single-cell data analysis.
//!
//! This module provides a comprehensive suite of statistical tests and methods specifically designed
//! for single-cell RNA-seq data analysis. It includes parametric and non-parametric tests, multiple
//! testing correction methods, and effect size calculations.
//!
//! ## Key Components
//!
//! - **Core Data Structures**: [`TestResult`] and [`MultipleTestResults`] for storing test outcomes
//! - **Test Methods**: [`TestMethod`] enum defining available statistical tests
//! - **Matrix Operations**: [`MatrixStatTests`] trait for running tests on sparse matrices
//!
//! ## Submodules
//!
//! - [`correction`]: Multiple testing correction methods (FDR, Bonferroni, etc.)
//! - [`effect`]: Effect size calculations (Cohen's d, etc.)
//! - [`inference`]: Core statistical inference implementations
//! - [`utils`]: Utility functions for data preparation and validation
//!
//! ## Usage
//!
//! Use the `MatrixStatTests` trait on sparse matrices to perform differential expression
//! analysis with various statistical methods and automatic multiple testing correction.
24
25use single_utilities::traits::FloatOps;
26use std::collections::HashMap;
27
28pub mod correction;
29pub mod effect;
30pub mod inference;
31
32pub mod utils;
33
/// Statistical test methods available for differential expression analysis.
///
/// This enum defines the different statistical tests that can be applied to single-cell data.
/// Each method has specific assumptions and use cases.
///
/// Derives `PartialEq`/`Eq`/`Hash` so methods can be compared directly and used
/// as keys (e.g. when caching or dispatching on the configured test).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum TestMethod {
    /// Student's or Welch's t-test for comparing means between two groups.
    ///
    /// **Use when**: Data is approximately normal, comparing continuous expression values.
    /// **Best for**: Most differential expression analyses in single-cell data.
    TTest(TTestType),

    /// Mann-Whitney U test (Wilcoxon rank-sum test) for non-parametric comparison.
    ///
    /// **Use when**: Data is not normally distributed, or you want a robust rank-based test.
    /// **Best for**: Highly skewed expression data or small sample sizes.
    MannWhitney,

    /// Fisher's Exact test for comparing frequencies of expression.
    ///
    /// **Use when**: Comparing proportions (e.g., cell type proportions or expression frequency).
    /// **Best for**: Categorical data and binary (expressed/not expressed) comparisons.
    FisherExact,

    /// Negative binomial test for count data with overdispersion.
    ///
    /// **Use when**: Working with raw UMI counts and modeling overdispersion.
    /// **Best for**: Count-based differential expression (like DESeq2/edgeR approach).
    NegativeBinomial,

    /// Zero-inflated models for data with excess zeros.
    ///
    /// **Use when**: High proportion of zero values (dropout) needs explicit modeling.
    /// **Best for**: Single-cell data with significant technical dropout.
    ZeroInflated,
}

/// Type of t-test to perform, differing in variance assumptions.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum TTestType {
    /// Student's t-test assuming equal variances between groups.
    ///
    /// **Use when**: Groups have similar variance (homoscedasticity).
    /// **Faster** but less robust than Welch's t-test.
    Student,

    /// Welch's t-test allowing unequal variances between groups.
    ///
    /// **Use when**: Groups may have different variances (heteroscedasticity).
    /// **Recommended** for most single-cell analyses as the default choice.
    Welch,
}
86
/// Alternative hypothesis for statistical tests.
///
/// Derives `PartialEq`/`Eq`/`Hash` so the configured alternative can be compared
/// and matched against directly.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Alternative {
    /// Two-sided test: group means are not equal (μ₁ ≠ μ₂).
    ///
    /// **Most common** choice for differential expression analysis.
    TwoSided,

    /// One-sided test: group 1 mean is less than group 2 (μ₁ < μ₂).
    ///
    /// **Use when**: You specifically want to test for downregulation.
    Less,

    /// One-sided test: group 1 mean is greater than group 2 (μ₁ > μ₂).
    ///
    /// **Use when**: You specifically want to test for upregulation.
    Greater,
}
105
/// Result of a single statistical test.
///
/// Contains the test statistic, p-value, and optional additional information like effect sizes
/// and confidence intervals. This structure is used for individual gene/feature tests.
///
/// `PartialEq` is derived (available whenever `T: PartialEq`) so results can be
/// compared directly, e.g. in tests.
#[derive(Debug, Clone, PartialEq)]
pub struct TestResult<T> {
    /// The test statistic value (e.g., t-statistic, U statistic)
    pub statistic: T,
    /// The p-value of the test
    pub p_value: T,
    /// Confidence interval for the effect size/difference (if available)
    pub confidence_interval: Option<(T, T)>,
    /// Degrees of freedom (for parametric tests)
    pub degrees_of_freedom: Option<T>,
    /// Effect size measurement (Cohen's d, etc.)
    pub effect_size: Option<T>,
    /// Standard error of the effect size or test statistic
    pub standard_error: Option<T>,
    /// Additional test-specific information
    pub metadata: HashMap<String, T>,
}
129
130impl<T> TestResult<T>
131where
132 T: FloatOps,
133{
134 /// Create a new test result with minimal information
135 pub fn new(statistic: T, p_value: T) -> Self {
136 TestResult {
137 statistic,
138 p_value,
139 confidence_interval: None,
140 degrees_of_freedom: None,
141 effect_size: None,
142 standard_error: None,
143 metadata: HashMap::new(),
144 }
145 }
146
147 /// Create a new test result with effect size
148 pub fn with_effect_size(statistic: T, p_value: T, effect_size: T) -> Self {
149 TestResult {
150 statistic,
151 p_value,
152 confidence_interval: None,
153 degrees_of_freedom: None,
154 effect_size: Some(effect_size),
155 standard_error: None,
156 metadata: HashMap::new(),
157 }
158 }
159
160 /// Add confidence interval to the result
161 pub fn with_confidence_interval(mut self, lower: T, upper: T) -> Self {
162 self.confidence_interval = Some((lower, upper));
163 self
164 }
165
166 /// Add degrees of freedom to the result
167 pub fn with_degrees_of_freedom(mut self, df: T) -> Self {
168 self.degrees_of_freedom = Some(df);
169 self
170 }
171
172 /// Add standard error to the result
173 pub fn with_standard_error(mut self, se: T) -> Self {
174 self.standard_error = Some(se);
175 self
176 }
177
178 /// Add additional metadata
179 pub fn with_metadata(mut self, key: &str, value: T) -> Self {
180 self.metadata.insert(key.to_string(), value);
181 self
182 }
183
184 /// Check if the result is statistically significant at the given threshold
185 pub fn is_significant(&self, alpha: T) -> bool {
186 self.p_value < alpha
187 }
188}
189
/// Results from multiple statistical tests, typically for differential expression analysis.
///
/// This structure contains results from testing multiple features (genes) simultaneously,
/// including multiple testing correction. It's the primary output of differential expression
/// analysis workflows.
///
/// `PartialEq` is derived (available whenever `T: PartialEq`) so result sets can be
/// compared directly, e.g. in tests.
#[derive(Debug, Clone, PartialEq)]
pub struct MultipleTestResults<T> {
    /// Test statistics for each feature/gene
    pub statistics: Vec<T>,
    /// Raw (unadjusted) p-values
    pub p_values: Vec<T>,
    /// Adjusted p-values (after multiple testing correction)
    pub adjusted_p_values: Option<Vec<T>>,
    /// Effect sizes (if calculated)
    pub effect_sizes: Option<Vec<T>>,
    /// Confidence intervals (if calculated)
    pub confidence_intervals: Option<Vec<(T, T)>>,
    /// Feature-specific metadata
    pub feature_metadata: Option<Vec<HashMap<String, T>>>,
    /// Global metadata about the test
    pub global_metadata: HashMap<String, String>,
}
214
215impl<T> MultipleTestResults<T>
216where
217 T: FloatOps,
218{
219 /// Create a new results object from p-values
220 pub fn new(statistics: Vec<T>, p_values: Vec<T>) -> Self {
221 MultipleTestResults {
222 statistics,
223 p_values,
224 adjusted_p_values: None,
225 effect_sizes: None,
226 confidence_intervals: None,
227 feature_metadata: None,
228 global_metadata: HashMap::new(),
229 }
230 }
231
232 /// Add adjusted p-values to the results
233 pub fn with_adjusted_p_values(mut self, adjusted_p_values: Vec<T>) -> Self {
234 self.adjusted_p_values = Some(adjusted_p_values);
235 self
236 }
237
238 /// Add effect sizes to the results
239 pub fn with_effect_sizes(mut self, effect_sizes: Vec<T>) -> Self {
240 self.effect_sizes = Some(effect_sizes);
241 self
242 }
243
244 /// Add confidence intervals to the results
245 pub fn with_confidence_intervals(mut self, confidence_intervals: Vec<(T, T)>) -> Self {
246 self.confidence_intervals = Some(confidence_intervals);
247 self
248 }
249
250 /// Add global metadata about the test
251 pub fn with_global_metadata(mut self, key: &str, value: &str) -> Self {
252 self.global_metadata
253 .insert(key.to_string(), value.to_string());
254 self
255 }
256
257 /// Get indices of significant features at the given threshold
258 pub fn significant_indices(&self, alpha: T) -> Vec<usize> {
259 match &self.adjusted_p_values {
260 Some(adj_p_values) => adj_p_values
261 .iter()
262 .enumerate()
263 .filter_map(|(i, &p)| if p < alpha { Some(i) } else { None })
264 .collect(),
265 None => self
266 .p_values
267 .iter()
268 .enumerate()
269 .filter_map(|(i, &p)| if p < alpha { Some(i) } else { None })
270 .collect(),
271 }
272 }
273
274 /// Get the number of significant features at the given threshold
275 pub fn num_significant(&self, alpha: T) -> usize {
276 self.significant_indices(alpha).len()
277 }
278
279 /// Get top n features by p-value
280 pub fn top_features(&self, n: usize) -> Vec<usize> {
281 let p_values = match &self.adjusted_p_values {
282 Some(adj_p) => adj_p,
283 None => &self.p_values,
284 };
285
286 let mut indices: Vec<usize> = (0..p_values.len()).collect();
287 indices.sort_by(|&a, &b| {
288 p_values[a]
289 .partial_cmp(&p_values[b])
290 .unwrap_or(std::cmp::Ordering::Equal)
291 });
292 indices.truncate(n);
293 indices
294 }
295}