single_statistics/testing/mod.rs
1//! Statistical testing framework for single-cell data analysis.
2//!
3//! This module provides a comprehensive suite of statistical tests and methods specifically designed
4//! for single-cell RNA-seq data analysis. It includes parametric and non-parametric tests, multiple
5//! testing correction methods, and effect size calculations.
6//!
7//! ## Key Components
8//!
9//! - **Core Data Structures**: [`TestResult`] and [`MultipleTestResults`] for storing test outcomes
10//! - **Test Methods**: [`TestMethod`] enum defining available statistical tests
11//! - **Matrix Operations**: [`MatrixStatTests`] trait for running tests on sparse matrices
12//!
13//! ## Submodules
14//!
15//! - [`correction`]: Multiple testing correction methods (FDR, Bonferroni, etc.)
16//! - [`effect`]: Effect size calculations (Cohen's d, etc.)
17//! - [`inference`]: Core statistical inference implementations
18//! - [`utils`]: Utility functions for data preparation and validation
19//!
20//! ## Usage
21//!
22//! Use the `MatrixStatTests` trait on sparse matrices to perform differential expression
23//! analysis with various statistical methods and automatic multiple testing correction.
24
25use single_utilities::traits::FloatOps;
26use std::collections::HashMap;
27
28pub mod correction;
29pub mod effect;
30pub mod inference;
31
32pub mod utils;
33
34/// Statistical test methods available for differential expression analysis.
35///
36/// This enum defines the different statistical tests that can be applied to single-cell data.
37/// Each method has specific assumptions and use cases.
38#[derive(Debug, Clone, Copy)]
39pub enum TestMethod {
40 /// Student's or Welch's t-test for comparing means between two groups.
41 ///
42 /// **Use when**: Data is approximately normal, comparing continuous expression values.
43 /// **Best for**: Most differential expression analyses in single-cell data.
44 TTest(TTestType),
45
46 /// Mann-Whitney U test (Wilcoxon rank-sum test) for non-parametric comparison.
47 ///
48 /// **Use when**: Data is not normally distributed, or you want a robust rank-based test.
49 /// **Best for**: Highly skewed expression data or small sample sizes.
50 MannWhitney,
51
52 /// Negative binomial test for count data with overdispersion.
53 ///
54 /// **Use when**: Working with raw UMI counts and modeling overdispersion.
55 /// **Best for**: Count-based differential expression (like DESeq2/edgeR approach).
56 NegativeBinomial,
57
58 /// Zero-inflated models for data with excess zeros.
59 ///
60 /// **Use when**: High proportion of zero values (dropout) needs explicit modeling.
61 /// **Best for**: Single-cell data with significant technical dropout.
62 ZeroInflated,
63}
64
65/// Type of t-test to perform, differing in variance assumptions.
66#[derive(Debug, Clone, Copy)]
67pub enum TTestType {
68 /// Student's t-test assuming equal variances between groups.
69 ///
70 /// **Use when**: Groups have similar variance (homoscedasticity).
71 /// **Faster** but less robust than Welch's t-test.
72 Student,
73
74 /// Welch's t-test allowing unequal variances between groups.
75 ///
76 /// **Use when**: Groups may have different variances (heteroscedasticity).
77 /// **Recommended** for most single-cell analyses as the default choice.
78 Welch,
79}
80
81/// Alternative hypothesis for statistical tests.
82#[derive(Debug, Clone, Copy)]
83pub enum Alternative {
84 /// Two-sided test: group means are not equal (μ₁ ≠ μ₂).
85 ///
86 /// **Most common** choice for differential expression analysis.
87 TwoSided,
88
89 /// One-sided test: group 1 mean is less than group 2 (μ₁ < μ₂).
90 ///
91 /// **Use when**: You specifically want to test for downregulation.
92 Less,
93
94 /// One-sided test: group 1 mean is greater than group 2 (μ₁ > μ₂).
95 ///
96 /// **Use when**: You specifically want to test for upregulation.
97 Greater,
98}
99
100/// Result of a single statistical test.
101///
102/// Contains the test statistic, p-value, and optional additional information like effect sizes
103/// and confidence intervals. This structure is used for individual gene/feature tests.
104///
105
106#[derive(Debug, Clone)]
107pub struct TestResult<T> {
108 /// The test statistic value (e.g., t-statistic, U statistic)
109 pub statistic: T,
110 /// The p-value of the test
111 pub p_value: T,
112 /// Confidence interval for the effect size/difference (if available)
113 pub confidence_interval: Option<(T, T)>,
114 /// Degrees of freedom (for parametric tests)
115 pub degrees_of_freedom: Option<T>,
116 /// Effect size measurement (Cohen's d, etc.)
117 pub effect_size: Option<T>,
118 /// Standard error of the effect size or test statistic
119 pub standard_error: Option<T>,
120 /// Additional test-specific information
121 pub metadata: HashMap<String, T>,
122}
123
124impl<T> TestResult<T>
125where
126 T: FloatOps,
127{
128 /// Create a new test result with minimal information
129 pub fn new(statistic: T, p_value: T) -> Self {
130 TestResult {
131 statistic,
132 p_value,
133 confidence_interval: None,
134 degrees_of_freedom: None,
135 effect_size: None,
136 standard_error: None,
137 metadata: HashMap::new(),
138 }
139 }
140
141 /// Create a new test result with effect size
142 pub fn with_effect_size(statistic: T, p_value: T, effect_size: T) -> Self {
143 TestResult {
144 statistic,
145 p_value,
146 confidence_interval: None,
147 degrees_of_freedom: None,
148 effect_size: Some(effect_size),
149 standard_error: None,
150 metadata: HashMap::new(),
151 }
152 }
153
154 /// Add confidence interval to the result
155 pub fn with_confidence_interval(mut self, lower: T, upper: T) -> Self {
156 self.confidence_interval = Some((lower, upper));
157 self
158 }
159
160 /// Add degrees of freedom to the result
161 pub fn with_degrees_of_freedom(mut self, df: T) -> Self {
162 self.degrees_of_freedom = Some(df);
163 self
164 }
165
166 /// Add standard error to the result
167 pub fn with_standard_error(mut self, se: T) -> Self {
168 self.standard_error = Some(se);
169 self
170 }
171
172 /// Add additional metadata
173 pub fn with_metadata(mut self, key: &str, value: T) -> Self {
174 self.metadata.insert(key.to_string(), value);
175 self
176 }
177
178 /// Check if the result is statistically significant at the given threshold
179 pub fn is_significant(&self, alpha: T) -> bool {
180 self.p_value < alpha
181 }
182}
183
184/// Results from multiple statistical tests, typically for differential expression analysis.
185///
186/// This structure contains results from testing multiple features (genes) simultaneously,
187/// including multiple testing correction. It's the primary output of differential expression
188/// analysis workflows.
189///
190
191#[derive(Debug, Clone)]
192pub struct MultipleTestResults<T> {
193 /// Test statistics for each feature/gene
194 pub statistics: Vec<T>,
195 /// Raw (unadjusted) p-values
196 pub p_values: Vec<T>,
197 /// Adjusted p-values (after multiple testing correction)
198 pub adjusted_p_values: Option<Vec<T>>,
199 /// Effect sizes (if calculated)
200 pub effect_sizes: Option<Vec<T>>,
201 /// Confidence intervals (if calculated)
202 pub confidence_intervals: Option<Vec<(T, T)>>,
203 /// Feature-specific metadata
204 pub feature_metadata: Option<Vec<HashMap<String, T>>>,
205 /// Global metadata about the test
206 pub global_metadata: HashMap<String, String>,
207}
208
209impl<T> MultipleTestResults<T>
210where
211 T: FloatOps,
212{
213 /// Create a new results object from p-values
214 pub fn new(statistics: Vec<T>, p_values: Vec<T>) -> Self {
215 MultipleTestResults {
216 statistics,
217 p_values,
218 adjusted_p_values: None,
219 effect_sizes: None,
220 confidence_intervals: None,
221 feature_metadata: None,
222 global_metadata: HashMap::new(),
223 }
224 }
225
226 /// Add adjusted p-values to the results
227 pub fn with_adjusted_p_values(mut self, adjusted_p_values: Vec<T>) -> Self {
228 self.adjusted_p_values = Some(adjusted_p_values);
229 self
230 }
231
232 /// Add effect sizes to the results
233 pub fn with_effect_sizes(mut self, effect_sizes: Vec<T>) -> Self {
234 self.effect_sizes = Some(effect_sizes);
235 self
236 }
237
238 /// Add confidence intervals to the results
239 pub fn with_confidence_intervals(mut self, confidence_intervals: Vec<(T, T)>) -> Self {
240 self.confidence_intervals = Some(confidence_intervals);
241 self
242 }
243
244 /// Add global metadata about the test
245 pub fn with_global_metadata(mut self, key: &str, value: &str) -> Self {
246 self.global_metadata
247 .insert(key.to_string(), value.to_string());
248 self
249 }
250
251 /// Get indices of significant features at the given threshold
252 pub fn significant_indices(&self, alpha: T) -> Vec<usize> {
253 match &self.adjusted_p_values {
254 Some(adj_p_values) => adj_p_values
255 .iter()
256 .enumerate()
257 .filter_map(|(i, &p)| if p < alpha { Some(i) } else { None })
258 .collect(),
259 None => self
260 .p_values
261 .iter()
262 .enumerate()
263 .filter_map(|(i, &p)| if p < alpha { Some(i) } else { None })
264 .collect(),
265 }
266 }
267
268 /// Get the number of significant features at the given threshold
269 pub fn num_significant(&self, alpha: T) -> usize {
270 self.significant_indices(alpha).len()
271 }
272
273 /// Get top n features by p-value
274 pub fn top_features(&self, n: usize) -> Vec<usize> {
275 let p_values = match &self.adjusted_p_values {
276 Some(adj_p) => adj_p,
277 None => &self.p_values,
278 };
279
280 let mut indices: Vec<usize> = (0..p_values.len()).collect();
281 indices.sort_by(|&a, &b| {
282 p_values[a]
283 .partial_cmp(&p_values[b])
284 .unwrap_or(std::cmp::Ordering::Equal)
285 });
286 indices.truncate(n);
287 indices
288 }
289}