single_statistics/testing/
mod.rs

1//! Statistical testing framework for single-cell data analysis.
2//!
3//! This module provides a comprehensive suite of statistical tests and methods specifically designed
4//! for single-cell RNA-seq data analysis. It includes parametric and non-parametric tests, multiple
5//! testing correction methods, and effect size calculations.
6//!
7//! ## Key Components
8//!
9//! - **Core Data Structures**: [`TestResult`] and [`MultipleTestResults`] for storing test outcomes
10//! - **Test Methods**: [`TestMethod`] enum defining available statistical tests
11//! - **Matrix Operations**: [`MatrixStatTests`] trait for running tests on sparse matrices
12//!
13//! ## Submodules
14//!
15//! - [`correction`]: Multiple testing correction methods (FDR, Bonferroni, etc.)
16//! - [`effect`]: Effect size calculations (Cohen's d, etc.)
17//! - [`inference`]: Core statistical inference implementations
18//! - [`utils`]: Utility functions for data preparation and validation
19//!
20//! ## Usage
21//!
22//! Use the `MatrixStatTests` trait on sparse matrices to perform differential expression
23//! analysis with various statistical methods and automatic multiple testing correction.
24
25use single_utilities::traits::FloatOps;
26use std::collections::HashMap;
27
28pub mod correction;
29pub mod effect;
30pub mod inference;
31
32pub mod utils;
33
34/// Statistical test methods available for differential expression analysis.
35///
36/// This enum defines the different statistical tests that can be applied to single-cell data.
37/// Each method has specific assumptions and use cases.
38#[derive(Debug, Clone, Copy)]
39pub enum TestMethod {
40    /// Student's or Welch's t-test for comparing means between two groups.
41    /// 
42    /// **Use when**: Data is approximately normal, comparing continuous expression values.
43    /// **Best for**: Most differential expression analyses in single-cell data.
44    TTest(TTestType),
45    
46    /// Mann-Whitney U test (Wilcoxon rank-sum test) for non-parametric comparison.
47    /// 
48    /// **Use when**: Data is not normally distributed, or you want a robust rank-based test.
49    /// **Best for**: Highly skewed expression data or small sample sizes.
50    MannWhitney,
51    
52    /// Negative binomial test for count data with overdispersion.
53    /// 
54    /// **Use when**: Working with raw UMI counts and modeling overdispersion.
55    /// **Best for**: Count-based differential expression (like DESeq2/edgeR approach).
56    NegativeBinomial,
57    
58    /// Zero-inflated models for data with excess zeros.
59    /// 
60    /// **Use when**: High proportion of zero values (dropout) needs explicit modeling.
61    /// **Best for**: Single-cell data with significant technical dropout.
62    ZeroInflated,
63}
64
65/// Type of t-test to perform, differing in variance assumptions.
66#[derive(Debug, Clone, Copy)]
67pub enum TTestType {
68    /// Student's t-test assuming equal variances between groups.
69    /// 
70    /// **Use when**: Groups have similar variance (homoscedasticity).
71    /// **Faster** but less robust than Welch's t-test.
72    Student,
73    
74    /// Welch's t-test allowing unequal variances between groups.
75    /// 
76    /// **Use when**: Groups may have different variances (heteroscedasticity).
77    /// **Recommended** for most single-cell analyses as the default choice.
78    Welch,
79}
80
81/// Alternative hypothesis for statistical tests.
82#[derive(Debug, Clone, Copy)]
83pub enum Alternative {
84    /// Two-sided test: group means are not equal (μ₁ ≠ μ₂).
85    /// 
86    /// **Most common** choice for differential expression analysis.
87    TwoSided,
88    
89    /// One-sided test: group 1 mean is less than group 2 (μ₁ < μ₂).
90    /// 
91    /// **Use when**: You specifically want to test for downregulation.
92    Less,
93    
94    /// One-sided test: group 1 mean is greater than group 2 (μ₁ > μ₂).
95    /// 
96    /// **Use when**: You specifically want to test for upregulation.
97    Greater,
98}
99
100/// Result of a single statistical test.
101///
102/// Contains the test statistic, p-value, and optional additional information like effect sizes
103/// and confidence intervals. This structure is used for individual gene/feature tests.
104///
105
106#[derive(Debug, Clone)]
107pub struct TestResult<T> {
108    /// The test statistic value (e.g., t-statistic, U statistic)
109    pub statistic: T,
110    /// The p-value of the test
111    pub p_value: T,
112    /// Confidence interval for the effect size/difference (if available)
113    pub confidence_interval: Option<(T, T)>,
114    /// Degrees of freedom (for parametric tests)
115    pub degrees_of_freedom: Option<T>,
116    /// Effect size measurement (Cohen's d, etc.)
117    pub effect_size: Option<T>,
118    /// Standard error of the effect size or test statistic
119    pub standard_error: Option<T>,
120    /// Additional test-specific information
121    pub metadata: HashMap<String, T>,
122}
123
124impl<T> TestResult<T>
125where
126    T: FloatOps,
127{
128    /// Create a new test result with minimal information
129    pub fn new(statistic: T, p_value: T) -> Self {
130        TestResult {
131            statistic,
132            p_value,
133            confidence_interval: None,
134            degrees_of_freedom: None,
135            effect_size: None,
136            standard_error: None,
137            metadata: HashMap::new(),
138        }
139    }
140
141    /// Create a new test result with effect size
142    pub fn with_effect_size(statistic: T, p_value: T, effect_size: T) -> Self {
143        TestResult {
144            statistic,
145            p_value,
146            confidence_interval: None,
147            degrees_of_freedom: None,
148            effect_size: Some(effect_size),
149            standard_error: None,
150            metadata: HashMap::new(),
151        }
152    }
153
154    /// Add confidence interval to the result
155    pub fn with_confidence_interval(mut self, lower: T, upper: T) -> Self {
156        self.confidence_interval = Some((lower, upper));
157        self
158    }
159
160    /// Add degrees of freedom to the result
161    pub fn with_degrees_of_freedom(mut self, df: T) -> Self {
162        self.degrees_of_freedom = Some(df);
163        self
164    }
165
166    /// Add standard error to the result
167    pub fn with_standard_error(mut self, se: T) -> Self {
168        self.standard_error = Some(se);
169        self
170    }
171
172    /// Add additional metadata
173    pub fn with_metadata(mut self, key: &str, value: T) -> Self {
174        self.metadata.insert(key.to_string(), value);
175        self
176    }
177
178    /// Check if the result is statistically significant at the given threshold
179    pub fn is_significant(&self, alpha: T) -> bool {
180        self.p_value < alpha
181    }
182}
183
184/// Results from multiple statistical tests, typically for differential expression analysis.
185///
186/// This structure contains results from testing multiple features (genes) simultaneously,
187/// including multiple testing correction. It's the primary output of differential expression
188/// analysis workflows.
189///
190
191#[derive(Debug, Clone)]
192pub struct MultipleTestResults<T> {
193    /// Test statistics for each feature/gene
194    pub statistics: Vec<T>,
195    /// Raw (unadjusted) p-values
196    pub p_values: Vec<T>,
197    /// Adjusted p-values (after multiple testing correction)
198    pub adjusted_p_values: Option<Vec<T>>,
199    /// Effect sizes (if calculated)
200    pub effect_sizes: Option<Vec<T>>,
201    /// Confidence intervals (if calculated)
202    pub confidence_intervals: Option<Vec<(T, T)>>,
203    /// Feature-specific metadata
204    pub feature_metadata: Option<Vec<HashMap<String, T>>>,
205    /// Global metadata about the test
206    pub global_metadata: HashMap<String, String>,
207}
208
209impl<T> MultipleTestResults<T>
210where
211    T: FloatOps,
212{
213    /// Create a new results object from p-values
214    pub fn new(statistics: Vec<T>, p_values: Vec<T>) -> Self {
215        MultipleTestResults {
216            statistics,
217            p_values,
218            adjusted_p_values: None,
219            effect_sizes: None,
220            confidence_intervals: None,
221            feature_metadata: None,
222            global_metadata: HashMap::new(),
223        }
224    }
225
226    /// Add adjusted p-values to the results
227    pub fn with_adjusted_p_values(mut self, adjusted_p_values: Vec<T>) -> Self {
228        self.adjusted_p_values = Some(adjusted_p_values);
229        self
230    }
231
232    /// Add effect sizes to the results
233    pub fn with_effect_sizes(mut self, effect_sizes: Vec<T>) -> Self {
234        self.effect_sizes = Some(effect_sizes);
235        self
236    }
237
238    /// Add confidence intervals to the results
239    pub fn with_confidence_intervals(mut self, confidence_intervals: Vec<(T, T)>) -> Self {
240        self.confidence_intervals = Some(confidence_intervals);
241        self
242    }
243
244    /// Add global metadata about the test
245    pub fn with_global_metadata(mut self, key: &str, value: &str) -> Self {
246        self.global_metadata
247            .insert(key.to_string(), value.to_string());
248        self
249    }
250
251    /// Get indices of significant features at the given threshold
252    pub fn significant_indices(&self, alpha: T) -> Vec<usize> {
253        match &self.adjusted_p_values {
254            Some(adj_p_values) => adj_p_values
255                .iter()
256                .enumerate()
257                .filter_map(|(i, &p)| if p < alpha { Some(i) } else { None })
258                .collect(),
259            None => self
260                .p_values
261                .iter()
262                .enumerate()
263                .filter_map(|(i, &p)| if p < alpha { Some(i) } else { None })
264                .collect(),
265        }
266    }
267
268    /// Get the number of significant features at the given threshold
269    pub fn num_significant(&self, alpha: T) -> usize {
270        self.significant_indices(alpha).len()
271    }
272
273    /// Get top n features by p-value
274    pub fn top_features(&self, n: usize) -> Vec<usize> {
275        let p_values = match &self.adjusted_p_values {
276            Some(adj_p) => adj_p,
277            None => &self.p_values,
278        };
279
280        let mut indices: Vec<usize> = (0..p_values.len()).collect();
281        indices.sort_by(|&a, &b| {
282            p_values[a]
283                .partial_cmp(&p_values[b])
284                .unwrap_or(std::cmp::Ordering::Equal)
285        });
286        indices.truncate(n);
287        indices
288    }
289}