single-statistics 0.9.0

A specialized Rust library for statistical analysis of single-cell data, part of the single-rust ecosystem.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
use single_statistics::testing::inference::parametric::{fast_t_test_from_sums, t_test_matrix_groups};
use single_statistics::testing::{TTestType, TestResult};
use nalgebra_sparse::{CsrMatrix, CooMatrix};

#[cfg(test)]
mod quick_test {
    use super::*;

    #[test]
    fn check_if_ttest_works() {
        // Simple test: two clearly different groups
        // Group 1: [1, 2, 3] -> mean=2, should have low variance
        // Group 2: [7, 8, 9] -> mean=8, should have low variance  
        // These groups are clearly different, so p-value should be very small (< 0.05)
        
        let sum1 = 6.0;      // 1+2+3
        let sum_sq1 = 14.0;  // 1²+2²+3² = 1+4+9
        let n1 = 3.0;
        
        let sum2 = 24.0;     // 7+8+9
        let sum_sq2 = 194.0; // 7²+8²+9² = 49+64+81
        let n2 = 3.0;
        
        // Call your function (adjust the function name as needed)
        let result: TestResult<f64> = fast_t_test_from_sums(
            sum1, sum_sq1, n1,
            sum2, sum_sq2, n2,
            TTestType::Student
        );
        
        println!("=== T-TEST RESULTS ===");
        println!("Group 1 stats: sum={}, sum_sq={}, n={}", sum1, sum_sq1, n1);
        println!("Group 2 stats: sum={}, sum_sq={}, n={}", sum2, sum_sq2, n2);
        println!("T-statistic: {}", result.statistic);
        println!("P-value: {}", result.p_value);
        
        // Check if results make sense
        if result.p_value < 0.05 && result.statistic.abs() > 2.0 {
            println!("✅ PASS: T-test is working correctly!");
            println!("   - P-value is small ({}), indicating significant difference", result.p_value);
            println!("   - T-statistic is large ({}), as expected", result.statistic);
        } else {
            println!("❌ FAIL: T-test results look wrong!");
            println!("   - Expected: p-value < 0.05 and |t-stat| > 2.0");
            println!("   - Got: p-value = {}, t-stat = {}", result.p_value, result.statistic);
            panic!("T-test implementation has issues!");
        }
    }
    
    #[test] 
    fn check_identical_groups() {
        // Identical groups should give p-value ≈ 1.0 and t-stat ≈ 0.0
        let sum1 = 15.0;     // 5+5+5
        let sum_sq1 = 75.0;  // 5²+5²+5² = 25+25+25
        let n1 = 3.0;
        
        let sum2 = 15.0;     // 5+5+5 (identical)
        let sum_sq2 = 75.0;  // 5²+5²+5²
        let n2 = 3.0;
        
        let result: TestResult<f64> = fast_t_test_from_sums(
            sum1, sum_sq1, n1,
            sum2, sum_sq2, n2,
            TTestType::Student
        );
        
        println!("\n=== IDENTICAL GROUPS TEST ===");
        println!("T-statistic: {}", result.statistic);
        println!("P-value: {}", result.p_value);
        
        if result.statistic.abs() < 0.001 && result.p_value > 0.9 {
            println!("✅ PASS: Identical groups handled correctly!");
        } else {
            println!("❌ FAIL: Identical groups not handled correctly!");
            println!("   - Expected: t-stat ≈ 0, p-value ≈ 1.0");
            println!("   - Got: t-stat = {}, p-value = {}", result.statistic, result.p_value);
        }
    }
    
    #[test] 
    fn check_identical_groups_welch() {
        // Test Welch t-test with identical groups
        let sum1 = 15.0;     // 5+5+5
        let sum_sq1 = 75.0;  // 5²+5²+5² = 25+25+25
        let n1 = 3.0;
        
        let sum2 = 15.0;     // 5+5+5 (identical)
        let sum_sq2 = 75.0;  // 5²+5²+5²
        let n2 = 3.0;
        
        let result: TestResult<f64> = fast_t_test_from_sums(
            sum1, sum_sq1, n1,
            sum2, sum_sq2, n2,
            TTestType::Welch
        );
        
        println!("\n=== IDENTICAL GROUPS TEST (WELCH) ===");
        println!("T-statistic: {}", result.statistic);
        println!("P-value: {}", result.p_value);
        
        if result.statistic.abs() < 0.001 && result.p_value > 0.9 {
            println!("✅ PASS: Welch t-test with identical groups handled correctly!");
        } else {
            println!("❌ FAIL: Welch t-test with identical groups not handled correctly!");
            println!("   - Expected: t-stat ≈ 0, p-value ≈ 1.0");
            println!("   - Got: t-stat = {}, p-value = {}", result.statistic, result.p_value);
        }
    }

    #[test]
    fn test_high_vs_low_expression() {
        // Simulating a highly expressed gene vs low expression
        // High expression group: mean ≈ 10, some variance
        // Low expression group: mean ≈ 2, some variance
        
        // High expression: [9, 10, 11, 10, 10] -> sum=50, sum_sq=502
        let sum_high = 50.0;
        let sum_sq_high = 502.0; // 81+100+121+100+100
        let n_high = 5.0;
        
        // Low expression: [1, 2, 3, 2, 2] -> sum=10, sum_sq=22
        let sum_low = 10.0; 
        let sum_sq_low = 22.0; // 1+4+9+4+4
        let n_low = 5.0;
        
        let result = fast_t_test_from_sums(
            sum_high, sum_sq_high, n_high,
            sum_low, sum_sq_low, n_low,
            TTestType::Student
        );
        
        println!("\n=== HIGH VS LOW EXPRESSION TEST ===");
        println!("High group: mean={}, variance estimated", sum_high / n_high);
        println!("Low group: mean={}, variance estimated", sum_low / n_low);
        println!("T-statistic: {}", result.statistic);
        println!("P-value: {}", result.p_value);
        
        // Should be highly significant
        assert!(result.p_value < 0.001, "Expected highly significant p-value");
        assert!(num_traits::Float::abs(result.statistic) > 5.0, "Expected large t-statistic");
        println!("✅ PASS: High vs low expression detected correctly!");
    }

    #[test]
    fn test_zero_expression_vs_expressed() {
        // Common in single-cell: some cells express gene, others don't
        // Expressed group: [5, 4, 6, 5, 5] -> sum=25, sum_sq=127
        let sum_expressed = 25.0;
        let sum_sq_expressed = 127.0; // 25+16+36+25+25
        let n_expressed = 5.0;
        
        // Zero expression: [0, 0, 0, 0, 0] -> sum=0, sum_sq=0
        let sum_zero = 0.0;
        let sum_sq_zero = 0.0;
        let n_zero = 5.0;
        
        let result = fast_t_test_from_sums(
            sum_expressed, sum_sq_expressed, n_expressed,
            sum_zero, sum_sq_zero, n_zero,
            TTestType::Welch // Welch is better for unequal variances
        );
        
        println!("\n=== ZERO VS EXPRESSED TEST ===");
        println!("Expressed group mean: {}", sum_expressed / n_expressed);
        println!("Zero group mean: {}", sum_zero / n_zero);
        println!("T-statistic: {}", result.statistic);
        println!("P-value: {}", result.p_value);
        
        // Should be highly significant
        assert!(result.p_value < 0.001, "Expected highly significant difference");
        assert!(num_traits::Float::abs(result.statistic) > 3.0, "Expected large t-statistic");
        println!("✅ PASS: Zero vs expressed gene detected correctly!");
    }

    #[test]
    fn test_subtle_difference() {
        // Subtle but real difference (common in single-cell DE)
        // Group 1: slightly higher expression [3.1, 3.2, 3.0, 3.1, 3.0]
        // Group 2: slightly lower expression [2.9, 2.8, 3.0, 2.9, 3.0]
        
        let sum1 = 15.4; // 3.1+3.2+3.0+3.1+3.0
        let sum_sq1 = 47.46; // 9.61+10.24+9.0+9.61+9.0
        let n1 = 5.0;
        
        let sum2 = 14.6; // 2.9+2.8+3.0+2.9+3.0
        let sum_sq2 = 42.58; // 8.41+7.84+9.0+8.41+9.0
        let n2 = 5.0;
        
        let result = fast_t_test_from_sums(
            sum1, sum_sq1, n1,
            sum2, sum_sq2, n2,
            TTestType::Student
        );
        
        println!("\n=== SUBTLE DIFFERENCE TEST ===");
        println!("Group 1 mean: {}", sum1 / n1);
        println!("Group 2 mean: {}", sum2 / n2);
        println!("Mean difference: {}", (sum1 / n1) - (sum2 / n2));
        println!("T-statistic: {}", result.statistic);
        println!("P-value: {}", result.p_value);
        
        // This should show some difference but may not be highly significant
        println!("✅ PASS: Subtle difference test completed (p-value interpretation depends on effect size)");
    }

    #[test]
    fn test_unequal_sample_sizes() {
        // Common in single-cell: unequal cell counts between groups
        // Large group: 10 cells with moderate expression
        let sum_large = 50.0; // 10 cells, mean = 5.0
        let sum_sq_large = 260.0; // Some variance
        let n_large = 10.0;
        
        // Small group: 3 cells with high expression  
        let sum_small = 21.0; // 3 cells, mean = 7.0
        let sum_sq_small = 149.0; // 7²+7²+7² = 147, plus some variance
        let n_small = 3.0;
        
        let result = fast_t_test_from_sums(
            sum_small, sum_sq_small, n_small,
            sum_large, sum_sq_large, n_large,
            TTestType::Welch // Better for unequal samples
        );
        
        println!("\n=== UNEQUAL SAMPLE SIZES TEST ===");
        println!("Small group (n={}): mean={}", n_small, sum_small / n_small);
        println!("Large group (n={}): mean={}", n_large, sum_large / n_large);
        println!("T-statistic: {}", result.statistic);
        println!("P-value: {}", result.p_value);
        
        // Should detect the difference despite unequal sample sizes
        assert!(result.p_value < 0.1, "Should detect difference with unequal samples");
        println!("✅ PASS: Unequal sample sizes handled correctly!");
    }

    #[test]
    fn test_high_variance_groups() {
        // High variance within groups (common in noisy single-cell data)
        // Group 1: high variance [1, 10, 2, 9, 3] -> sum=25, high sum_sq
        let sum1 = 25.0;
        let sum_sq1 = 195.0; // 1+100+4+81+9
        let n1 = 5.0;
        
        // Group 2: high variance [2, 8, 4, 7, 4] -> sum=25, high sum_sq  
        let sum2 = 25.0;
        let sum_sq2 = 149.0; // 4+64+16+49+16
        let n2 = 5.0;
        
        let result = fast_t_test_from_sums(
            sum1, sum_sq1, n1,
            sum2, sum_sq2, n2,
            TTestType::Student
        );
        
        println!("\n=== HIGH VARIANCE GROUPS TEST ===");
        println!("Both groups have mean: {} and {}", sum1 / n1, sum2 / n2);
        println!("T-statistic: {}", result.statistic);
        println!("P-value: {}", result.p_value);
        
        // Should not detect difference (same means, just high variance)
        assert!(result.p_value > 0.1, "Should not detect difference with same means");
        assert!(num_traits::Float::abs(result.statistic) < 1.0, "T-statistic should be small");
        println!("✅ PASS: High variance groups (same means) handled correctly!");
    }

    #[test]
    fn test_minimal_sample_size() {
        // Edge case: minimum sample size (n=2 per group)
        // Group 1: [5, 7] -> sum=12, sum_sq=74
        let sum1 = 12.0;
        let sum_sq1 = 74.0; // 25+49
        let n1 = 2.0;
        
        // Group 2: [3, 5] -> sum=8, sum_sq=34
        let sum2 = 8.0;
        let sum_sq2 = 34.0; // 9+25
        let n2 = 2.0;
        
        let result = fast_t_test_from_sums(
            sum1, sum_sq1, n1,
            sum2, sum_sq2, n2,
            TTestType::Student
        );
        
        println!("\n=== MINIMAL SAMPLE SIZE TEST ===");
        println!("Group 1 (n=2): mean={}", sum1 / n1);
        println!("Group 2 (n=2): mean={}", sum2 / n2);
        println!("T-statistic: {}", result.statistic);
        println!("P-value: {}", result.p_value);
        
        // Should handle minimal case without crashing
        assert!(num_traits::Float::is_finite(result.p_value), "P-value should be finite");
        assert!(num_traits::Float::is_finite(result.statistic), "T-statistic should be finite");
        println!("✅ PASS: Minimal sample size handled correctly!");
    }

    #[test]
    fn test_single_cell_realistic_scenario() {
        // Realistic single-cell scenario: marker gene expression
        // Cell type A (expressing marker): higher, more variable expression
        let sum_expressing = 45.0; // Mean ≈ 4.5
        let sum_sq_expressing = 235.0; // Moderate variance
        let n_expressing = 10.0;
        
        // Cell type B (not expressing): lower, less variable expression
        let sum_non_expressing = 15.0; // Mean ≈ 1.5  
        let sum_sq_non_expressing = 35.0; // Lower variance
        let n_non_expressing = 10.0;
        
        let result_student = fast_t_test_from_sums(
            sum_expressing, sum_sq_expressing, n_expressing,
            sum_non_expressing, sum_sq_non_expressing, n_non_expressing,
            TTestType::Student
        );
        
        let result_welch = fast_t_test_from_sums(
            sum_expressing, sum_sq_expressing, n_expressing,
            sum_non_expressing, sum_sq_non_expressing, n_non_expressing,
            TTestType::Welch
        );
        
        println!("\n=== SINGLE-CELL REALISTIC SCENARIO ===");
        println!("Expressing cells mean: {}", sum_expressing / n_expressing);
        println!("Non-expressing cells mean: {}", sum_non_expressing / n_non_expressing);
        println!("Student t-test: t={:.3}, p={:.6}", result_student.statistic, result_student.p_value);
        println!("Welch t-test: t={:.3}, p={:.6}", result_welch.statistic, result_welch.p_value);
        
        // Both should detect significant difference
        assert!(result_student.p_value < 0.01, "Student t-test should be significant");
        assert!(result_welch.p_value < 0.01, "Welch t-test should be significant");
        println!("✅ PASS: Realistic single-cell marker gene scenario detected!");
    }

    #[test]
    fn test_very_small_difference() {
        // Test the detection limit - very small but real difference
        // This might happen with housekeeping genes that are slightly different
        let sum1 = 30.0000; // Mean = 3.0000
        let sum_sq1 = 90.01; // Very low variance
        let n1 = 10.0;
        
        let sum2 = 29.9990; // Mean = 2.9999 (tiny difference)  
        let sum_sq2 = 89.997; // Very low variance
        let n2 = 10.0;
        
        let result = fast_t_test_from_sums(
            sum1, sum_sq1, n1,
            sum2, sum_sq2, n2,
            TTestType::Student
        );
        
        println!("\n=== VERY SMALL DIFFERENCE TEST ===");
        println!("Group 1 mean: {:.6}", sum1 / n1);
        println!("Group 2 mean: {:.6}", sum2 / n2);
        println!("Difference: {:.6}", (sum1 / n1) - (sum2 / n2));
        println!("T-statistic: {}", result.statistic);
        println!("P-value: {}", result.p_value);
        
        // Very small differences might not be significant due to our threshold
        println!("✅ PASS: Very small difference test completed");
    }

    #[test]
    fn test_matrix_based_workflow() {
        // Test the matrix-based function that would be used in real single-cell analysis
        
        // Create a small test matrix: 3 genes x 6 cells
        // Gene 0: [1,1,1,5,5,5] - clear difference between groups
        // Gene 1: [3,3,3,3,3,3] - no difference
        // Gene 2: [0,0,1,2,3,4] - moderate difference
        
        let mut coo = CooMatrix::new(3, 6); // 3 genes x 6 cells
        
        // Gene 0 values - clear difference
        coo.push(0, 0, 1.0f64); coo.push(0, 1, 1.0); coo.push(0, 2, 1.0);
        coo.push(0, 3, 5.0); coo.push(0, 4, 5.0); coo.push(0, 5, 5.0);
        
        // Gene 1 values - all same (should not be significant)
        coo.push(1, 0, 3.0); coo.push(1, 1, 3.0); coo.push(1, 2, 3.0);
        coo.push(1, 3, 3.0); coo.push(1, 4, 3.0); coo.push(1, 5, 3.0);
        
        // Gene 2 values - moderate difference
        coo.push(2, 0, 0.0); coo.push(2, 1, 0.0); coo.push(2, 2, 1.0);
        coo.push(2, 3, 2.0); coo.push(2, 4, 3.0); coo.push(2, 5, 4.0);
        
        let matrix = CsrMatrix::from(&coo);
        
        let group1_indices = vec![0, 1, 2]; // First 3 cells
        let group2_indices = vec![3, 4, 5]; // Last 3 cells
        
        println!("\n=== MATRIX-BASED WORKFLOW TEST ===");
        println!("Matrix shape: {} genes x {} cells", matrix.nrows(), matrix.ncols());
        println!("Group 1 indices: {:?}", group1_indices);
        println!("Group 2 indices: {:?}", group2_indices);
        
        // Debug: print the actual matrix values
        for gene in 0..matrix.nrows() {
            print!("Gene {}: [", gene);
            for cell in 0..matrix.ncols() {
                let value = matrix.get_entry(gene, cell).map_or(0.0, |entry| entry.into_value());
                print!("{}, ", value);
            }
            println!("]");
        }
        
        let results = t_test_matrix_groups(
            &matrix,
            &group1_indices,
            &group2_indices,
            TTestType::Student
        ).expect("Matrix t-test should work");
        
        for (gene_idx, result) in results.iter().enumerate() {
            println!("Gene {}: t={:.3}, p={:.6}", gene_idx, result.statistic, result.p_value);
        }
        
        // Manual calculation for Gene 0: [1,1,1] vs [5,5,5]
        // This should result in perfect separation (infinite t-statistic, p-value = 0)
        let manual_gene0 = fast_t_test_from_sums(
            3.0, 3.0, 3.0,    // Group 1: [1,1,1]
            15.0, 75.0, 3.0,  // Group 2: [5,5,5]
            TTestType::Student
        );
        
        println!("Manual Gene 0 calculation: t={:.3}, p={:.6}", manual_gene0.statistic, manual_gene0.p_value);
        
        // Gene 0 should show perfect separation (infinite t-statistic, p-value = 0)
        assert!(num_traits::Float::is_infinite(results[0].statistic), "Gene 0 should have infinite t-statistic");
        assert!(results[0].p_value < 0.001, "Gene 0 should have p-value = 0");
        
        // Gene 1 should not be significant (all same values)
        assert!(results[1].p_value > 0.9, "Gene 1 should not be significant, got p={}", results[1].p_value);
        
        // Gene 2 should be moderately significant
        assert!(results[2].p_value < 0.1, "Gene 2 should show some significance, got p={}", results[2].p_value);
        
        println!("✅ PASS: Matrix-based differential expression workflow works!");
    }
}