// openentropy_core/statistics.rs
1//! Statistical randomness tests (CvM, Ljung-Box, Gap Test).
2
3use crate::math;
4use serde::Serialize;
5
/// Outcome of the Cramér–von Mises uniformity test over byte values.
#[derive(Debug, Clone, Serialize)]
pub struct CramerVonMisesResult {
    /// The W² test statistic (including the 1/(12n) discrete term).
    pub statistic: f64,
    /// Approximate p-value, clamped to [0.001, 0.999] by the test.
    pub p_value: f64,
    /// True when p_value > 0.05, i.e. uniformity is not rejected at the 5% level.
    pub is_uniform: bool,
    /// False when the input had fewer than 2 bytes.
    pub is_valid: bool,
}
13
/// Cramér–von Mises test for uniformity of byte values.
///
/// Maps each byte onto (0, 1) via the midpoint transform (x + 0.5) / 256,
/// computes the W² distance between the empirical CDF and the uniform CDF,
/// and converts it to an approximate p-value. Inputs shorter than 2 bytes
/// yield an invalid result.
pub fn cramer_von_mises(data: &[u8]) -> CramerVonMisesResult {
    let n = data.len();
    if n < 2 {
        return CramerVonMisesResult {
            statistic: 0.0,
            p_value: 0.0,
            is_uniform: false,
            is_valid: false,
        };
    }

    // W² is defined over the order statistics of the sample.
    let mut sorted = data.to_vec();
    sorted.sort_unstable();

    // W² = sum_i (u_(i) - (2i - 1) / (2n))² + 1 / (12n), with i = 1..n.
    let nf = n as f64;
    let mut w2 = 0.0;
    for (idx, &x) in sorted.iter().enumerate() {
        let i = idx + 1;
        // Midpoint transform keeps discrete byte values strictly inside (0, 1).
        let u_i = (x as f64 + 0.5) / 256.0;
        let target = (2.0 * i as f64 - 1.0) / (2.0 * nf);
        let d = u_i - target;
        w2 += d * d;
    }
    w2 += 1.0 / (12.0 * nf);

    // Small-sample correction applied before the asymptotic distribution.
    let corrected = w2 * (1.0 + 0.5 / nf);
    let pi = std::f64::consts::PI;
    // Two truncated series for the limiting W² distribution: one expansion
    // for very small statistics, an alternating tail series otherwise.
    // NOTE(review): the constants and truncation depth look like a
    // theta-function approximation — verify against a reference
    // implementation; the final clamp bounds any approximation error.
    let raw_p = if corrected < 0.0275 {
        let mut sum = 0.0;
        for k in 0..=4 {
            let kf = k as f64;
            let odd = 2.0 * kf + 1.0;
            let exponent = -(odd * odd) * pi * pi / (8.0 * corrected);
            sum += exponent.exp();
        }
        1.0 - (2.0 * pi.sqrt() * sum / corrected.sqrt())
    } else {
        let mut sum = 0.0;
        for k in 1..=4 {
            let kf = k as f64;
            let sign = if k % 2 == 1 { 1.0 } else { -1.0 };
            sum += sign * (-2.0 * kf * kf * pi * pi * corrected).exp();
        }
        2.0 * sum
    };

    // Clamp away from exact 0 and 1 so truncation error never produces
    // impossible p-values downstream.
    let p_value = raw_p.clamp(0.001, 0.999);
    CramerVonMisesResult {
        statistic: w2,
        p_value,
        is_uniform: p_value > 0.05,
        is_valid: true,
    }
}
68
/// Outcome of the Ljung–Box test for serial correlation.
#[derive(Debug, Clone, Serialize)]
pub struct LjungBoxResult {
    /// Q statistic: n(n+2) · Σ r_k² / (n − k) over the tested lags.
    pub q_statistic: f64,
    /// Chi-squared survival p-value of Q with `max_lag` degrees of freedom.
    pub p_value: f64,
    /// Number of lags included in the statistic.
    pub max_lag: usize,
    /// True when p_value < 0.05, i.e. autocorrelation is detected.
    pub has_serial_correlation: bool,
    /// False when the sample was too short for the requested lag count.
    pub is_valid: bool,
}
77
78pub fn ljung_box_default(data: &[u8]) -> LjungBoxResult {
79    ljung_box(data, 20)
80}
81
82pub fn ljung_box(data: &[u8], max_lag: usize) -> LjungBoxResult {
83    let n = data.len();
84    if max_lag == 0 || n <= max_lag {
85        return LjungBoxResult {
86            q_statistic: 0.0,
87            p_value: 0.0,
88            max_lag,
89            has_serial_correlation: false,
90            is_valid: false,
91        };
92    }
93
94    let arr: Vec<f64> = data.iter().map(|&b| b as f64).collect();
95    let mean = arr.iter().sum::<f64>() / n as f64;
96    let denom = arr.iter().map(|x| (x - mean).powi(2)).sum::<f64>();
97
98    let mut q_sum = 0.0;
99    for lag in 1..=max_lag {
100        let mut numer = 0.0;
101        for i in 0..(n - lag) {
102            numer += (arr[i] - mean) * (arr[i + lag] - mean);
103        }
104        let r_k = if denom <= 1e-12 { 0.0 } else { numer / denom };
105        q_sum += (r_k * r_k) / (n as f64 - lag as f64);
106    }
107
108    let q_statistic = n as f64 * (n as f64 + 2.0) * q_sum;
109    let p_value = math::chi_squared_survival(q_statistic, max_lag);
110
111    LjungBoxResult {
112        q_statistic,
113        p_value,
114        max_lag,
115        has_serial_correlation: p_value < 0.05,
116        is_valid: true,
117    }
118}
119
/// Outcome of the gap test on a marker byte range.
#[derive(Debug, Clone, Serialize)]
pub struct GapTestResult {
    /// Chi-squared statistic over the binned gap lengths.
    pub chi_squared: f64,
    /// Survival p-value of the chi-squared statistic.
    pub p_value: f64,
    /// Number of contributing bins minus one.
    pub degrees_of_freedom: usize,
    /// Observed mean distance between consecutive in-range bytes.
    pub mean_gap: f64,
    /// Theoretical mean gap, 1 / P(byte in range).
    pub expected_gap: f64,
    /// True when p_value > 0.05, i.e. gap lengths match the geometric model.
    pub is_uniform_gaps: bool,
    /// False when the input produced too few gaps to test.
    pub is_valid: bool,
}
130
131pub fn gap_test_default(data: &[u8]) -> GapTestResult {
132    gap_test(data, 100, 155)
133}
134
135pub fn gap_test(data: &[u8], lower: u8, upper: u8) -> GapTestResult {
136    if data.is_empty() || lower > upper {
137        return GapTestResult {
138            chi_squared: 0.0,
139            p_value: 0.0,
140            degrees_of_freedom: 0,
141            mean_gap: 0.0,
142            expected_gap: 0.0,
143            is_uniform_gaps: false,
144            is_valid: false,
145        };
146    }
147
148    let p_range = (upper as f64 - lower as f64 + 1.0) / 256.0;
149    if p_range <= 0.0 {
150        return GapTestResult {
151            chi_squared: 0.0,
152            p_value: 0.0,
153            degrees_of_freedom: 0,
154            mean_gap: 0.0,
155            expected_gap: 0.0,
156            is_uniform_gaps: false,
157            is_valid: false,
158        };
159    }
160
161    let hits: Vec<usize> = data
162        .iter()
163        .enumerate()
164        .filter_map(|(idx, &b)| {
165            if (lower..=upper).contains(&b) {
166                Some(idx)
167            } else {
168                None
169            }
170        })
171        .collect();
172
173    if hits.len() < 2 {
174        return GapTestResult {
175            chi_squared: 0.0,
176            p_value: 0.0,
177            degrees_of_freedom: 0,
178            mean_gap: 0.0,
179            expected_gap: 1.0 / p_range,
180            is_uniform_gaps: false,
181            is_valid: false,
182        };
183    }
184
185    let gaps: Vec<usize> = hits.windows(2).map(|w| w[1] - w[0]).collect();
186    if gaps.len() < 10 {
187        let mean_gap = gaps.iter().sum::<usize>() as f64 / gaps.len() as f64;
188        return GapTestResult {
189            chi_squared: 0.0,
190            p_value: 0.0,
191            degrees_of_freedom: 0,
192            mean_gap,
193            expected_gap: 1.0 / p_range,
194            is_uniform_gaps: false,
195            is_valid: false,
196        };
197    }
198
199    let mut observed = [0usize; 9];
200    for &gap in &gaps {
201        let bin = match gap {
202            0 => 0,
203            1 => 1,
204            2 => 2,
205            3 => 3,
206            4 => 4,
207            5 => 5,
208            6..=10 => 6,
209            11..=20 => 7,
210            _ => 8,
211        };
212        observed[bin] += 1;
213    }
214
215    let n_gaps = gaps.len() as f64;
216    let q = 1.0 - p_range;
217    let probs = [
218        0.0,
219        p_range,
220        q * p_range,
221        q.powi(2) * p_range,
222        q.powi(3) * p_range,
223        q.powi(4) * p_range,
224        q.powi(5) - q.powi(10),
225        q.powi(10) - q.powi(20),
226        q.powi(20),
227    ];
228
229    let mut chi_squared = 0.0;
230    let mut contributing_bins = 0usize;
231    for i in 0..observed.len() {
232        let expected = n_gaps * probs[i];
233        if expected > 1e-12 {
234            let diff = observed[i] as f64 - expected;
235            chi_squared += diff * diff / expected;
236            contributing_bins += 1;
237        }
238    }
239
240    let degrees_of_freedom = contributing_bins.saturating_sub(1);
241    let p_value = math::chi_squared_survival(chi_squared, degrees_of_freedom);
242    let mean_gap = gaps.iter().sum::<usize>() as f64 / n_gaps;
243    let expected_gap = 1.0 / p_range;
244
245    GapTestResult {
246        chi_squared,
247        p_value,
248        degrees_of_freedom,
249        mean_gap,
250        expected_gap,
251        is_uniform_gaps: p_value > 0.05,
252        is_valid: true,
253    }
254}
255
/// Outcome of a one-way ANOVA across byte groups.
#[derive(Debug, Clone, Serialize)]
pub struct AnovaResult {
    /// F ratio: mean square between groups over mean square within.
    pub f_statistic: f64,
    /// F-distribution survival p-value.
    pub p_value: f64,
    /// Between-groups degrees of freedom (k − 1).
    pub df_between: usize,
    /// Within-groups degrees of freedom (N − k).
    pub df_within: usize,
    /// Mean of each non-empty input group, in input order.
    pub group_means: Vec<f64>,
    /// Mean over all observations pooled together.
    pub grand_mean: f64,
    /// True when p_value < 0.05, i.e. group means differ.
    pub is_significant: bool,
    /// False when fewer than two non-empty groups, or too few total
    /// observations, were supplied.
    pub is_valid: bool,
}
267
268pub fn anova(groups: &[&[u8]]) -> AnovaResult {
269    let valid_groups: Vec<&[u8]> = groups.iter().copied().filter(|g| !g.is_empty()).collect();
270    if valid_groups.len() < 2 {
271        return AnovaResult {
272            f_statistic: 0.0,
273            p_value: 0.0,
274            df_between: 0,
275            df_within: 0,
276            group_means: Vec::new(),
277            grand_mean: 0.0,
278            is_significant: false,
279            is_valid: false,
280        };
281    }
282
283    let k = valid_groups.len();
284    let total_n: usize = valid_groups.iter().map(|g| g.len()).sum();
285    if total_n <= k {
286        return AnovaResult {
287            f_statistic: 0.0,
288            p_value: 0.0,
289            df_between: 0,
290            df_within: 0,
291            group_means: Vec::new(),
292            grand_mean: 0.0,
293            is_significant: false,
294            is_valid: false,
295        };
296    }
297
298    let group_means: Vec<f64> = valid_groups
299        .iter()
300        .map(|g| g.iter().map(|&x| x as f64).sum::<f64>() / g.len() as f64)
301        .collect();
302
303    let grand_mean = valid_groups
304        .iter()
305        .flat_map(|g| g.iter())
306        .map(|&x| x as f64)
307        .sum::<f64>()
308        / total_n as f64;
309
310    let ss_between = valid_groups
311        .iter()
312        .zip(group_means.iter())
313        .map(|(g, &mean)| g.len() as f64 * (mean - grand_mean).powi(2))
314        .sum::<f64>();
315
316    let ss_within = valid_groups
317        .iter()
318        .zip(group_means.iter())
319        .map(|(g, &mean)| {
320            g.iter()
321                .map(|&x| {
322                    let d = x as f64 - mean;
323                    d * d
324                })
325                .sum::<f64>()
326        })
327        .sum::<f64>();
328
329    let df_between = k - 1;
330    let df_within = total_n - k;
331    let ms_between = ss_between / df_between as f64;
332    let ms_within = ss_within / df_within as f64;
333
334    let (f_statistic, p_value) = if ms_within <= 1e-12 {
335        if ms_between <= 1e-12 {
336            (0.0, 1.0)
337        } else {
338            (f64::INFINITY, 0.0)
339        }
340    } else {
341        let f = ms_between / ms_within;
342        let p = math::f_distribution_survival(f, df_between, df_within);
343        (f, p)
344    };
345
346    AnovaResult {
347        f_statistic,
348        p_value,
349        df_between,
350        df_within,
351        group_means,
352        grand_mean,
353        is_significant: p_value < 0.05,
354        is_valid: true,
355    }
356}
357
/// Outcome of the Kruskal–Wallis rank test.
#[derive(Debug, Clone, Serialize)]
pub struct KruskalWallisResult {
    /// Tie-corrected H statistic.
    pub h_statistic: f64,
    /// Chi-squared survival p-value with `df` degrees of freedom.
    pub p_value: f64,
    /// Degrees of freedom: number of non-empty groups minus one.
    pub df: usize,
    /// True when p_value < 0.05, i.e. group distributions differ.
    pub is_significant: bool,
    /// False when fewer than two non-empty groups were supplied.
    pub is_valid: bool,
}
366
367pub fn kruskal_wallis(groups: &[&[u8]]) -> KruskalWallisResult {
368    let valid_groups: Vec<&[u8]> = groups.iter().copied().filter(|g| !g.is_empty()).collect();
369    if valid_groups.len() < 2 {
370        return KruskalWallisResult {
371            h_statistic: 0.0,
372            p_value: 0.0,
373            df: 0,
374            is_significant: false,
375            is_valid: false,
376        };
377    }
378
379    let k = valid_groups.len();
380    let n_total: usize = valid_groups.iter().map(|g| g.len()).sum();
381    if n_total < 2 {
382        return KruskalWallisResult {
383            h_statistic: 0.0,
384            p_value: 0.0,
385            df: 0,
386            is_significant: false,
387            is_valid: false,
388        };
389    }
390
391    let mut values = Vec::with_capacity(n_total);
392    for (group_idx, group) in valid_groups.iter().enumerate() {
393        for &x in group.iter() {
394            values.push((x, group_idx));
395        }
396    }
397    values.sort_by_key(|&(x, _)| x);
398
399    let mut rank_sums = vec![0.0; k];
400    let mut tie_term_sum = 0.0;
401    let mut i = 0usize;
402    while i < values.len() {
403        let tie_value = values[i].0;
404        let start = i;
405        while i < values.len() && values[i].0 == tie_value {
406            i += 1;
407        }
408        let end = i;
409        let tie_count = end - start;
410        let rank_start = start as f64 + 1.0;
411        let rank_end = end as f64;
412        let avg_rank = (rank_start + rank_end) / 2.0;
413
414        for &(_, group_idx) in &values[start..end] {
415            rank_sums[group_idx] += avg_rank;
416        }
417
418        if tie_count > 1 {
419            let t = tie_count as f64;
420            tie_term_sum += t.powi(3) - t;
421        }
422    }
423
424    let n = n_total as f64;
425    let mut h = 0.0;
426    for (group, &rank_sum) in valid_groups.iter().zip(rank_sums.iter()) {
427        h += (rank_sum * rank_sum) / group.len() as f64;
428    }
429    h = (12.0 / (n * (n + 1.0))) * h - 3.0 * (n + 1.0);
430
431    let tie_den = n.powi(3) - n;
432    if tie_den > 0.0 {
433        let tie_correction = 1.0 - (tie_term_sum / tie_den);
434        if tie_correction > 1e-12 {
435            h /= tie_correction;
436        } else {
437            h = 0.0;
438        }
439    }
440
441    if h < 0.0 {
442        h = 0.0;
443    }
444
445    let df = k - 1;
446    let p_value = math::chi_squared_survival(h, df);
447    KruskalWallisResult {
448        h_statistic: h,
449        p_value,
450        df,
451        is_significant: p_value < 0.05,
452        is_valid: true,
453    }
454}
455
/// Outcome of Levene's test for homogeneity of variances.
#[derive(Debug, Clone, Serialize)]
pub struct LeveneResult {
    /// Levene's W: the ANOVA F statistic computed on absolute deviations
    /// from each group's mean.
    pub w_statistic: f64,
    /// F-distribution survival p-value.
    pub p_value: f64,
    /// Between-groups degrees of freedom.
    pub df_between: usize,
    /// Within-groups degrees of freedom.
    pub df_within: usize,
    /// Sample variance (n − 1 denominator) of each non-empty group.
    pub group_variances: Vec<f64>,
    /// True when p_value > 0.05, i.e. variances look equal.
    pub is_homogeneous: bool,
    /// False when fewer than two usable groups (each needs >= 2 samples).
    pub is_valid: bool,
}
466
467fn anova_on_f64_groups(groups: &[Vec<f64>]) -> Option<(f64, f64, usize, usize)> {
468    if groups.len() < 2 || groups.iter().any(|g| g.is_empty()) {
469        return None;
470    }
471    let k = groups.len();
472    let total_n: usize = groups.iter().map(|g| g.len()).sum();
473    if total_n <= k {
474        return None;
475    }
476
477    let means: Vec<f64> = groups
478        .iter()
479        .map(|g| g.iter().sum::<f64>() / g.len() as f64)
480        .collect();
481    let grand_mean = groups.iter().flatten().sum::<f64>() / total_n as f64;
482
483    let ss_between = groups
484        .iter()
485        .zip(means.iter())
486        .map(|(g, &m)| g.len() as f64 * (m - grand_mean).powi(2))
487        .sum::<f64>();
488    let ss_within = groups
489        .iter()
490        .zip(means.iter())
491        .map(|(g, &m)| g.iter().map(|&x| (x - m).powi(2)).sum::<f64>())
492        .sum::<f64>();
493
494    let df_between = k - 1;
495    let df_within = total_n - k;
496    let ms_between = ss_between / df_between as f64;
497    let ms_within = ss_within / df_within as f64;
498
499    let (f_statistic, p_value) = if ms_within <= 1e-12 {
500        if ms_between <= 1e-12 {
501            (0.0, 1.0)
502        } else {
503            (f64::INFINITY, 0.0)
504        }
505    } else {
506        let f = ms_between / ms_within;
507        let p = math::f_distribution_survival(f, df_between, df_within);
508        (f, p)
509    };
510
511    Some((f_statistic, p_value, df_between, df_within))
512}
513
514pub fn levene_test(groups: &[&[u8]]) -> LeveneResult {
515    let valid_groups: Vec<&[u8]> = groups.iter().copied().filter(|g| !g.is_empty()).collect();
516    if valid_groups.len() < 2 || valid_groups.iter().any(|g| g.len() < 2) {
517        return LeveneResult {
518            w_statistic: 0.0,
519            p_value: 0.0,
520            df_between: 0,
521            df_within: 0,
522            group_variances: Vec::new(),
523            is_homogeneous: false,
524            is_valid: false,
525        };
526    }
527
528    let group_variances: Vec<f64> = valid_groups
529        .iter()
530        .map(|g| {
531            let mean = g.iter().map(|&x| x as f64).sum::<f64>() / g.len() as f64;
532            let ss = g
533                .iter()
534                .map(|&x| {
535                    let d = x as f64 - mean;
536                    d * d
537                })
538                .sum::<f64>();
539            ss / (g.len() as f64 - 1.0)
540        })
541        .collect();
542
543    let z_groups: Vec<Vec<f64>> = valid_groups
544        .iter()
545        .map(|g| {
546            let mean = g.iter().map(|&x| x as f64).sum::<f64>() / g.len() as f64;
547            g.iter().map(|&x| (x as f64 - mean).abs()).collect()
548        })
549        .collect();
550
551    let Some((w_statistic, p_value, df_between, df_within)) = anova_on_f64_groups(&z_groups) else {
552        return LeveneResult {
553            w_statistic: 0.0,
554            p_value: 0.0,
555            df_between: 0,
556            df_within: 0,
557            group_variances,
558            is_homogeneous: false,
559            is_valid: false,
560        };
561    };
562
563    LeveneResult {
564        w_statistic,
565        p_value,
566        df_between,
567        df_within,
568        group_variances,
569        is_homogeneous: p_value > 0.05,
570        is_valid: true,
571    }
572}
573
/// Outcome of a two-sample t-test power analysis.
#[derive(Debug, Clone, Serialize)]
pub struct PowerResult {
    /// Achieved power (probability of detecting the effect), in [0, 1].
    pub power: f64,
    /// Standardized effect size the analysis assumed.
    pub effect_size: f64,
    /// Per-group sample size used for the power computation.
    pub sample_size: usize,
    /// Two-sided significance level.
    pub alpha: f64,
    /// Smallest per-group n reaching 80% power (capped at 10,000).
    pub required_n_for_80: usize,
    /// Smallest per-group n reaching 90% power (capped at 10,000).
    pub required_n_for_90: usize,
    /// False when the inputs were out of range.
    pub is_valid: bool,
}
584
585fn inverse_t_cdf(p: f64, df: usize) -> f64 {
586    if p <= 0.0 {
587        return f64::NEG_INFINITY;
588    }
589    if p >= 1.0 {
590        return f64::INFINITY;
591    }
592
593    let mut lo = -20.0;
594    let mut hi = 20.0;
595    for _ in 0..100 {
596        let mid = (lo + hi) / 2.0;
597        let cdf = math::t_distribution_cdf(mid, df);
598        if cdf < p {
599            lo = mid;
600        } else {
601            hi = mid;
602        }
603    }
604    (lo + hi) / 2.0
605}
606
607fn power_for_two_sample_t(effect_size: f64, sample_size: usize, alpha: f64) -> f64 {
608    let df = 2 * (sample_size - 1);
609    let ncp = effect_size * (sample_size as f64 / 2.0).sqrt();
610    let t_critical = inverse_t_cdf(1.0 - alpha / 2.0, df);
611    let left = math::t_distribution_cdf(t_critical - ncp, df);
612    let right = math::t_distribution_cdf(-t_critical - ncp, df);
613    (1.0 - left + right).clamp(0.0, 1.0)
614}
615
616fn required_n_for_power(effect_size: f64, alpha: f64, target: f64) -> usize {
617    for n in 2..=10_000 {
618        if power_for_two_sample_t(effect_size, n, alpha) >= target {
619            return n;
620        }
621    }
622    10_000
623}
624
625pub fn power_analysis(effect_size: f64, sample_size: usize, alpha: f64) -> PowerResult {
626    if effect_size <= 0.0 || sample_size < 2 || alpha <= 0.0 || alpha >= 1.0 {
627        return PowerResult {
628            power: 0.0,
629            effect_size,
630            sample_size,
631            alpha,
632            required_n_for_80: 0,
633            required_n_for_90: 0,
634            is_valid: false,
635        };
636    }
637
638    let power = power_for_two_sample_t(effect_size, sample_size, alpha);
639    let required_n_for_80 = required_n_for_power(effect_size, alpha, 0.80);
640    let required_n_for_90 = required_n_for_power(effect_size, alpha, 0.90);
641
642    PowerResult {
643        power,
644        effect_size,
645        sample_size,
646        alpha,
647        required_n_for_80,
648        required_n_for_90,
649        is_valid: true,
650    }
651}
652
653pub fn power_analysis_default(sample_size: usize) -> PowerResult {
654    power_analysis(0.2, sample_size, 0.05)
655}
656
/// Outcome of a multiple-comparison p-value correction.
#[derive(Debug, Clone, Serialize)]
pub struct MultipleCorrectionResult {
    /// Corrected p-values, in the same order as the input.
    pub adjusted_p_values: Vec<f64>,
    /// Per-test rejection decision at `alpha`, in input order.
    pub rejected: Vec<bool>,
    /// Correction method name ("bonferroni" or "holm-bonferroni").
    pub method: String,
    /// Family-wise significance level the decisions used.
    pub alpha: f64,
    /// Number of p-values supplied.
    pub n_tests: usize,
    /// How many hypotheses were rejected after correction.
    pub n_rejected: usize,
    /// False when the inputs were empty or out of range.
    pub is_valid: bool,
}
667
668pub fn bonferroni_correction(p_values: &[f64], alpha: f64) -> MultipleCorrectionResult {
669    if p_values.is_empty()
670        || alpha <= 0.0
671        || alpha >= 1.0
672        || p_values
673            .iter()
674            .any(|&p| !p.is_finite() || !(0.0..=1.0).contains(&p))
675    {
676        return MultipleCorrectionResult {
677            adjusted_p_values: Vec::new(),
678            rejected: Vec::new(),
679            method: "bonferroni".to_string(),
680            alpha,
681            n_tests: p_values.len(),
682            n_rejected: 0,
683            is_valid: false,
684        };
685    }
686
687    let m = p_values.len() as f64;
688    let adjusted_p_values: Vec<f64> = p_values.iter().map(|&p| (p * m).min(1.0)).collect();
689    let rejected: Vec<bool> = adjusted_p_values.iter().map(|&p| p < alpha).collect();
690    let n_rejected = rejected.iter().filter(|&&r| r).count();
691
692    MultipleCorrectionResult {
693        adjusted_p_values,
694        rejected,
695        method: "bonferroni".to_string(),
696        alpha,
697        n_tests: p_values.len(),
698        n_rejected,
699        is_valid: true,
700    }
701}
702
/// Holm–Bonferroni step-down correction for multiple comparisons.
///
/// Sorts the p-values ascending, scales the i-th smallest by (m − i),
/// enforces monotonicity of the adjusted values, and rejects step-down
/// style: rejection stops permanently at the first non-rejection. Results
/// are reported in the original input order. Inputs are invalid when
/// empty, when alpha lies outside (0, 1), or when any p-value is
/// non-finite or outside [0, 1].
pub fn holm_bonferroni_correction(p_values: &[f64], alpha: f64) -> MultipleCorrectionResult {
    if p_values.is_empty()
        || alpha <= 0.0
        || alpha >= 1.0
        || p_values
            .iter()
            .any(|&p| !p.is_finite() || !(0.0..=1.0).contains(&p))
    {
        return MultipleCorrectionResult {
            adjusted_p_values: Vec::new(),
            rejected: Vec::new(),
            method: "holm-bonferroni".to_string(),
            alpha,
            n_tests: p_values.len(),
            n_rejected: 0,
            is_valid: false,
        };
    }

    let m = p_values.len();
    // Pair each p-value with its original index so results can be
    // unscrambled after sorting.
    let mut indexed: Vec<(usize, f64)> = p_values.iter().copied().enumerate().collect();
    indexed.sort_by(|a, b| a.1.total_cmp(&b.1));

    let mut adjusted_sorted = vec![0.0; m];
    let mut rejected_sorted = vec![false; m];
    let mut all_previous_rejected = true;

    for i in 0..m {
        // Holm factor: the i-th smallest p-value is scaled by (m − i).
        let factor = (m - i) as f64;
        let raw_adjusted = (indexed[i].1 * factor).min(1.0);
        if i == 0 {
            adjusted_sorted[i] = raw_adjusted;
        } else {
            // Running max keeps adjusted p-values monotone non-decreasing.
            adjusted_sorted[i] = adjusted_sorted[i - 1].max(raw_adjusted);
        }

        // Step-down rule: once one hypothesis survives, every later
        // (larger) p-value is automatically not rejected.
        let rejected = adjusted_sorted[i] < alpha && all_previous_rejected;
        rejected_sorted[i] = rejected;
        all_previous_rejected &= rejected;
    }

    // Map sorted-order results back to the caller's ordering.
    let mut adjusted_p_values = vec![0.0; m];
    let mut rejected = vec![false; m];
    for i in 0..m {
        let original_idx = indexed[i].0;
        adjusted_p_values[original_idx] = adjusted_sorted[i];
        rejected[original_idx] = rejected_sorted[i];
    }

    let n_rejected = rejected.iter().filter(|&&r| r).count();
    MultipleCorrectionResult {
        adjusted_p_values,
        rejected,
        method: "holm-bonferroni".to_string(),
        alpha,
        n_tests: m,
        n_rejected,
        is_valid: true,
    }
}
763
/// Bundle of the default randomness checks run by `statistics_analysis`.
#[derive(Debug, Clone, Serialize)]
pub struct StatisticsAnalysis {
    /// Uniformity of byte values.
    pub cramer_von_mises: CramerVonMisesResult,
    /// Serial correlation with the default lag count.
    pub ljung_box: LjungBoxResult,
    /// Gap-length distribution for the default marker range.
    pub gap_test: GapTestResult,
}
770
771pub fn statistics_analysis(data: &[u8]) -> StatisticsAnalysis {
772    StatisticsAnalysis {
773        cramer_von_mises: cramer_von_mises(data),
774        ljung_box: ljung_box_default(data),
775        gap_test: gap_test_default(data),
776    }
777}
778
/// Unit tests: deterministic pseudo-random fixtures exercise the happy
/// paths, plus explicit degenerate inputs for every invalid-result guard.
#[cfg(test)]
mod tests {
    use super::*;

    // Deterministic 64-bit LCG so every test run sees identical data; the
    // high bits of the state become the output byte.
    fn random_data_seeded(len: usize, seed: u64) -> Vec<u8> {
        let mut state = seed;
        let mut data = Vec::with_capacity(len);
        for _ in 0..len {
            state = state
                .wrapping_mul(6364136223846793005)
                .wrapping_add(1442695040888963407);
            data.push((state >> 33) as u8);
        }
        data
    }

    #[test]
    fn statistics_cvm_random_uniform() {
        let data = random_data_seeded(5000, 0xdeadbeef);
        let result = cramer_von_mises(&data);
        assert!(result.is_uniform);
        assert!(result.p_value > 0.05);
    }

    #[test]
    fn statistics_cvm_constant_non_uniform() {
        let data = vec![42u8; 1000];
        let result = cramer_von_mises(&data);
        assert!(!result.is_uniform);
    }

    #[test]
    fn statistics_cvm_empty_invalid() {
        let result = cramer_von_mises(&[]);
        assert!(!result.is_valid);
    }

    #[test]
    fn statistics_cvm_all_zero_non_uniform() {
        let data = vec![0u8; 1000];
        let result = cramer_von_mises(&data);
        assert!(!result.is_uniform);
    }

    #[test]
    fn statistics_ljung_box_random_no_serial_correlation() {
        let data = random_data_seeded(5000, 0xdeadbeef);
        let result = ljung_box_default(&data);
        assert!(!result.has_serial_correlation);
    }

    #[test]
    fn statistics_ljung_box_alternating_has_serial_correlation() {
        // Perfectly periodic data: lag-1 autocorrelation is maximal.
        let data: Vec<u8> = (0..5000)
            .map(|i| if i % 2 == 0 { 0u8 } else { 255u8 })
            .collect();
        let result = ljung_box_default(&data);
        assert!(result.has_serial_correlation);
    }

    #[test]
    fn statistics_ljung_box_too_short_invalid() {
        let result = ljung_box(&[1u8, 2u8], 20);
        assert!(!result.is_valid);
    }

    #[test]
    fn statistics_ljung_box_empty_invalid() {
        let result = ljung_box_default(&[]);
        assert!(!result.is_valid);
    }

    #[test]
    fn statistics_gap_random_uniform_gaps() {
        let data = random_data_seeded(10000, 0xdeadbeef);
        let result = gap_test_default(&data);
        assert!(result.is_uniform_gaps);
    }

    #[test]
    fn statistics_gap_no_values_in_range_invalid() {
        // All-zero data never enters [200, 255], so no gaps can be formed.
        let data = vec![0u8; 5000];
        let result = gap_test(&data, 200, 255);
        assert!(!result.is_valid);
    }

    #[test]
    fn statistics_gap_empty_invalid() {
        let result = gap_test_default(&[]);
        assert!(!result.is_valid);
    }

    #[test]
    fn statistics_gap_mean_close_to_expected_for_random() {
        let data = random_data_seeded(5000, 0xdeadbeef);
        let result = gap_test_default(&data);
        assert!(result.is_valid);
        assert!((result.mean_gap - result.expected_gap).abs() < 1.5);
    }

    // NOTE(review): currently unused by the tests below — either add a
    // location-shift test that uses it or remove it.
    fn shifted_data_seeded(len: usize, seed: u64, shift: u8) -> Vec<u8> {
        random_data_seeded(len, seed)
            .into_iter()
            .map(|v| v.wrapping_add(shift))
            .collect()
    }

    #[test]
    fn statistics_anova_random_groups_not_significant() {
        let g1 = random_data_seeded(2000, 0x1001);
        let g2 = random_data_seeded(2000, 0x1002);
        let g3 = random_data_seeded(2000, 0x1003);
        let groups: [&[u8]; 3] = [&g1, &g2, &g3];
        let result = anova(&groups);
        assert!(result.is_valid);
        assert!(!result.is_significant);
    }

    #[test]
    fn statistics_anova_biased_group_significant() {
        let g1 = random_data_seeded(2000, 0x2001);
        let g2 = random_data_seeded(2000, 0x2002);
        let g3 = vec![200u8; 2000];
        let groups: [&[u8]; 3] = [&g1, &g2, &g3];
        let result = anova(&groups);
        assert!(result.is_valid);
        assert!(result.is_significant);
    }

    #[test]
    fn statistics_anova_too_few_groups_invalid() {
        let g1 = random_data_seeded(2000, 0x3001);
        let groups: [&[u8]; 1] = [&g1];
        let result = anova(&groups);
        assert!(!result.is_valid);
    }

    #[test]
    fn statistics_anova_identical_groups_zero_f() {
        let g1 = vec![128u8; 2000];
        let g2 = vec![128u8; 2000];
        let g3 = vec![128u8; 2000];
        let groups: [&[u8]; 3] = [&g1, &g2, &g3];
        let result = anova(&groups);
        assert!(result.is_valid);
        assert!(result.f_statistic.abs() < 1e-12);
        assert!((result.p_value - 1.0).abs() < 1e-12);
    }

    #[test]
    fn statistics_kruskal_wallis_same_distribution_not_significant() {
        let g1 = random_data_seeded(2000, 0x4001);
        let g2 = random_data_seeded(2000, 0x4002);
        let g3 = random_data_seeded(2000, 0x4003);
        let groups: [&[u8]; 3] = [&g1, &g2, &g3];
        let result = kruskal_wallis(&groups);
        assert!(result.is_valid);
        assert!(!result.is_significant);
    }

    #[test]
    fn statistics_kruskal_wallis_biased_group_significant() {
        let g1 = random_data_seeded(2000, 0x5001);
        let g2 = random_data_seeded(2000, 0x5002);
        let g3 = vec![200u8; 2000];
        let groups: [&[u8]; 3] = [&g1, &g2, &g3];
        let result = kruskal_wallis(&groups);
        assert!(result.is_valid);
        assert!(result.is_significant);
    }

    #[test]
    fn statistics_kruskal_wallis_too_few_groups_invalid() {
        let g1 = random_data_seeded(2000, 0x6001);
        let groups: [&[u8]; 1] = [&g1];
        let result = kruskal_wallis(&groups);
        assert!(!result.is_valid);
    }

    #[test]
    fn statistics_kruskal_wallis_identical_groups_not_significant() {
        // All values tied: the tie correction collapses H to zero.
        let g1 = vec![77u8; 1000];
        let g2 = vec![77u8; 1000];
        let g3 = vec![77u8; 1000];
        let groups: [&[u8]; 3] = [&g1, &g2, &g3];
        let result = kruskal_wallis(&groups);
        assert!(result.is_valid);
        assert!(!result.is_significant);
    }

    #[test]
    fn statistics_levene_same_variance_homogeneous() {
        let g1 = random_data_seeded(2000, 0x7001);
        let g2 = random_data_seeded(2000, 0x7002);
        let g3 = random_data_seeded(2000, 0x7003);
        let groups: [&[u8]; 3] = [&g1, &g2, &g3];
        let result = levene_test(&groups);
        assert!(result.is_valid);
        assert!(result.is_homogeneous);
    }

    #[test]
    fn statistics_levene_high_variance_group_not_homogeneous() {
        // Extreme-spread group: every value sits at a distribution edge.
        let g1 = random_data_seeded(2000, 0x8001);
        let g2 = random_data_seeded(2000, 0x8002);
        let g3: Vec<u8> = (0..2000)
            .map(|i| if i % 2 == 0 { 0u8 } else { 255u8 })
            .collect();
        let groups: [&[u8]; 3] = [&g1, &g2, &g3];
        let result = levene_test(&groups);
        assert!(result.is_valid);
        assert!(!result.is_homogeneous);
    }

    #[test]
    fn statistics_levene_too_few_groups_invalid() {
        let g1 = random_data_seeded(2000, 0x9001);
        let groups: [&[u8]; 1] = [&g1];
        let result = levene_test(&groups);
        assert!(!result.is_valid);
    }

    #[test]
    fn statistics_levene_single_element_group_invalid() {
        let g1 = vec![1u8];
        let g2 = vec![2u8, 3u8];
        let groups: [&[u8]; 2] = [&g1, &g2];
        let result = levene_test(&groups);
        assert!(!result.is_valid);
    }

    #[test]
    fn statistics_power_large_n_medium_effect_high_power() {
        let result = power_analysis(0.5, 10_000, 0.05);
        assert!(result.is_valid);
        assert!(result.power > 0.99);
    }

    #[test]
    fn statistics_power_small_n_small_effect_low_power() {
        let result = power_analysis(0.2, 10, 0.05);
        assert!(result.is_valid);
        assert!(result.power < 0.2);
    }

    #[test]
    fn statistics_power_required_n_ordering() {
        let result = power_analysis_default(50);
        assert!(result.is_valid);
        assert!(result.required_n_for_90 >= result.required_n_for_80);
    }

    #[test]
    fn statistics_power_invalid_inputs() {
        assert!(!power_analysis(0.0, 10, 0.05).is_valid);
        assert!(!power_analysis(0.2, 1, 0.05).is_valid);
    }

    #[test]
    fn statistics_bonferroni_expected_rejection_pattern() {
        // With m = 3: 0.01*3 = 0.03 < 0.05; 0.02*3 = 0.06 and 0.03*3 = 0.09 fail.
        let result = bonferroni_correction(&[0.01, 0.02, 0.03], 0.05);
        assert!(result.is_valid);
        assert_eq!(result.rejected, vec![true, false, false]);
    }

    #[test]
    fn statistics_holm_rejects_at_least_bonferroni() {
        // Holm is uniformly more powerful than plain Bonferroni.
        let p = [0.01, 0.02, 0.03];
        let bonf = bonferroni_correction(&p, 0.05);
        let holm = holm_bonferroni_correction(&p, 0.05);
        assert!(holm.is_valid);
        assert!(holm.n_rejected >= bonf.n_rejected);
    }

    #[test]
    fn statistics_multiple_correction_empty_invalid() {
        assert!(!bonferroni_correction(&[], 0.05).is_valid);
        assert!(!holm_bonferroni_correction(&[], 0.05).is_valid);
    }

    // NOTE(review): this test only checks lengths; the sorted copy is never
    // compared against the original — consider asserting monotonicity of
    // the adjusted p-values in sorted-input order instead.
    #[test]
    fn statistics_holm_monotonic_adjusted_p_values_sorted() {
        let result = holm_bonferroni_correction(&[0.03, 0.01, 0.02], 0.05);
        assert!(result.is_valid);
        let mut sorted = result.adjusted_p_values.clone();
        sorted.sort_by(|a, b| a.total_cmp(b));
        assert_eq!(result.adjusted_p_values.len(), 3);
        assert_eq!(sorted.len(), 3);
    }

    #[test]
    fn test_statistics_analysis_serializes() {
        let data = random_data_seeded(5000, 0xdeadbeef);
        let result = statistics_analysis(&data);
        let json = serde_json::to_string(&result).expect("serialization failed");
        assert!(json.contains("cramer_von_mises"));
        assert!(json.contains("ljung_box"));
    }
}