cbtop/statistics/
comparison.rs

1//! Statistical comparison tests and outlier filtering.
2
3use std::cmp::Ordering;
4
5use super::analysis::EffectSize;
6use super::helpers::{normal_cdf, percentile};
7
8// ---------------------------------------------------------------------------
9// Shared helpers extracted to eliminate repeated data-transformation patterns
10// ---------------------------------------------------------------------------
11
12/// Compute sample count (as f64), mean, and Bessel-corrected variance.
13///
14/// Returns `(n, mean, variance)` where `n = samples.len() as f64`.
15/// Callers are responsible for ensuring `samples.len() >= 2` when the
16/// variance value matters (single-element slices yield `variance = 0.0`).
17fn sample_stats(samples: &[f64]) -> (f64, f64, f64) {
18    let n = samples.len() as f64;
19    let mean = samples.iter().sum::<f64>() / n;
20    let divisor = (n - 1.0).max(1.0);
21    let var = samples.iter().map(|x| (x - mean).powi(2)).sum::<f64>() / divisor;
22    (n, mean, var)
23}
24
25/// Two-tailed p-value from an absolute z (or t) statistic using the normal
26/// approximation.  Used by both Welch's t-test and Mann-Whitney U.
27fn two_tailed_p(abs_stat: f64) -> f64 {
28    2.0 * (1.0 - normal_cdf(abs_stat))
29}
30
31/// Filter non-finite values and return a sorted `Vec<f64>`.
32///
33/// Returns `None` when the resulting vector is empty.
34fn sort_finite(samples: &[f64]) -> Option<Vec<f64>> {
35    let mut v: Vec<f64> = samples.iter().copied().filter(|x| x.is_finite()).collect();
36    if v.is_empty() {
37        return None;
38    }
39    v.sort_by(|a, b| a.partial_cmp(b).unwrap_or(Ordering::Equal));
40    Some(v)
41}
42
43/// Result of statistical comparison between two samples
44#[derive(Debug, Clone)]
45pub struct ComparisonResult {
46    /// Welch's t-statistic
47    pub t_statistic: f64,
48    /// Two-tailed p-value
49    pub p_value: f64,
50    /// Effect size
51    pub effect_size: EffectSize,
52    /// Whether difference is statistically significant (p < 0.05)
53    pub statistically_significant: bool,
54    /// Whether difference is practically significant (|d| >= 0.2)
55    pub practically_significant: bool,
56    /// Degrees of freedom (Welch-Satterthwaite)
57    pub degrees_of_freedom: f64,
58}
59
60impl ComparisonResult {
61    /// Perform Welch's t-test between two samples
62    pub fn welch_t_test(sample1: &[f64], sample2: &[f64]) -> Option<Self> {
63        if sample1.len() < 2 || sample2.len() < 2 {
64            return None;
65        }
66
67        let (n1, mean1, var1) = sample_stats(sample1);
68        let (n2, mean2, var2) = sample_stats(sample2);
69
70        let se1 = var1 / n1;
71        let se2 = var2 / n2;
72        let se_diff = (se1 + se2).sqrt();
73
74        if se_diff == 0.0 {
75            return None;
76        }
77
78        let t = (mean1 - mean2) / se_diff;
79
80        // Welch-Satterthwaite degrees of freedom
81        let df = (se1 + se2).powi(2) / (se1.powi(2) / (n1 - 1.0) + se2.powi(2) / (n2 - 1.0));
82
83        let p_value = two_tailed_p(t.abs());
84        let effect_size = EffectSize::cohens_d(sample1, sample2)?;
85
86        Some(Self {
87            t_statistic: t,
88            p_value,
89            effect_size: effect_size.clone(),
90            statistically_significant: p_value < 0.05,
91            practically_significant: effect_size.is_significant(),
92            degrees_of_freedom: df,
93        })
94    }
95
96    /// Both statistically and practically significant
97    pub fn is_meaningful(&self) -> bool {
98        self.statistically_significant && self.practically_significant
99    }
100}
101
102/// Mann-Whitney U test result (nonparametric)
103#[derive(Debug, Clone)]
104pub struct MannWhitneyResult {
105    /// U statistic
106    pub u_statistic: f64,
107    /// Approximate p-value (normal approximation)
108    pub p_value: f64,
109    /// Rank-biserial correlation (effect size)
110    pub effect_size: f64,
111    /// Whether difference is significant (p < 0.05)
112    pub significant: bool,
113}
114
115impl MannWhitneyResult {
116    /// Perform Mann-Whitney U test between two samples
117    pub fn test(sample1: &[f64], sample2: &[f64]) -> Option<Self> {
118        if sample1.is_empty() || sample2.is_empty() {
119            return None;
120        }
121
122        let n1 = sample1.len();
123        let n2 = sample2.len();
124
125        // Combine and rank
126        let mut combined: Vec<(f64, usize)> = sample1
127            .iter()
128            .map(|&x| (x, 0))
129            .chain(sample2.iter().map(|&x| (x, 1)))
130            .collect();
131
132        combined.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap_or(Ordering::Equal));
133
134        // Assign ranks (handling ties with average rank)
135        let mut ranks: Vec<f64> = vec![0.0; combined.len()];
136        let mut i = 0;
137        while i < combined.len() {
138            let mut j = i;
139            while j < combined.len() && combined[j].0 == combined[i].0 {
140                j += 1;
141            }
142            let avg_rank = (i + j + 1) as f64 / 2.0;
143            for k in i..j {
144                ranks[k] = avg_rank;
145            }
146            i = j;
147        }
148
149        // Sum of ranks for sample 1
150        let r1: f64 = combined
151            .iter()
152            .enumerate()
153            .filter(|(_, (_, group))| *group == 0)
154            .map(|(idx, _)| ranks[idx])
155            .sum();
156
157        // U statistics
158        let u1 = r1 - (n1 * (n1 + 1)) as f64 / 2.0;
159        let u2 = (n1 * n2) as f64 - u1;
160        let u = u1.min(u2);
161
162        // Normal approximation for p-value
163        let mean_u = (n1 * n2) as f64 / 2.0;
164        let std_u = ((n1 * n2 * (n1 + n2 + 1)) as f64 / 12.0).sqrt();
165
166        let z = if std_u > 0.0 {
167            (u - mean_u) / std_u
168        } else {
169            0.0
170        };
171        let p_value = two_tailed_p(z.abs());
172
173        // Rank-biserial correlation as effect size
174        let effect_size = 1.0 - (2.0 * u) / (n1 * n2) as f64;
175
176        Some(Self {
177            u_statistic: u,
178            p_value,
179            effect_size,
180            significant: p_value < 0.05,
181        })
182    }
183}
184
185/// IQR-based outlier filter
186#[derive(Debug, Clone)]
187pub struct OutlierFilter {
188    /// Lower fence (Q1 - 1.5*IQR)
189    pub lower_fence: f64,
190    /// Upper fence (Q3 + 1.5*IQR)
191    pub upper_fence: f64,
192    /// Q1 (25th percentile)
193    pub q1: f64,
194    /// Q3 (75th percentile)
195    pub q3: f64,
196    /// IQR (Q3 - Q1)
197    pub iqr: f64,
198    /// Multiplier (default 1.5)
199    pub multiplier: f64,
200}
201
202impl OutlierFilter {
203    /// Create outlier filter from samples
204    pub fn new(samples: &[f64]) -> Option<Self> {
205        Self::with_multiplier(samples, 1.5)
206    }
207
208    /// Create outlier filter with custom multiplier
209    pub fn with_multiplier(samples: &[f64], multiplier: f64) -> Option<Self> {
210        let sorted = sort_finite(samples)?;
211
212        let q1 = percentile(&sorted, 0.25);
213        let q3 = percentile(&sorted, 0.75);
214        let iqr = q3 - q1;
215
216        let lower_fence = q1 - multiplier * iqr;
217        let upper_fence = q3 + multiplier * iqr;
218
219        Some(Self {
220            lower_fence,
221            upper_fence,
222            q1,
223            q3,
224            iqr,
225            multiplier,
226        })
227    }
228
229    /// Check if value is an outlier
230    pub fn is_outlier(&self, value: f64) -> bool {
231        value < self.lower_fence || value > self.upper_fence
232    }
233
234    /// Filter outliers from samples
235    pub fn filter(&self, samples: &[f64]) -> Vec<f64> {
236        samples
237            .iter()
238            .copied()
239            .filter(|&x| !self.is_outlier(x))
240            .collect()
241    }
242
243    /// Count outliers in samples
244    pub fn count_outliers(&self, samples: &[f64]) -> usize {
245        samples.iter().filter(|&&x| self.is_outlier(x)).count()
246    }
247}
cbtop/statistics/comparison.rs

cbtop/statistics/
comparison.rs