behrens_fisher/
lib.rs

1/*!
2A crate for testing whether the means of two Normal distributions are the same.
3
4This crate implements [Welch's t-test], an approximate solution to the
5[Behrens-Fisher problem].  The results are presented in the form of a
6confidence interval.
7
8[Welch's t-test]: https://en.wikipedia.org/wiki/Welch%27s_t-test
9[Behrens-Fisher problem]: https://en.wikipedia.org/wiki/Behrens%E2%80%93Fisher_problem
10
11## Example
12
13Suppose we have a population distributed as `X` (normal), and another
14distributed as `Y` (also normal, but possibly with different mean/variance to
15`X`).  Let's take a sample from each population to estimate the difference
16between the population means.
17
18```
19use behrens_fisher::*;
20let x_sample: Vec<f64> = vec![1., 2., 3., 4.];
21let y_sample: Vec<f64> = vec![3., 5., 7., 9., 11.];
22
23let x_stats: SampleStats = x_sample.into_iter().collect();
24let y_stats: SampleStats = y_sample.into_iter().collect();
25let ci = difference_of_means(0.95, x_stats, y_stats).unwrap();
26assert_eq!(ci.to_string(), "+4.50 ± 3.89 (p=95%)");
27// Looks like μ[Y] > μ[X]!
28```
29
30*/
31
32mod stats;
33pub mod student_t;
34
35pub use stats::*;
36use std::fmt;
37
38#[derive(Clone, PartialEq, Debug, Copy)]
39pub struct ConfidenceInterval {
40    /// The center of the two-sided confidence interval; the `x` in `x ± y`.
41    pub center: f64,
42    /// The half-width of the two-sided confidence interval; the `y` in
43    /// `x ± y`.
44    pub radius: f64,
45    /// The significance level
46    pub sig_level: f64,
47}
48
49impl fmt::Display for ConfidenceInterval {
50    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
51        let prec = f.precision().unwrap_or(2);
52        write!(
53            f,
54            "{:+.prec$} ± {:.prec$} (p={}%)",
55            self.center,
56            self.radius,
57            self.sig_level * 100.0,
58        )
59    }
60}
61
62/// An estimate of μ_x (the population mean), based on a sample taken from X.
63pub fn mean(sig_level: f64, x: SampleStats) -> Result<ConfidenceInterval, Error> {
64    if sig_level <= 0.0 || sig_level >= 1.0 {
65        return Err(Error::BadSigLevel);
66    }
67    if !x.var.is_finite() {
68        return Err(Error::InfiniteVariance);
69    }
70    if x.var == 0. {
71        return Err(Error::ZeroVariance);
72    }
73
74    // Convert `sig_level`, which is two-sided, into `p`, which is one-sided
75    let alpha = 1. - sig_level;
76    let p = 1. - (alpha / 2.);
77
78    // The degrees of freedom of the mean variance
79    let v = x.count as f64 - 1.0;
80
81    // Compute the critical value at the chosen confidence level
82    assert!(p.is_normal()); // "normal" in the f64 sense, not gaussian!
83    assert!(v.is_normal()); // "normal" in the f64 sense, not gaussian!
84    let t = student_t::inv_cdf(p, v);
85
86    let center = x.mean;
87    let radius = t * x.mean_var().sqrt();
88    Ok(ConfidenceInterval {
89        center,
90        radius,
91        sig_level,
92    })
93}
94
95/// An estimate of `μ_y - μ_x` (the difference in population means),
96/// based on samples taken from X and Y.
97///
98/// Given two normally distributed populations X ~ N(μ_x, σ²_x) and Y ~
99/// N(μ_y, σ²_y), Y-X is distributed as N(μ_y - μ_x, σ²_x + σ²_y).
100///
101/// We have a sample from X and a sample from Y and we want to use these to
102/// estimate μ_y - μ_x.
103///
104/// ## Variance of the difference between the means
105///
106/// We have an estimate of μ_(Y-X) - namely, ̄y - ̄x, and we want to
107/// know the variance of that estimate.  For this we can use the sum of the
108/// variances of ̄x and ̄y, which gives s²_x/n_x + s²_y/n_y.
109///
110/// ## Degrees of freedom
111///
112/// The degrees of freedom for s² is n-1.  To compute the pooled degrees
113/// of freedom of the linear combination s²_x/n_x + s²_y/n_y, we use
114/// the Welch–Satterthwaite equation.
115pub fn difference_of_means(
116    sig_level: f64,
117    x: SampleStats,
118    y: SampleStats,
119) -> Result<ConfidenceInterval, Error> {
120    if sig_level <= 0.0 || sig_level >= 1.0 {
121        return Err(Error::BadSigLevel);
122    }
123    // Prevent division by zero (see "degrees of freedom")
124    if x.count < 2 || y.count < 2 {
125        return Err(Error::NotEnoughData);
126    }
127    if !x.var.is_finite() || !y.var.is_finite() {
128        return Err(Error::InfiniteVariance);
129    }
130    if x.var == 0. || y.var == 0. {
131        return Err(Error::ZeroVariance);
132    }
133
134    // Convert `sig_level`, which is two-sided, into `p`, which is one-sided
135    let alpha = 1. - sig_level;
136    let p = 1. - (alpha / 2.);
137
138    // Estimate the variance of the `y.mean - x.mean`
139    let x_mean_var = x.mean_var();
140    let y_mean_var = y.mean_var();
141    let var_delta = x_mean_var + y_mean_var;
142
143    // Approximate the degrees of freedom of `var_delta`
144    let k_x = x_mean_var * x_mean_var / (x.count - 1) as f64;
145    let k_y = y_mean_var * y_mean_var / (y.count - 1) as f64;
146    let v = var_delta * (var_delta / (k_x + k_y));
147
148    // Compute the critical value at the chosen confidence level
149    assert!(p.is_normal()); // "normal" in the f64 sense, not gaussian!
150    assert!(v.is_normal()); // "normal" in the f64 sense, not gaussian!
151    let t = student_t::inv_cdf(p, v);
152
153    let center = y.mean - x.mean;
154    let radius = t * var_delta.sqrt();
155    Ok(ConfidenceInterval {
156        center,
157        radius,
158        sig_level,
159    })
160}
161
162#[derive(Debug, Clone, Copy)]
163pub enum Error {
164    BadSigLevel,
165    NotEnoughData,
166    InfiniteVariance,
167    ZeroVariance,
168}
169
170impl fmt::Display for Error {
171    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
172        match self {
173            Error::BadSigLevel => {
174                f.write_str("The significance level must be between 0 and 1 (exclusive)")
175            }
176            Error::NotEnoughData => f.write_str("Can't compute CI when sample size is less than 2"),
177            Error::InfiniteVariance => {
178                f.write_str("The variance of one of the samples is infinite")
179            }
180            Error::ZeroVariance => f.write_str("The variance of one of the samples is zero"),
181        }
182    }
183}
184impl std::error::Error for Error {}
185
186#[cfg(test)]
187mod tests {
188    use super::*;
189
190    #[test]
191    fn cis() {
192        let s1 = SampleStats {
193            count: 10,
194            mean: 5.,
195            var: 1.,
196        };
197        let s2 = SampleStats {
198            count: 10,
199            mean: 6.,
200            var: 2.25,
201        };
202
203        let ci = difference_of_means(0.9, s1, s2).unwrap();
204        assert_eq!(ci.center, 1.0);
205        assert_eq!(ci.radius, 0.9965524858858832);
206
207        let ci = difference_of_means(0.95, s1, s2).unwrap();
208        assert_eq!(ci.center, 1.0);
209        assert_eq!(ci.radius, 1.2105369242089192);
210
211        let ci = difference_of_means(0.99, s1, s2).unwrap();
212        assert_eq!(ci.center, 1.0);
213        assert_eq!(ci.radius, 1.6695970385386518);
214    }
215
216    #[test]
217    fn onlinestatbook() {
218        // From http://onlinestatbook.com/2/estimation/difference_means.html
219        let females = SampleStats {
220            count: 17,
221            mean: 5.353,
222            var: 2.743f64,
223        };
224        let males = SampleStats {
225            count: 17,
226            mean: 3.882,
227            var: 2.985f64,
228        };
229        assert_eq!(
230            student_t::inv_cdf(0.975, 31.773948759590525),
231            2.037501835321414
232        );
233        let ci = difference_of_means(0.95, males, females).unwrap();
234        assert_eq!(ci.center, 1.4709999999999996);
235        assert_eq!(ci.radius, 1.1824540265693935);
236        // the orginal example has it as 1.4709999999999996 ± 1.1824540265693928
237        // the last two digits are different - probably just a rounding error
238    }
239
240    #[test]
241    fn zar() {
242        // From Zar (1984) page 132
243        let x = SampleStats {
244            count: 6,
245            mean: 10.,
246            var: (0.7206_f64).powf(2.),
247        };
248        let y = SampleStats {
249            count: 7,
250            mean: 15.,
251            var: (0.7206_f64).powf(2.),
252        };
253        let ci = difference_of_means(0.95, x, y).unwrap();
254        assert_eq!(ci.center, 5.0);
255        assert_eq!(ci.radius, 0.885452937134633);
256    }
257
258    // #[test]
259    // fn nist() {
260    //     // From the worked example at https://www.itl.nist.gov/div898/handbook/eda/section3/eda352.htm
261    //     let x = SampleStats {
262    //         count: 100,
263    //         mean: 10.,
264    //         var: (0.022789).powf(2.),
265    //     };
266    //     let y = SampleStats {
267    //         count: 95,
268    //         mean: 19.261460,
269    //         var: (0.022789).powf(2.),
270    //     };
271    //     assert_eq!(
272    //         ConfidenceInterval::new(0.95, x, y).to_string(),
273    //         "9.26146 ± 0.0032187032419323048"
274    //     );
275    // }
276}